Mercurial > repos > bgruening > music_construct_eset
changeset 1:271f56c5d629 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 683bb72ae92b5759a239b7e3bf4c5a229ed35b54"
author | bgruening |
---|---|
date | Fri, 26 Nov 2021 15:53:38 +0000 |
parents | 67c243187c1e |
children | baa425d177a3 |
files | construct_eset.xml macros.xml scripts/dendrogram.R scripts/estimateprops.R scripts/inspect.R test-data/EMTABesethealthy.subset.rds test-data/Mousebulkeset.rds test-data/Mousesubeset.degenesonly2.half.rds test-data/default_output.pdf test-data/dendro.pdf test-data/dendro_1.pdf |
diffstat | 11 files changed, 257 insertions(+), 104 deletions(-) [+] |
line wrap: on
line diff
--- a/construct_eset.xml Sun Sep 12 19:47:27 2021 +0000 +++ b/construct_eset.xml Fri Nov 26 15:53:38 2021 +0000 @@ -14,9 +14,12 @@ suppressWarnings(suppressPackageStartupMessages(library(xbioc))) suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) -null_str_vec = function(gstr){ +null_str_vec = function(gstr, is.str=FALSE){ tokens = unlist(as.vector(strsplit(gstr, split=","))) if (length(tokens) == 0){ + if (is.str){ + return(character(0)) + } return(NULL) } if (length(tokens) == 1){ @@ -40,6 +43,9 @@ #end if ## Annotation and Feature Data, or just a string for type of chip used annotation = null_str_vec('$annotation') +if (is.null(annotation)){ + annotation = character(0) +} if (all(rownames(pdata) != colnames(exprs))) { stop("Number of Samples between phenotypes and assays are not the same") @@ -66,26 +72,27 @@ metadata\$lname = NULL if (nrow(metadata)==0) { - metadata = NULL + pheno_data = new("AnnotatedDataFrame", data = pdata) +} else { + pheno_data = new("AnnotatedDataFrame", data = pdata, varMetadata = metadata) } -pheno_data = new("AnnotatedDataFrame", data = pdata, varMetadata = metadata) ## Experiment Description -- using the MIAME object experiment_data = new( "MIAME", - name = null_str_vec('$expdata.name'), - lab = null_str_vec('$expdata.lab'), - contact = null_str_vec('$expdata.contact'), - title = null_str_vec('$expdata.title'), - abstract = null_str_vec('$expdata.abstract'), - url = null_str_vec('$expdata.url'), + name = null_str_vec('$expdata.name', is.str=T), + lab = null_str_vec('$expdata.lab', is.str=T), + contact = null_str_vec('$expdata.contact', is.str=T), + title = null_str_vec('$expdata.title', is.str=T), + abstract = null_str_vec('$expdata.abstract', is.str=T), + url = null_str_vec('$expdata.url', is.str=T), other = list( #for i, row in enumerate($expdata.other): #if i==0 - '$row.field' = null_str_vec('$row.comment') + '$row.field' = null_str_vec('$row.comment', is.str=T) #else - ,'$row.field' = null_str_vec('$row.comment') + ,'$row.field' = null_str_vec('$row.comment', is.str=T) #end if #end for )) @@ -95,7 +102,7 @@ experimentData = experiment_data, annotation = annotation) -capture.output(print(e_set), file = '$out_tab') +capture.output(print(e_set), file = '$out_txt') saveRDS(e_set, file= '$out_rds') </configfile> @@ -112,7 +119,8 @@ </param> <repeat name="metadata" title="Meta Data" min="0" max="15" > <!-- optional, so min=0 --> - <param name="row_names" label="Label" type="text" > + <param name="row_names" label="Label" type="text" + help="Metadata should correspond directly to the columns of the Phenotype Data" > <expand macro="validator_text_and_urls" /> </param> <param name="label_desc" label="Label Description" type="text" > @@ -149,7 +157,7 @@ </section> </inputs> <outputs> - <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: General Info" /> + <data name="out_txt" format="txt" label="${tool.name} on ${on_string}: General Info" /> <data name="out_rds" format="rdata.eset" label="${tool.name} on ${on_string}: RData ESet Object" /> </outputs> <tests> @@ -186,7 +194,7 @@ <param name="comment" value="Some other comment" /> </repeat> </section> - <output name="out_tab"> + <output name="out_txt"> <assert_contents> <has_text text="assayData: 3 features, 2 samples " /> </assert_contents>
--- a/macros.xml Sun Sep 12 19:47:27 2021 +0000 +++ b/macros.xml Fri Nov 26 15:53:38 2021 +0000 @@ -1,5 +1,5 @@ <macros> - <token name="@VERSION_SUFFIX@">0</token> + <token name="@VERSION_SUFFIX@">1</token> <!-- The ESet inspector/constructor and MuSiC tool can have independent Galaxy versions but should reference the same package version always. --> @@ -15,11 +15,11 @@ <validator type="regex" message="FORMAT terms separated by commas">^(([A-Za-z0-9+_ -]+)\s?,?)*$</validator> </xml> <xml name="validator_text" > - <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)*$</validator> + <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)+$</validator> </xml> <xml name="celltypes_macro" > <param name="celltypes" type="text" optional="true" value="" - label="Comma list of cell types to use from scRNA dataset" help="If NULL, then use all cell types." > + label="Comma list of cell types to use from scRNA dataset" help="If blank, then use all available cell types." > <expand macro="validator_index_identifiers" /> </param> </xml>
--- a/scripts/dendrogram.R Sun Sep 12 19:47:27 2021 +0000 +++ b/scripts/dendrogram.R Fri Nov 26 15:53:38 2021 +0000 @@ -17,31 +17,6 @@ args <- commandArgs(trailingOnly = TRUE) source(args[1]) -## We then perform bulk tissue cell type estimation with pre-grouping -## of cell types: C, list_of_cell_types, marker genes name, marker -## genes list. -## data.to.use = list( -## "C1" = list(cell.types = c("Neutro"), -## marker.names=NULL, -## marker.list=NULL), -## "C2" = list(cell.types = c("Podo"), -## marker.names=NULL, -## marker.list=NULL), -## "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"), -## marker.names = "Epithelial", -## marker.list = read_list("../test-data/epith.markers")), -## "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"), -## marker.names = "Immune", -## marker.list = read_list("../test-data/immune.markers")) -## ) -grouped_celltypes <- lapply(data.to.use, function(x) { - x$cell.types -}) -marker_groups <- lapply(data.to.use, function(x) { - x$marker.list -}) -names(marker_groups) <- names(data.to.use) - ## Perform the estimation ## Produce the first step information @@ -51,33 +26,107 @@ ## Plot the dendrogram of design matrix and cross-subject mean of ## realtive abundance -par(mfrow = c(1, 2)) -d <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean") +## Hierarchical clustering using Complete Linkage +d1 <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean") +hc1 <- hclust(d1, method = "complete") ## Hierarchical clustering using Complete Linkage -hc1 <- hclust(d, method = "complete") -## Plot the obtained dendrogram -plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)") -d <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean") -## Hierarchical clustering using Complete Linkage -hc2 <- hclust(d, method = "complete") -## Plot the obtained dendrogram -pdf(file = outfile_pdf, width = 8, height = 8) -plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)") +d2 <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean") +hc2 <- hclust(d2, method = "complete") + -cl_type <- as.character(scrna_eset[[celltypes_label]]) +if (length(data.to.use) > 0) { + ## We then perform bulk tissue cell type estimation with pre-grouping + ## of cell types: C, list_of_cell_types, marker genes name, marker + ## genes list. + ## data.to.use = list( + ## "C1" = list(cell.types = c("Neutro"), + ## marker.names=NULL, + ## marker.list=NULL), + ## "C2" = list(cell.types = c("Podo"), + ## marker.names=NULL, + ## marker.list=NULL), + ## "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"), + ## marker.names = "Epithelial", + ## marker.list = read_list("../test-data/epith.markers")), + ## "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"), + ## marker.names = "Immune", + ## marker.list = read_list("../test-data/immune.markers")) + ## ) + grouped_celltypes <- lapply(data.to.use, function(x) { + x$cell.types + }) + marker_groups <- lapply(data.to.use, function(x) { + x$marker.list + }) + names(marker_groups) <- names(data.to.use) + + + cl_type <- as.character(scrna_eset[[celltypes_label]]) + + for (cl in seq_len(length(grouped_celltypes))) { + cl_type[cl_type %in% + grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl] + } + pData(scrna_eset)[[clustertype_label]] <- factor( + cl_type, levels = c(names(grouped_celltypes), + "CD-Trans", "Novel1", "Novel2")) + + est_bulk <- music_prop.cluster( + bulk.eset = bulk_eset, sc.eset = scrna_eset, + group.markers = marker_groups, clusters = celltypes_label, + groups = clustertype_label, samples = samples_label, + clusters.type = grouped_celltypes + ) -for (cl in seq_len(length(grouped_celltypes))) { - cl_type[cl_type %in% grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl] -} -pData(scrna_eset)[[clustertype_label]] <- factor( - cl_type, levels = c(names(grouped_celltypes), - "CD-Trans", "Novel1", "Novel2")) + estimated_music_props <- est_bulk$Est.prop.weighted.cluster + ## NNLS is not calculated here + + ## Show different in estimation methods + ## Jitter plot of estimated cell type proportions + methods_list <- c("MuSiC") + + jitter_fig <- Jitter_Est( + list(data.matrix(estimated_music_props)), + method.name = methods_list, title = "Jitter plot of Est Proportions", + size = 2, alpha = 0.7) + + theme_minimal() + + labs(x = element_blank(), y = element_blank()) + + theme(axis.text = element_text(size = 6), + axis.text.x = element_blank(), + legend.position = "none") + + plot_box <- Boxplot_Est(list( + data.matrix(estimated_music_props)), + method.name = methods_list) + + theme_minimal() + + labs(x = element_blank(), y = element_blank()) + + theme(axis.text = element_text(size = 6), + axis.text.x = element_blank(), + legend.position = "none") -est_bulk <- music_prop.cluster( - bulk.eset = bulk_eset, sc.eset = scrna_eset, - group.markers = marker_groups, clusters = celltypes_label, - groups = clustertype_label, samples = samples_label, - clusters.type = grouped_celltypes) + plot_hmap <- Prop_heat_Est(list( + data.matrix(estimated_music_props)), + method.name = methods_list) + + labs(x = element_blank(), y = element_blank()) + + theme(axis.text.y = element_text(size = 6), + axis.text.x = element_text(angle = -90, size = 5), + plot.title = element_text(size = 9), + legend.key.width = unit(0.15, "cm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 5)) -write.table(est_bulk, file = outfile_tab, quote = F, col.names = NA, sep = "\t") -dev.off() +} + +pdf(file = outfile_pdf, width = 8, height = 8) +par(mfrow = c(1, 2)) +plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)") +plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)") +if (length(data.to.use) > 0) { + plot_grid(jitter_fig, plot_box, plot_hmap, ncol = 2, nrow = 2) +} +message(dev.off()) + +if (length(data.to.use) > 0) { + write.table(estimated_music_props, + file = outfile_tab, quote = F, col.names = NA, sep = "\t") +}
--- a/scripts/estimateprops.R Sun Sep 12 19:47:27 2021 +0000 +++ b/scripts/estimateprops.R Fri Nov 26 15:53:38 2021 +0000 @@ -14,36 +14,67 @@ clusters = celltypes_label, samples = samples_label, select.ct = celltypes, verbose = T) +estimated_music_props <- est_prop$Est.prop.weighted +estimated_nnls_props <- est_prop$Est.prop.allgene ## Show different in estimation methods ## Jitter plot of estimated cell type proportions -jitter.fig <- Jitter_Est( - list(data.matrix(est_prop$Est.prop.weighted), - data.matrix(est_prop$Est.prop.allgene)), - method.name = methods, title = "Jitter plot of Est Proportions") +jitter_fig <- Jitter_Est( + list(data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods, title = "Jitter plot of Est Proportions", + size = 2, alpha = 0.7) + theme_minimal() ## Make a Plot ## A more sophisticated jitter plot is provided as below. We separated -## the T2D subjects and normal subjects by their HbA1c levels. -m_prop <- rbind(melt(est_prop$Est.prop.weighted), - melt(est_prop$Est.prop.allgene)) +## the T2D subjects and normal subjects by their disease factor levels. +estimated_music_props_flat <- melt(estimated_music_props) +estimated_nnls_props_flat <- melt(estimated_nnls_props) + +m_prop <- rbind(estimated_music_props_flat, + estimated_nnls_props_flat) +colnames(m_prop) <- c("Sub", "CellType", "Prop") + +if (is.null(celltypes)) { + celltypes <- levels(m_prop$CellType) + message("No celltypes declared, using:") + message(celltypes) +} -colnames(m_prop) <- c("Sub", "CellType", "Prop") +if (phenotype_target_threshold == -99) { + phenotype_target_threshold <- -Inf + message("phenotype target threshold set to -Inf") +} + +if (is.null(phenotype_factors)) { + phenotype_factors <- colnames(pData(bulk_eset)) +} +## filter out unwanted factors like "sampleID" and "subjectName" +phenotype_factors <- phenotype_factors[ + !(phenotype_factors %in% phenotype_factors_always_exclude)] +message("Phenotype Factors to use:") +message(phenotype_factors) + m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint -m_prop$Method <- factor(rep(methods, each = 89 * 6), levels = methods) # nolint -m_prop$HbA1c <- rep(bulk_eset$hba1c, 2 * 6) # nolint -m_prop <- m_prop[!is.na(m_prop$HbA1c), ] -m_prop$Disease <- factor(sample_groups[(m_prop$HbA1c > 6.5) + 1], # nolint +m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint + levels = methods) +m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint +m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] +## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 +sample_groups <- c("Normal", sample_disease_group) +m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint levels = sample_groups) +## Binary to scale: e.g. TRUE / 5 = 0.2 m_prop$D <- (m_prop$Disease == # nolint sample_disease_group) / sample_disease_group_scale -m_prop <- rbind(subset(m_prop, Disease == healthy_phenotype), - subset(m_prop, Disease != healthy_phenotype)) +## NA's are not included in the comparison below +m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), + subset(m_prop, Disease == sample_disease_group)) -jitter.new <- ggplot(m_prop, aes(Method, Prop)) + +jitter_new <- ggplot(m_prop, aes(Method, Prop)) + geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), size = 2, alpha = 0.7, position = position_jitter(width = 0.25, height = 0)) + @@ -52,20 +83,23 @@ scale_shape_manual(values = c(21, 24)) + theme_minimal() ## Plot to compare method effectiveness -## Create dataframe for beta cell proportions and HbA1c levels -m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:89, 2), phenotype_factors], - ct.prop = c(est_prop$Est.prop.weighted[, 2], - est_prop$Est.prop.allgene[, 2]), - Method = factor(rep(methods, each = 89), +## Create dataframe for beta cell proportions and Disease_factor levels +m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint + phenotype_factors], + ct.prop = c(estimated_music_props[, 2], + estimated_nnls_props[, 2]), + Method = factor(rep(methods, + each = nrow(estimated_music_props)), levels = methods)) -colnames(m_prop_ana)[1:4] <- phenotype_factors -m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_gene])) +colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint +m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) m_prop_ana$Disease <- factor(sample_groups[( # nolint - m_prop_ana[phenotype_gene] > 6.5) + 1], sample_groups) + m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1], + sample_groups) m_prop_ana$D <- (m_prop_ana$Disease == # nolint sample_disease_group) / sample_disease_group_scale -jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_gene, "ct.prop")) + +jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), size = 2, alpha = 0.7) + facet_wrap(~ Method) + @@ -73,21 +107,81 @@ scale_colour_manual(values = c("white", "gray20")) + scale_shape_manual(values = c(21, 24)) +## BoxPlot +plot_box <- Boxplot_Est(list( + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = c("MuSiC", "NNLS")) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 8)) + + ggtitle(element_blank()) + theme_minimal() + +## Heatmap +plot_hmap <- Prop_heat_Est(list( + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = c("MuSiC", "NNLS")) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) pdf(file = outfile_pdf, width = 8, height = 8) -plot_grid(jitter.fig, jitter.new, labels = "auto", ncol = 1, nrow = 2) -jitt_compare -dev.off() +plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) +plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) +plot_hmap +message(dev.off()) + +## Output Proportions + +write.table(est_prop$Est.prop.weighted, + file = paste0("report_data/prop_", + "Music Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Est.prop.allgene, + file = paste0("report_data/prop_", + "NNLS Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Weight.gene, + file = paste0("report_data/weightgene_", + "Music Estimated Proportions of Cell Types (by Gene)", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$r.squared.full, + file = paste0("report_data/rsquared_", + "Music R-sqr Estimated Proportions of Each Subject", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Var.prop, + file = paste0("report_data/varprop_", + "Matrix of Variance of MuSiC Estimates", + ".tabular"), + quote = F, sep = "\t", col.names = NA) + ## Summary table for (meth in methods) { ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = - ##subset(m_prop_ana, Method == meth)) + sub_data <- subset(m_prop_ana, Method == meth) + ## We can only do regression where there are more than 1 factors + ## so we must find and exclude the ones which are not + gt1_facts <- sapply(phenotype_factors, function(facname) { + return(length(unique(sort(sub_data[[facname]]))) == 1) + }) + form_factors <- phenotype_factors + exclude_facts <- names(gt1_facts)[gt1_facts] + if (length(exclude_facts) > 0) { + message("Factors with only one level will be excluded:") + message(exclude_facts) + form_factors <- phenotype_factors[ + !(phenotype_factors %in% exclude_facts)] + } lm_beta_meth <- lm(as.formula( - paste("ct.prop", paste(phenotype_factors, collapse = " + "), - sep = " ~ ")), - data = subset(m_prop_ana, Method == meth)) - print(paste0("Summary: ", meth)) + paste("ct.prop", paste(form_factors, collapse = " + "), + sep = " ~ ")), data = sub_data) + message(paste0("Summary: ", meth)) capture.output(summary(lm_beta_meth), - file = paste0("report_data/summ_", meth, ".txt")) + file = paste0("report_data/summ_Log of ", + meth, + " fitting.txt")) }
--- a/scripts/inspect.R Sun Sep 12 19:47:27 2021 +0000 +++ b/scripts/inspect.R Fri Nov 26 15:53:38 2021 +0000 @@ -6,17 +6,19 @@ source(args[1]) printout <- function(text) { - if (typeof(text) %in% c("list", "vector")) { + if (typeof(text) %in% c("list", "vector", "integer", "double", "numeric")) { write.table(text, file = outfile_tab, quote = F, sep = "\t", col.names = NA) } else { ## text + print(typeof(text)) capture.output(text, file = outfile_tab) # nolint } } -if (inspector %in% c("print", "pData", "fData", "dims", "experimentData", - "exprs", "signature", "annotation", "abstract")) { +if (inspector %in% c("print", "pData", "fData", "dims", + "experimentData", "protocolData", "exprs", + "signature", "annotation", "abstract")) { op <- get(inspector) tab <- op(rds_eset) printout(tab)