comparison correlation_with_signature.R @ 0:6f3da989b59d draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_correlation commit 70c7abb20f76651e61325e3aa90809f6874f8b85
author artbio
date Mon, 24 Jun 2019 07:18:35 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6f3da989b59d
1 # Performs multi-correlation analysis between the vectors of gene expressions
2 # in single cell RNAseq libraries and the vectors of signature scores in these
3 # same single cell RNAseq libraries.
4
5 # Example of command
6 # Rscript correlations_with_signature.R --expression_file <expression_data.tsv>
7 # --signatures_file <signature_scores.tsv>
8 # --sep "\t"
9 # --colnames "T"
10 # --gene_corr <gene-gene corr file>
11 # --gene_corr_pval <gene-gene corr pvalues file>
12 # --sig_corr <genes correlation to signature file>
13
14 # load packages that are provided in the conda env
15 options( show.error.messages=F,
16 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
17 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
18 requiredPackages = c('optparse', 'Hmisc')
19 warnings()
20
21 library(optparse)
22 library(Hmisc)
23
24 # Arguments
25 option_list = list(
26 make_option(
27 "--sep",
28 default = '\t',
29 type = 'character',
30 help = "File separator, must be the same for all input files [default : '%default' ]"
31 ),
32 make_option(
33 "--colnames",
34 default = TRUE,
35 type = 'logical',
36 help = "Consider first lines as header (must stand for all input files) [default : '%default' ]"
37 ),
38 make_option(
39 "--expression_file",
40 default = NA,
41 type = 'character',
42 help = "Input file that contains log2(CPM +1) expression values"
43 ),
44 make_option(
45 "--signatures_file",
46 default = NA,
47 type = 'character',
48 help = "Input file that contains cell signature"
49 ),
50 make_option(
51 "--sig_corr",
52 default = "sig_corr.tsv",
53 type = 'character',
54 help = "signature correlations output [default : '%default' ]"
55 ),
56 make_option(
57 "--gene_corr",
58 default = "gene_corr.tsv",
59 type = 'character',
60 help = "genes-genes correlations output [default : '%default' ]"
61 ),
62 make_option(
63 "--gene_corr_pval",
64 default = "gene_corr_pval.tsv",
65 type = 'character',
66 help = "genes-genes correlations pvalues output [default : '%default' ]"
67 )
68 )
69
70 opt = parse_args(OptionParser(option_list = option_list),
71 args = commandArgs(trailingOnly = TRUE))
72
73 if (opt$sep == "tab") {opt$sep = "\t"}
74 if (opt$sep == "comma") {opt$sep = ","}
75
76 # Open files
77 data <- read.table(
78 opt$expression_file,
79 header = opt$colnames,
80 row.names = 1,
81 sep = opt$sep,
82 check.names = F
83 )
84 signature <- read.delim(
85 opt$signatures_file,
86 header = T,
87 stringsAsFactors = F,
88 row.names = 1,
89 sep = opt$sep,
90 check.names = F
91 )
92
93
94 # keep only signatures that are in the expression dataframe
95 signature <- subset(signature, rownames(signature) %in% colnames(data))
96
97 # Add signature score to expression matrix
98 data <- rbind(t(signature), data)
99
100 # Gene correlation
101 gene_corr <- rcorr(t(data), type = "pearson") # transpose because we correlate genes, not cells
102
103 # Gene correlation with signature score
104 gene_signature_corr <- cbind.data.frame(gene = colnames(gene_corr$r),
105 Pearson_correlation = gene_corr$r[, 1],
106 p_value = gene_corr$P[, 1])
107 gene_signature_corr <- gene_signature_corr[ order(gene_signature_corr[,2], decreasing = T), ]
108
109
110 # Save files
111 write.table(
112 gene_signature_corr,
113 file = opt$sig_corr,
114 sep = "\t",
115 quote = F,
116 col.names = T,
117 row.names = F
118 )
119
120 r_genes <- data.frame(gene=rownames(gene_corr$r), gene_corr$r) # add rownames as a variable for output
121 p_genes <- data.frame(gene=rownames(gene_corr$P), gene_corr$P) # add rownames as a variable for output
122 write.table(
123 r_genes[-1,-2],
124 file = opt$gene_corr,
125 sep = "\t",
126 quote = F,
127 col.names = T,
128 row.names = F
129 )
130 write.table(
131 p_genes[-1,-2],
132 file = opt$gene_corr_pval,
133 sep = "\t",
134 quote = F,
135 col.names = T,
136 row.names = F
137 )