# HG changeset patch
# User sauria
# Date 1493311079 14400
# Node ID f0c8cdd78e28883c094659e9bb51595742b4f659
# Parent 9aeb70cf7a41edd67fd37cfe1512c3ad168ffc84
# Uploaded
#
# diff -r 9aeb70cf7a41 -r f0c8cdd78e28 correlation_matrix.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/correlation_matrix.py Thu Apr 27 12:37:59 2017 -0400
# @@ -0,0 +1,85 @@
#!/usr/bin/env python
"""Compute a pairwise correlation matrix for the rows of a numeric table."""

import argparse

import numpy
import scipy.stats


def main():
    """Parse the command line, correlate the input table and save the matrix."""
    parser = generate_parser()
    args = parser.parse_args()
    data, names = load_data(args)
    corr = find_correlations(data, args)
    save_data(corr, names, args)


def _column_names(header, count, has_row_names):
    """Return the ``count`` column labels taken from a split header line.

    Returns None when there is no header or no data columns. Raises
    ValueError when the header cannot be matched to the data columns.
    """
    if not header or count == 0:
        return None
    # A header may or may not carry a label above the row-name column
    # (e.g. "Sample F1 F2" versus " F1 F2"); accept both layouts.
    if has_row_names and len(header) == count + 1:
        header = header[1:]
    if len(header) != count:
        raise ValueError(
            "Header has %d column names but the table has %d data columns."
            % (len(header), count))
    return header


def load_data(args):
    """Read a whitespace-delimited numeric table from ``args.input``.

    Attributes read from ``args``:
        input: path of the text file to read.
        column: if true, the first line is a header of column names.
        row: if true, the first field of every data line is a row name.
        int: if true, values are parsed with ``int``, otherwise ``float``.
        features: if true, the table is transposed before being returned,
            so that the returned rows are the original columns.

    Returns:
        A ``(data, names)`` tuple. ``data`` is a numpy array with one row
        per item to be correlated. ``names`` is a list with one label per
        row of ``data``, or None when no labels are available. Without
        ``features`` the labels are the row names; with ``features`` they
        are the column names from the header, because after transposition
        the rows of ``data`` are the original columns.

    Raises:
        ValueError: if a value cannot be parsed, or (with ``features``)
            the header does not match the number of data columns.
    """
    dtype = int if args.int else float
    header = None
    row_names = []
    rows = []
    with open(args.input) as infile:
        if args.column:
            header = infile.readline().split()
        for line in infile:
            fields = line.split()
            if not fields:
                # Blank lines (commonly a trailing one) carry no data.
                continue
            if args.row:
                row_names.append(fields[0])
                fields = fields[1:]
            rows.append([dtype(value) for value in fields])
    data = numpy.array(rows)
    if not args.features:
        return data, (row_names or None)
    data = data.T
    return data, _column_names(header, data.shape[0], args.row)


def find_correlations(data, args):
    """Return the symmetric matrix of correlations between rows of ``data``.

    ``args.test`` selects the statistic: 'pearson', 'spearman', or anything
    else for Kendall's tau. The result is a square float32 array with ones
    on the diagonal; only the coefficient (first element of the scipy
    result) is kept, the p-value is discarded.
    """
    tests = {
        'pearson': scipy.stats.pearsonr,
        'spearman': scipy.stats.spearmanr,
    }
    findcorr = tests.get(args.test, scipy.stats.kendalltau)
    size = data.shape[0]
    corr = numpy.ones((size, size), dtype=numpy.float32)
    # The matrix is symmetric, so compute the upper triangle and mirror it.
    for i in range(size - 1):
        for j in range(i + 1, size):
            corr[i, j] = corr[j, i] = findcorr(data[i, :], data[j, :])[0]
    return corr


def save_data(data, names, args):
    """Write the matrix ``data`` to ``args.output`` as tab-separated text.

    Values are formatted with six decimal places. When ``names`` is not
    None, a header line starting with 'sample' is written and every row
    is prefixed with its label.
    """
    with open(args.output, 'w') as output:
        if names is not None:
            output.write("%s\n" % '\t'.join(['sample'] + list(names)))
        for i in range(data.shape[0]):
            fields = [names[i]] if names is not None else []
            fields.extend("%0.6f" % value for value in data[i])
            output.write("%s\n" % '\t'.join(fields))


def generate_parser():
    """Generate an argument parser."""
    description = ("%(prog)s -- Compute the matrix of pairwise correlation "
                   "coefficients between the rows of a table")
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-f', dest="features", action='store_true',
                        help="Rows represent features.")
    parser.add_argument('-i', dest='int', action='store_true',
                        help="Data is of type int.")
    parser.add_argument('-t', dest='test', action='store', default='pearson',
                        choices=['spearman', 'pearson', 'kendall'],
                        help="Type of correlation to perform.")
    parser.add_argument('-r', dest='row', action='store_true',
                        help="Row names present.")
    parser.add_argument('-c', dest='column', action='store_true',
                        help="Column names present.")
    parser.add_argument(dest="input", type=str, action='store',
                        help="Text file containing table to be correlated.")
    parser.add_argument(dest="output", type=str, action='store',
                        help="Output destination.")
    return parser


if __name__ == "__main__":
    main()

# \ No newline at end of file
# diff -r 9aeb70cf7a41 -r f0c8cdd78e28 correlation_matrix.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/correlation_matrix.xml Thu Apr 27 12:37:59 2017 -0400
# @@ -0,0 +1,122 @@
# NOTE(review): what follows appears to be a Galaxy tool wrapper whose XML
# markup was stripped in this copy; only the text content survives.
#
# correlation_matrix.py ${corr} ${feature} ${row} ${column} ${int} ${input} ${out_file1}
#
# numpy
# scipy
#
# .. class:: infomark
#
# **TIP:** If your data is not TAB or SPACE delimited, use *Text Manipulation->Convert*
#
# .. class:: warningmark
#
# This tool will not work with missing data ("nan").
#
# -----
#
# **Syntax**
#
# This tool computes the matrix of correlation coefficients between each pairwise combination of samples across all features.
#
# - **Pearson's Correlation** reflects the degree of linear relationship between two variables. It ranges from +1 to -1. A correlation of +1 means that there is a perfect positive linear relationship between variables.
The formula for Pearson's correlation is: + + .. image:: pearson.png + + where n is the number of items + +- **Kendall's rank correlation** is used to measure the degree of correspondence between two rankings and assessing the significance of this correspondence. The formula for Kendall's rank correlation is: + + .. image:: kendall.png + + where n is the number of items, and P is the sum. + +- **Spearman's rank correlation** assesses how well an arbitrary monotonic function could describe the relationship between two variables, without making any assumptions about the frequency distribution of the variables. The formula for Spearman's rank correlation is + + .. image:: spearman.png + + where D is the difference between the ranks of corresponding values of X and Y, and N is the number of pairs of values. + +----- + +**Example** + +- Input file:: + + Sample Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 + A -0.3019 1.6868 -2.5340 -4.9926 -2.1455 7.6550 + B -0.5274 2.0816 -3.4161 -4.5812 -1.8705 6.9659 + C -3.2619 0.0963 -1.0131 1.9299 -0.2277 -0.7781 + D -1.9462 0.3685 -1.3830 1.1512 -0.1307 -0.3001 + E -2.0360 -0.7261 -1.6089 1.0215 -0.1636 -1.6826 + F -2.7152 0.8035 -1.7544 1.7193 -0.1782 -1.0745 + +- Computing the correlation coefficients between every pair of rows of the above file (using Pearson's Correlation), the output is:: + + sample A B C D E F + A 1.0000 0.9923 -0.3144 -0.1313 -0.5773 -0.2793 + B 0.9923 1.0000 -0.2575 -0.0462 -0.5011 -0.1915 + C -0.3144 -0.2575 1.0000 0.9443 0.8916 0.9470 + D -0.1313 -0.0462 0.9443 1.0000 0.8679 0.9793 + E -0.5773 -0.5011 0.8916 0.8679 1.0000 0.9046 + F -0.2793 -0.1915 0.9470 0.9793 0.9046 1.0000 + + + diff -r 9aeb70cf7a41 -r f0c8cdd78e28 kendall.png Binary file kendall.png has changed diff -r 9aeb70cf7a41 -r f0c8cdd78e28 pearson.png Binary file pearson.png has changed diff -r 9aeb70cf7a41 -r f0c8cdd78e28 spearman.png Binary file spearman.png has changed diff -r 9aeb70cf7a41 -r f0c8cdd78e28 test/corr_test_data.txt 
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/corr_test_data.txt Thu Apr 27 12:37:59 2017 -0400 @@ -0,0 +1,7 @@ + a b c d e f +A -0.301922 1.686861 -2.534047 -4.992625 -2.145566 7.655097 +B -0.527461 2.081606 -3.416167 -4.581296 -1.870538 6.965914 +C -3.261987 0.096352 -1.013146 1.929970 -0.227763 -0.778194 +D -1.946216 0.368520 -1.383010 1.151264 -0.130760 -0.300130 +E -2.036051 -0.726172 -1.608901 1.021513 -0.163652 -1.682650 +F -2.715232 0.803575 -1.754483 1.719354 -0.178256 -1.074505 diff -r 9aeb70cf7a41 -r f0c8cdd78e28 test/kendall.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/kendall.txt Thu Apr 27 12:37:59 2017 -0400 @@ -0,0 +1,7 @@ +sample A B C D E F +A 1.000000 1.000000 -0.200000 -0.200000 -0.466667 -0.200000 +B 1.000000 1.000000 -0.200000 -0.200000 -0.466667 -0.200000 +C -0.200000 -0.200000 1.000000 1.000000 0.733333 1.000000 +D -0.200000 -0.200000 1.000000 1.000000 0.733333 1.000000 +E -0.466667 -0.466667 0.733333 0.733333 1.000000 0.733333 +F -0.200000 -0.200000 1.000000 1.000000 0.733333 1.000000 diff -r 9aeb70cf7a41 -r f0c8cdd78e28 test/pearson.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/pearson.txt Thu Apr 27 12:37:59 2017 -0400 @@ -0,0 +1,7 @@ +sample A B C D E F +A 1.000000 0.992329 -0.314408 -0.131307 -0.577364 -0.279346 +B 0.992329 1.000000 -0.257557 -0.046203 -0.501154 -0.191588 +C -0.314408 -0.257557 1.000000 0.944390 0.891607 0.947065 +D -0.131307 -0.046203 0.944390 1.000000 0.867969 0.979385 +E -0.577364 -0.501154 0.891607 0.867969 1.000000 0.904631 +F -0.279346 -0.191588 0.947065 0.979385 0.904631 1.000000 diff -r 9aeb70cf7a41 -r f0c8cdd78e28 test/spearman.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/spearman.txt Thu Apr 27 12:37:59 2017 -0400 @@ -0,0 +1,7 @@ +sample A B C D E F +A 1.000000 1.000000 -0.257143 -0.257143 -0.600000 -0.257143 +B 1.000000 1.000000 -0.257143 -0.257143 -0.600000 -0.257143 +C -0.257143 -0.257143 1.000000 1.000000 0.885714 1.000000 +D -0.257143 -0.257143 
1.000000 1.000000 0.885714 1.000000 +E -0.600000 -0.600000 0.885714 0.885714 1.000000 0.885714 +F -0.257143 -0.257143 1.000000 1.000000 0.885714 1.000000