#!/usr/bin/env python
#Greg Von Kuster
"""
Calculate correlations between numeric columns in a tab delim file.
usage: %prog infile output.txt columns method

``columns`` is a comma-separated list of 1-based column numbers and
``method`` is one of "pearson", "kendall" or "spearman".  Blank lines,
lines starting with '#', lines that are too short and lines holding a
non-numeric value in a selected column are skipped; the literal "NA"
(any case) is read as NaN so R can drop it pairwise.
"""

import sys

VALID_METHODS = ("pearson", "kendall", "spearman")


def stop_err(msg):
    """Write *msg* to stderr and terminate the process with exit status 1."""
    sys.stderr.write(msg)
    # A non-zero status lets callers detect the failure from the exit code
    # as well as from the stderr text.
    sys.exit(1)


def _parse_columns(spec):
    """Return the 1-based column numbers listed in the comma-separated *spec*.

    Raises ValueError when an entry is not an integer or is smaller than 1
    (0 or a negative number would silently index from the end of the row).
    """
    columns = [int(token) for token in spec.split(",")]
    if min(columns) < 1:
        raise ValueError("column numbers are 1-based")
    return columns


def _parse_row(line, columns):
    """Convert the selected *columns* of a tab-delimited *line* to floats.

    Returns ``(row, bad_value, bad_column)``.  ``row`` is None when the line
    cannot be used; ``bad_value`` / ``bad_column`` then name the offending
    field (1-based), or are ``""`` / ``0`` when the line was simply too short.
    """
    fields = line.split("\t")
    row = []
    for column in columns:
        index = column - 1
        if index >= len(fields):
            return None, "", 0
        val = fields[index]
        if val.lower() == "na":
            row.append(float("nan"))
            continue
        try:
            row.append(float(val))
        except ValueError:
            return None, val, column
    return row, "", 0


def _read_matrix(path, columns):
    """Read *path* and collect one list of floats per usable line.

    Returns ``(matrix, skipped_lines, first_invalid_line, invalid_value,
    invalid_column)``.  Every unusable line is counted exactly once, and the
    invalid value/column describe the first skipped line only.
    """
    matrix = []
    skipped_lines = 0
    first_invalid_line = 0
    invalid_value = ""
    invalid_column = 0

    with open(path) as handle:
        for line_number, line in enumerate(handle, 1):
            line = line.rstrip("\n\r")
            row, bad_value, bad_column = None, "", 0
            if line and not line.startswith("#"):
                row, bad_value, bad_column = _parse_row(line, columns)

            if row is not None:
                matrix.append(row)
                continue

            skipped_lines += 1
            if not first_invalid_line:
                first_invalid_line = line_number
                invalid_value = bad_value
                invalid_column = bad_column

    return matrix, skipped_lines, first_invalid_line, invalid_value, invalid_column


def _correlate(matrix, method):
    """Return R's ``cor`` result for *matrix* using the given *method*."""
    # Imported here rather than at module level so that argument and input
    # problems are reported without having to start the embedded R session.
    from rpy import array, r
    return r.cor(array(matrix), use="pairwise.complete.obs", method=method)


def main():
    """Command-line entry point; see the module docstring for the arguments.

    Writes the correlation matrix (tab-separated, one row per line) to the
    output file when at least one usable row was read, and prints a summary
    of skipped lines to stdout.  Any failure is reported through stop_err.
    """
    if len(sys.argv) < 5:
        stop_err("usage: %s infile output.txt columns method\n" % sys.argv[0])

    method = sys.argv[4]
    if method not in VALID_METHODS:
        stop_err("Unknown correlation method '%s'; expected one of: %s"
                 % (method, ", ".join(VALID_METHODS)))

    try:
        columns = _parse_columns(sys.argv[3])
    except ValueError:
        stop_err("Problem determining columns, perhaps your query does not contain a column of numerical data.")

    try:
        (matrix, skipped_lines, first_invalid_line,
         invalid_value, invalid_column) = _read_matrix(sys.argv[1], columns)
    except (IOError, OSError):
        stop_err("Unable to read input file")

    # Run only when there is data; comparing the skip count with a line index
    # broke on empty files and on files whose single line was valid.
    if matrix:
        try:
            out = open(sys.argv[2], "w")
        except (IOError, OSError):
            stop_err("Unable to open output file")

        try:
            # Run correlation
            try:
                value = _correlate(matrix, method)
            except Exception as exc:
                stop_err("%s" % str(exc))
            for row in value:
                out.write("\t".join(map(str, row)) + "\n")
        finally:
            # Also runs when stop_err raises SystemExit.
            out.close()

    if skipped_lines > 0:
        msg = "..Skipped %d lines starting with line #%d. " % (skipped_lines, first_invalid_line)
        if invalid_value and invalid_column > 0:
            msg += "Value '%s' in column %d is not numeric." % (invalid_value, invalid_column)
        print(msg)


if __name__ == "__main__":
    main()
image:: spearman.png + + where D is the difference between the ranks of corresponding values of X and Y, and N is the number of pairs of values. + +----- + +**Example** + +- Input file:: + + #Person Height Self Esteem + 1 68 4.1 + 2 71 4.6 + 3 62 3.8 + 4 75 4.4 + 5 58 3.2 + 6 60 3.1 + 7 67 3.8 + 8 68 4.1 + 9 71 4.3 + 10 69 3.7 + 11 68 3.5 + 12 67 3.2 + 13 63 3.7 + 14 62 3.3 + 15 60 3.4 + 16 63 4.0 + 17 65 4.1 + 18 67 3.8 + 19 63 3.4 + 20 61 3.6 + +- Computing the correlation coefficients between columns 2 and 3 of the above file (using Pearson's Correlation), the output is:: + + 1.0 0.730635686279 + 0.730635686279 1.0 + + So the correlation for our twenty cases is .73, which is a fairly strong positive relationship. + + diff -r 000000000000 -r 6c20d2297d67 kendall.png Binary file kendall.png has changed diff -r 000000000000 -r 6c20d2297d67 pearson.png Binary file pearson.png has changed diff -r 000000000000 -r 6c20d2297d67 spearman.png Binary file spearman.png has changed diff -r 000000000000 -r 6c20d2297d67 test-data/cor.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cor.tabular Mon Jul 28 11:30:05 2014 -0400 @@ -0,0 +1,21 @@ +Person Height Self Esteem +1 68 4.1 +2 71 4.6 +3 62 3.8 +4 75 4.4 +5 58 3.2 +6 60 3.1 +7 67 3.8 +8 68 4.1 +9 71 4.3 +1 69 3.7 +1 68 3.5 +1 67 3.2 +1 63 3.7 +1 62 3.3 +1 60 3.4 +1 63 4.0 +1 65 4.1 +1 67 3.8 +1 63 3.4 +2 61 3.6 \ No newline at end of file diff -r 000000000000 -r 6c20d2297d67 test-data/cor_out.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cor_out.txt Mon Jul 28 11:30:05 2014 -0400 @@ -0,0 +1,2 @@ +1.0 0.730635686279 +0.730635686279 1.0 diff -r 000000000000 -r 6c20d2297d67 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Jul 28 11:30:05 2014 -0400 @@ -0,0 +1,6 @@ + + + + + +