80
|
1 #!/usr/bin/env python
|
|
2 #Greg Von Kuster
|
|
3 """
|
|
4 Calculate correlations between numeric columns in a tab delim file.
|
|
5 usage: %prog infile output.txt columns method
|
|
6 """
|
|
7
|
|
8 import sys
|
|
9 #from rpy import *
|
|
10 import rpy2.robjects as robjects
|
|
11 r = robjects.r
|
|
12
|
|
13 def stop_err(msg):
|
|
14 sys.stderr.write(msg)
|
|
15 sys.exit()
|
|
16
|
|
17 def main():
|
|
18 method = sys.argv[4]
|
|
19 assert method in ( "pearson", "kendall", "spearman" )
|
|
20
|
|
21 try:
|
|
22 columns = map( int, sys.argv[3].split( ',' ) )
|
|
23 except:
|
|
24 stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." )
|
|
25
|
|
26 matrix = []
|
|
27 skipped_lines = 0
|
|
28 first_invalid_line = 0
|
|
29 invalid_value = ''
|
|
30 invalid_column = 0
|
|
31
|
|
32 for i, line in enumerate( file( sys.argv[1] ) ):
|
|
33 valid = True
|
|
34 line = line.rstrip('\n\r')
|
|
35
|
|
36 if line and not line.startswith( '#' ):
|
|
37 # Extract values and convert to floats
|
|
38 row = []
|
|
39 for column in columns:
|
|
40 column -= 1
|
|
41 fields = line.split( "\t" )
|
|
42 if len( fields ) <= column:
|
|
43 valid = False
|
|
44 else:
|
|
45 val = fields[column]
|
|
46 if val.lower() == "na":
|
|
47 row.append( float( "nan" ) )
|
|
48 else:
|
|
49 try:
|
|
50 row.append( float( fields[column] ) )
|
|
51 except:
|
|
52 valid = False
|
|
53 skipped_lines += 1
|
|
54 if not first_invalid_line:
|
|
55 first_invalid_line = i+1
|
|
56 invalid_value = fields[column]
|
|
57 invalid_column = column+1
|
|
58 else:
|
|
59 valid = False
|
|
60 skipped_lines += 1
|
|
61 if not first_invalid_line:
|
|
62 first_invalid_line = i+1
|
|
63
|
|
64 if valid:
|
|
65 # matrix.append( row )
|
|
66 matrix += row
|
|
67
|
|
68 if skipped_lines < i:
|
|
69 try:
|
|
70 out = open( sys.argv[2], "w" )
|
|
71 except:
|
|
72 stop_err( "Unable to open output file" )
|
|
73
|
|
74 # Run correlation
|
|
75 # print >> sys.stderr, "matrix: %s" % matrix
|
|
76 # print >> sys.stderr, "array: %s" % array( matrix )
|
|
77 try:
|
|
78 # value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method )
|
|
79 fv = robjects.FloatVector(matrix)
|
|
80 m = r['matrix'](fv, ncol=len(columns),byrow=True)
|
|
81 rslt_mat = r.cor(m, use="pairwise.complete.obs", method=method )
|
|
82 value = []
|
|
83 for ri in range(1, rslt_mat.nrow + 1):
|
|
84 row = []
|
|
85 for ci in range(1, rslt_mat.ncol + 1):
|
|
86 row.append(rslt_mat.rx(ri,ci)[0])
|
|
87 value.append(row)
|
|
88 except Exception, exc:
|
|
89 out.close()
|
|
90 stop_err("%s" %str( exc ))
|
|
91 for row in value:
|
|
92 print >> out, "\t".join( map( str, row ) )
|
|
93 out.close()
|
|
94
|
|
95 if skipped_lines > 0:
|
|
96 msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line )
|
|
97 if invalid_value and invalid_column > 0:
|
|
98 msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column )
|
|
99 print msg
|
|
100
|
|
101 if __name__ == "__main__":
|
|
102 main()
|