comparison cor.py @ 80:c4a3a8999945 draft

Uploaded
author bernhardlutz
date Mon, 20 Jan 2014 14:39:43 -0500
parents
children
comparison
equal deleted inserted replaced
79:dc82017052ac 80:c4a3a8999945
1 #!/usr/bin/env python
2 #Greg Von Kuster
3 """
4 Calculate correlations between numeric columns in a tab delim file.
5 usage: %prog infile output.txt columns method
6 """
7
8 import sys
9 #from rpy import *
10 import rpy2.robjects as robjects
11 r = robjects.r
12
13 def stop_err(msg):
14 sys.stderr.write(msg)
15 sys.exit()
16
17 def main():
18 method = sys.argv[4]
19 assert method in ( "pearson", "kendall", "spearman" )
20
21 try:
22 columns = map( int, sys.argv[3].split( ',' ) )
23 except:
24 stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." )
25
26 matrix = []
27 skipped_lines = 0
28 first_invalid_line = 0
29 invalid_value = ''
30 invalid_column = 0
31
32 for i, line in enumerate( file( sys.argv[1] ) ):
33 valid = True
34 line = line.rstrip('\n\r')
35
36 if line and not line.startswith( '#' ):
37 # Extract values and convert to floats
38 row = []
39 for column in columns:
40 column -= 1
41 fields = line.split( "\t" )
42 if len( fields ) <= column:
43 valid = False
44 else:
45 val = fields[column]
46 if val.lower() == "na":
47 row.append( float( "nan" ) )
48 else:
49 try:
50 row.append( float( fields[column] ) )
51 except:
52 valid = False
53 skipped_lines += 1
54 if not first_invalid_line:
55 first_invalid_line = i+1
56 invalid_value = fields[column]
57 invalid_column = column+1
58 else:
59 valid = False
60 skipped_lines += 1
61 if not first_invalid_line:
62 first_invalid_line = i+1
63
64 if valid:
65 # matrix.append( row )
66 matrix += row
67
68 if skipped_lines < i:
69 try:
70 out = open( sys.argv[2], "w" )
71 except:
72 stop_err( "Unable to open output file" )
73
74 # Run correlation
75 # print >> sys.stderr, "matrix: %s" % matrix
76 # print >> sys.stderr, "array: %s" % array( matrix )
77 try:
78 # value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method )
79 fv = robjects.FloatVector(matrix)
80 m = r['matrix'](fv, ncol=len(columns),byrow=True)
81 rslt_mat = r.cor(m, use="pairwise.complete.obs", method=method )
82 value = []
83 for ri in range(1, rslt_mat.nrow + 1):
84 row = []
85 for ci in range(1, rslt_mat.ncol + 1):
86 row.append(rslt_mat.rx(ri,ci)[0])
87 value.append(row)
88 except Exception, exc:
89 out.close()
90 stop_err("%s" %str( exc ))
91 for row in value:
92 print >> out, "\t".join( map( str, row ) )
93 out.close()
94
95 if skipped_lines > 0:
96 msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line )
97 if invalid_value and invalid_column > 0:
98 msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column )
99 print msg
100
101 if __name__ == "__main__":
102 main()