Mercurial > repos > bgruening > upload_testing
comparison cor.py @ 80:c4a3a8999945 draft
Uploaded
author | bernhardlutz |
---|---|
date | Mon, 20 Jan 2014 14:39:43 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
79:dc82017052ac | 80:c4a3a8999945 |
---|---|
1 #!/usr/bin/env python | |
2 #Greg Von Kuster | |
3 """ | |
4 Calculate correlations between numeric columns in a tab delim file. | |
5 usage: %prog infile output.txt columns method | |
6 """ | |
7 | |
8 import sys | |
9 #from rpy import * | |
10 import rpy2.robjects as robjects | |
11 r = robjects.r | |
12 | |
13 def stop_err(msg): | |
14 sys.stderr.write(msg) | |
15 sys.exit() | |
16 | |
17 def main(): | |
18 method = sys.argv[4] | |
19 assert method in ( "pearson", "kendall", "spearman" ) | |
20 | |
21 try: | |
22 columns = map( int, sys.argv[3].split( ',' ) ) | |
23 except: | |
24 stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." ) | |
25 | |
26 matrix = [] | |
27 skipped_lines = 0 | |
28 first_invalid_line = 0 | |
29 invalid_value = '' | |
30 invalid_column = 0 | |
31 | |
32 for i, line in enumerate( file( sys.argv[1] ) ): | |
33 valid = True | |
34 line = line.rstrip('\n\r') | |
35 | |
36 if line and not line.startswith( '#' ): | |
37 # Extract values and convert to floats | |
38 row = [] | |
39 for column in columns: | |
40 column -= 1 | |
41 fields = line.split( "\t" ) | |
42 if len( fields ) <= column: | |
43 valid = False | |
44 else: | |
45 val = fields[column] | |
46 if val.lower() == "na": | |
47 row.append( float( "nan" ) ) | |
48 else: | |
49 try: | |
50 row.append( float( fields[column] ) ) | |
51 except: | |
52 valid = False | |
53 skipped_lines += 1 | |
54 if not first_invalid_line: | |
55 first_invalid_line = i+1 | |
56 invalid_value = fields[column] | |
57 invalid_column = column+1 | |
58 else: | |
59 valid = False | |
60 skipped_lines += 1 | |
61 if not first_invalid_line: | |
62 first_invalid_line = i+1 | |
63 | |
64 if valid: | |
65 # matrix.append( row ) | |
66 matrix += row | |
67 | |
68 if skipped_lines < i: | |
69 try: | |
70 out = open( sys.argv[2], "w" ) | |
71 except: | |
72 stop_err( "Unable to open output file" ) | |
73 | |
74 # Run correlation | |
75 # print >> sys.stderr, "matrix: %s" % matrix | |
76 # print >> sys.stderr, "array: %s" % array( matrix ) | |
77 try: | |
78 # value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method ) | |
79 fv = robjects.FloatVector(matrix) | |
80 m = r['matrix'](fv, ncol=len(columns),byrow=True) | |
81 rslt_mat = r.cor(m, use="pairwise.complete.obs", method=method ) | |
82 value = [] | |
83 for ri in range(1, rslt_mat.nrow + 1): | |
84 row = [] | |
85 for ci in range(1, rslt_mat.ncol + 1): | |
86 row.append(rslt_mat.rx(ri,ci)[0]) | |
87 value.append(row) | |
88 except Exception, exc: | |
89 out.close() | |
90 stop_err("%s" %str( exc )) | |
91 for row in value: | |
92 print >> out, "\t".join( map( str, row ) ) | |
93 out.close() | |
94 | |
95 if skipped_lines > 0: | |
96 msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line ) | |
97 if invalid_value and invalid_column > 0: | |
98 msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column ) | |
99 print msg | |
100 | |
101 if __name__ == "__main__": | |
102 main() |