Mercurial > repos > bgruening > upload_testing
view cor.py @ 90:b061185bcb83 draft
Uploaded
author | bernhardlutz |
---|---|
date | Thu, 23 Jan 2014 14:53:46 -0500 |
parents | c4a3a8999945 |
children |
line wrap: on
line source
#!/usr/bin/env python #Greg Von Kuster """ Calculate correlations between numeric columns in a tab delim file. usage: %prog infile output.txt columns method """ import sys #from rpy import * import rpy2.robjects as robjects r = robjects.r def stop_err(msg): sys.stderr.write(msg) sys.exit() def main(): method = sys.argv[4] assert method in ( "pearson", "kendall", "spearman" ) try: columns = map( int, sys.argv[3].split( ',' ) ) except: stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." ) matrix = [] skipped_lines = 0 first_invalid_line = 0 invalid_value = '' invalid_column = 0 for i, line in enumerate( file( sys.argv[1] ) ): valid = True line = line.rstrip('\n\r') if line and not line.startswith( '#' ): # Extract values and convert to floats row = [] for column in columns: column -= 1 fields = line.split( "\t" ) if len( fields ) <= column: valid = False else: val = fields[column] if val.lower() == "na": row.append( float( "nan" ) ) else: try: row.append( float( fields[column] ) ) except: valid = False skipped_lines += 1 if not first_invalid_line: first_invalid_line = i+1 invalid_value = fields[column] invalid_column = column+1 else: valid = False skipped_lines += 1 if not first_invalid_line: first_invalid_line = i+1 if valid: # matrix.append( row ) matrix += row if skipped_lines < i: try: out = open( sys.argv[2], "w" ) except: stop_err( "Unable to open output file" ) # Run correlation # print >> sys.stderr, "matrix: %s" % matrix # print >> sys.stderr, "array: %s" % array( matrix ) try: # value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method ) fv = robjects.FloatVector(matrix) m = r['matrix'](fv, ncol=len(columns),byrow=True) rslt_mat = r.cor(m, use="pairwise.complete.obs", method=method ) value = [] for ri in range(1, rslt_mat.nrow + 1): row = [] for ci in range(1, rslt_mat.ncol + 1): row.append(rslt_mat.rx(ri,ci)[0]) value.append(row) except Exception, exc: out.close() stop_err("%s" %str( exc )) for row in value: print >> out, "\t".join( map( str, row ) ) out.close() if skipped_lines > 0: msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line ) if invalid_value and invalid_column > 0: msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column ) print msg if __name__ == "__main__": main()