Mercurial > repos > devteam > dna_filtering
diff histogram.py @ 0:9c0d844f4e48 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 28 Jul 2014 11:30:12 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/histogram.py Mon Jul 28 11:30:12 2014 -0400 @@ -0,0 +1,101 @@ +#!/usr/bin/env python +#Greg Von Kuster + +import sys +from rpy import * + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +def main(): + + # Handle input params + in_fname = sys.argv[1] + out_fname = sys.argv[2] + try: + column = int( sys.argv[3] ) - 1 + except: + stop_err( "Column not specified, your query does not contain a column of numerical data." ) + title = sys.argv[4] + xlab = sys.argv[5] + breaks = int( sys.argv[6] ) + if breaks == 0: + breaks = "Sturges" + if sys.argv[7] == "true": + density = True + else: density = False + if len( sys.argv ) >= 9 and sys.argv[8] == "true": + frequency = True + else: frequency = False + + matrix = [] + skipped_lines = 0 + first_invalid_line = 0 + invalid_value = '' + i = 0 + for i, line in enumerate( file( in_fname ) ): + valid = True + line = line.rstrip('\r\n') + # Skip comments + if line and not line.startswith( '#' ): + # Extract values and convert to floats + row = [] + try: + fields = line.split( "\t" ) + val = fields[column] + if val.lower() == "na": + row.append( float( "nan" ) ) + except: + valid = False + skipped_lines += 1 + if not first_invalid_line: + first_invalid_line = i+1 + else: + try: + row.append( float( val ) ) + except ValueError: + valid = False + skipped_lines += 1 + if not first_invalid_line: + first_invalid_line = i+1 + invalid_value = fields[column] + else: + valid = False + skipped_lines += 1 + if not first_invalid_line: + first_invalid_line = i+1 + + if valid: + matrix += row + + if skipped_lines < i: + try: + a = r.array( matrix ) + r.pdf( out_fname, 8, 8 ) + histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks ) + if density: + density = r.density( a ) + if frequency: + scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints + density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] ) + r.lines( density ) + r.dev_off() + except Exception, exc: + stop_err( "%s" %str( exc ) ) + else: + if i == 0: + stop_err("Input dataset is empty.") + else: + stop_err( "All values in column %s are non-numeric." %sys.argv[3] ) + + print "Histogram of column %s. " %sys.argv[3] + if skipped_lines > 0: + print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value ) + + r.quit( save="no" ) + +if __name__ == "__main__": + main()