# HG changeset patch
# User nick
# Date 1369759671 14400
# Node ID 128db16c9399f83a8966a5c47604f6f92fb6376c

Uploaded script

diff -r 000000000000 -r 128db16c9399 hetbox.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hetbox.py	Tue May 28 12:47:51 2013 -0400
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+import os,sys,numpy
+from rpy2.robjects import Formula
+from rpy2.robjects.packages import importr
+from rpy2 import robjects
+
+def fail(message):
+  sys.stderr.write(message+'\n')
+  sys.exit(1)
+
+args = sys.argv[1:]
+if '-r' in args:
+  make_report = True
+else:
+  make_report = False
+if len(args) >= 1:
+  infile = args[0]
+else:
+  fail('Error: No input filename provided (as argument 1).')
+if len(args) >= 2:
+  outfile = args[1]
+else:
+  fail('Error: No output filename provided (as argument 2).')
+if len(args) >= 3:
+  report = args[2]
+elif make_report:
+  fail('Error: No output report filename provided (as argument 3).')
+
+if not os.path.exists(infile):
+  fail('Error: Input file '+infile+' could not be found.')
+
+
+utils     = importr('utils')
+graphics  = importr('graphics')
+base      = importr('base')
+stats     = importr('stats')
+rprint    = robjects.globalenv.get("print")
+grdevices = importr('grDevices')
+grdevices.png(file=outfile, width=1024, height=768)
+
+# Read file into a data frame
+DATA    = utils.read_delim(infile)
+# Multiply minor allele frequencies by 100 to get percentage
+# Doing this in R, since I can't find a way to modify a column in a data frame
+# in rpy2
+robjects.r.assign('data', DATA)
+robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100')
+DATA = robjects.r('data')
+#index = data.names.index('MINOR.FREQ.PERC.')
+##  .ro turns the returned object into one that can be operated on per-element
+#DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100
+# Formula() creates a Python object representing the R object returned by x ~ y
+formula = Formula('MINOR.FREQ.PERC.~SAMPLE')
+# The "environment" in .getenvironment() is the entire R workspace in which the
+# Formula object exists. The R workspace meaning all the defined variables.
+# Here, the .getenvironment() method is being used to set some variables in the
+# R workspace
+#   Note: .rx2() looks up a column by its label and returns it as a vector
+formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.')
+formula.getenvironment()['SAMPLE']           = DATA.rx2('SAMPLE')
+
+# create boxplot - fill kwargs1 with the options for the boxplot function
+kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"}
+p = graphics.boxplot(formula, **kwargs1)
+
+tab    = base.table(DATA.rx2('SAMPLE'))
+graphics.text(0.5,1,'N:',font=2)
+for i in range(1,base.length(tab)[0]+1,1):
+    graphics.text(i,1,tab[i-1], font=2)
+
+graphlabels = base.names(tab)
+kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"}
+graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3)
+grdevices.dev_off()
+
+if not make_report:
+  sys.exit(0)
+
+####################################
+# GENERATE REPORT
+# report should be something like:
+# SAMPLE NoHET MEDIAN MAD TEST 
+# s1     7      10%   n   p/w/f
+# n <= 5      pass
+# 6 <= n <=10 warn
+# n >= 11     fail
+# MAD <= 2.0  fail
+# MAD > 2.0    pass
+###################################
+
+SAMPLES=[]
+for i in range(len(tab)):
+    SAMPLES.append(base.names(tab)[i])
+
+def boxstats(data,sample):
+    VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
+    NoHET  = len(VALUES)
+    MEDIAN = numpy.median(VALUES)
+    MAD    = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
+    return [NoHET,MEDIAN, MAD]
+
+boxreport = open(report, "w+")
+boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
+for sample in SAMPLES:
+    ENTRY = [sample] + boxstats(infile,sample)
+    if ENTRY[1] <= 5:
+        ENTRY.append('pass')
+    elif 6 <= ENTRY[1] <=10:
+        ENTRY.append('warn')
+    elif ENTRY[1] >= 11:
+        ENTRY.append('fail')
+    if ENTRY[3] <=2.0:
+        ENTRY.append('fail')
+    elif ENTRY[3] >2.0:
+        ENTRY.append('pass')
+    if len(set(ENTRY[4:6])) == 2:
+        ENTRY.append('warn')
+    else:
+        ENTRY.append(list(set(ENTRY[4:6]))[0])
+    boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]]))
+
+boxreport.close()
+
+
+
+
+
+
+#####################################
+#STUFF
+#
+#kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"}
+#boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%")
+#biglabels2=sort(as.vector(unique(run020$id)))
+#biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5))
+#axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9)
+#axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9)
+#nbig=as.vector(table(run020[run020$freq>=0.02,1]))
+#for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)}
+