# HG changeset patch # User nick # Date 1369759671 14400 # Node ID 128db16c9399f83a8966a5c47604f6f92fb6376c Uploaded script diff -r 000000000000 -r 128db16c9399 hetbox.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hetbox.py Tue May 28 12:47:51 2013 -0400 @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +import os,sys,numpy +from rpy2.robjects import Formula +from rpy2.robjects.packages import importr +from rpy2 import robjects + +def fail(message): + sys.stderr.write(message+'\n') + sys.exit(1) + +args = sys.argv[1:] +if '-r' in args: + make_report = True +else: + make_report = False +if len(args) >= 1: + infile = args[0] +else: + fail('Error: No input filename provided (as argument 1).') +if len(args) >= 2: + outfile = args[1] +else: + fail('Error: No output filename provided (as argument 2).') +if len(args) >= 3: + report = args[2] +elif make_report: + fail('Error: No output report filename provided (as argument 3).') + +if not os.path.exists(infile): + fail('Error: Input file '+infile+' could not be found.') + + +utils = importr('utils') +graphics = importr('graphics') +base = importr('base') +stats = importr('stats') +rprint = robjects.globalenv.get("print") +grdevices = importr('grDevices') +grdevices.png(file=outfile, width=1024, height=768) + +# Read file into a data frame +DATA = utils.read_delim(infile) +# Multiply minor allele frequencies by 100 to get percentage +# Doing this in R, since I can't find a way to modify a column in a data frame +# in rpy2 +robjects.r.assign('data', DATA) +robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') +DATA = robjects.r('data') +#index = data.names.index('MINOR.FREQ.PERC.') +## .ro turns the returned object into one that can be operated on per-element +#DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100 +# Formula() creates a Python object representing the R object returned by x ~ y +formula = Formula('MINOR.FREQ.PERC.~SAMPLE') +# The "environment" in .getenvironment() is the entire R workspace in which the +# Formula object exists. The R workspace meaning all the defined variables. +# Here, the .getenvironment() method is being used to set some variables in the +# R workspace +# Note: .rx2() looks up a column by its label and returns it as a vector +formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') +formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE') + +# create boxplot - fill kwargs1 with the options for the boxplot function +kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} +p = graphics.boxplot(formula, **kwargs1) + +tab = base.table(DATA.rx2('SAMPLE')) +graphics.text(0.5,1,'N:',font=2) +for i in range(1,base.length(tab)[0]+1,1): + graphics.text(i,1,tab[i-1], font=2) + +graphlabels = base.names(tab) +kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} +graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) +grdevices.dev_off() + +if not make_report: + sys.exit(0) + +#################################### +# GENERATE REPORT +# report should be something like: +# SAMPLE NoHET MEDIAN MAD TEST +# s1 7 10% n p/w/f +# n <= 5 pass +# 6 <= n <=10 warn +# n >= 11 fail +# MAD <= 2.0 fail +# MAD > 2.0 pass +################################### + +SAMPLES=[] +for i in range(len(tab)): + SAMPLES.append(base.names(tab)[i]) + +def boxstats(data,sample): + VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] + NoHET = len(VALUES) + MEDIAN = numpy.median(VALUES) + MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) + return [NoHET,MEDIAN, MAD] + +boxreport = open(report, "w+") +boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n") +for sample in SAMPLES: + ENTRY = [sample] + boxstats(infile,sample) + if ENTRY[1] <= 5: + ENTRY.append('pass') + elif 6 <= ENTRY[1] <=10: + ENTRY.append('warn') + elif ENTRY[1] >= 11: + ENTRY.append('fail') + if ENTRY[3] <=2.0: + ENTRY.append('fail') + elif ENTRY[3] >2.0: + ENTRY.append('pass') + if len(set(ENTRY[4:6])) == 2: + ENTRY.append('warn') + else: + ENTRY.append(list(set(ENTRY[4:6]))[0]) + boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]])) + +boxreport.close() + + + + + + +##################################### +#STUFF +# +#kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"} +#boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%") +#biglabels2=sort(as.vector(unique(run020$id))) +#biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5)) +#axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9) +#axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9) +#nbig=as.vector(table(run020[run020$freq>=0.02,1])) +#for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)} +