Mercurial > repos > nick > minor_variant_boxplot
view hetbox.py @ 0:128db16c9399 draft
Uploaded script
author | nick |
---|---|
date | Tue, 28 May 2013 12:47:51 -0400 |
parents | |
children | 3a1ce69571e5 |
line wrap: on
line source
#!/usr/bin/env python import os,sys,numpy from rpy2.robjects import Formula from rpy2.robjects.packages import importr from rpy2 import robjects def fail(message): sys.stderr.write(message+'\n') sys.exit(1) args = sys.argv[1:] if '-r' in args: make_report = True else: make_report = False if len(args) >= 1: infile = args[0] else: fail('Error: No input filename provided (as argument 1).') if len(args) >= 2: outfile = args[1] else: fail('Error: No output filename provided (as argument 2).') if len(args) >= 3: report = args[2] elif make_report: fail('Error: No output report filename provided (as argument 3).') if not os.path.exists(infile): fail('Error: Input file '+infile+' could not be found.') utils = importr('utils') graphics = importr('graphics') base = importr('base') stats = importr('stats') rprint = robjects.globalenv.get("print") grdevices = importr('grDevices') grdevices.png(file=outfile, width=1024, height=768) # Read file into a data frame DATA = utils.read_delim(infile) # Multiply minor allele frequencies by 100 to get percentage # Doing this in R, since I can't find a way to modify a column in a data frame # in rpy2 robjects.r.assign('data', DATA) robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') DATA = robjects.r('data') #index = data.names.index('MINOR.FREQ.PERC.') ## .ro turns the returned object into one that can be operated on per-element #DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100 # Formula() creates a Python object representing the R object returned by x ~ y formula = Formula('MINOR.FREQ.PERC.~SAMPLE') # The "environment" in .getenvironment() is the entire R workspace in which the # Formula object exists. The R workspace meaning all the defined variables. # Here, the .getenvironment() method is being used to set some variables in the # R workspace # Note: .rx2() looks up a column by its label and returns it as a vector formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE') # create boxplot - fill kwargs1 with the options for the boxplot function kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} p = graphics.boxplot(formula, **kwargs1) tab = base.table(DATA.rx2('SAMPLE')) graphics.text(0.5,1,'N:',font=2) for i in range(1,base.length(tab)[0]+1,1): graphics.text(i,1,tab[i-1], font=2) graphlabels = base.names(tab) kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) grdevices.dev_off() if not make_report: sys.exit(0) #################################### # GENERATE REPORT # report should be something like: # SAMPLE NoHET MEDIAN MAD TEST # s1 7 10% n p/w/f # n <= 5 pass # 6 <= n <=10 warn # n >= 11 fail # MAD <= 2.0 fail # MAD > 2.0 pass ################################### SAMPLES=[] for i in range(len(tab)): SAMPLES.append(base.names(tab)[i]) def boxstats(data,sample): VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] NoHET = len(VALUES) MEDIAN = numpy.median(VALUES) MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) return [NoHET,MEDIAN, MAD] boxreport = open(report, "w+") boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n") for sample in SAMPLES: ENTRY = [sample] + boxstats(infile,sample) if ENTRY[1] <= 5: ENTRY.append('pass') elif 6 <= ENTRY[1] <=10: ENTRY.append('warn') elif ENTRY[1] >= 11: ENTRY.append('fail') if ENTRY[3] <=2.0: ENTRY.append('fail') elif ENTRY[3] >2.0: ENTRY.append('pass') if len(set(ENTRY[4:6])) == 2: ENTRY.append('warn') else: ENTRY.append(list(set(ENTRY[4:6]))[0]) boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]])) boxreport.close() ##################################### #STUFF # #kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"} #boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%") #biglabels2=sort(as.vector(unique(run020$id))) #biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5)) #axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9) #axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9) #nbig=as.vector(table(run020[run020$freq>=0.02,1])) #for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)}