Mercurial > repos > nick > minor_variant_boxplot
comparison hetbox.py @ 0:128db16c9399 draft
Uploaded script
| author | nick |
|---|---|
| date | Tue, 28 May 2013 12:47:51 -0400 |
| parents | |
| children | 3a1ce69571e5 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:128db16c9399 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import os,sys,numpy | |
| 4 from rpy2.robjects import Formula | |
| 5 from rpy2.robjects.packages import importr | |
| 6 from rpy2 import robjects | |
| 7 | |
| 8 def fail(message): | |
| 9 sys.stderr.write(message+'\n') | |
| 10 sys.exit(1) | |
| 11 | |
| 12 args = sys.argv[1:] | |
| 13 if '-r' in args: | |
| 14 make_report = True | |
| 15 else: | |
| 16 make_report = False | |
| 17 if len(args) >= 1: | |
| 18 infile = args[0] | |
| 19 else: | |
| 20 fail('Error: No input filename provided (as argument 1).') | |
| 21 if len(args) >= 2: | |
| 22 outfile = args[1] | |
| 23 else: | |
| 24 fail('Error: No output filename provided (as argument 2).') | |
| 25 if len(args) >= 3: | |
| 26 report = args[2] | |
| 27 elif make_report: | |
| 28 fail('Error: No output report filename provided (as argument 3).') | |
| 29 | |
| 30 if not os.path.exists(infile): | |
| 31 fail('Error: Input file '+infile+' could not be found.') | |
| 32 | |
| 33 | |
| 34 utils = importr('utils') | |
| 35 graphics = importr('graphics') | |
| 36 base = importr('base') | |
| 37 stats = importr('stats') | |
| 38 rprint = robjects.globalenv.get("print") | |
| 39 grdevices = importr('grDevices') | |
| 40 grdevices.png(file=outfile, width=1024, height=768) | |
| 41 | |
| 42 # Read file into a data frame | |
| 43 DATA = utils.read_delim(infile) | |
| 44 # Multiply minor allele frequencies by 100 to get percentage | |
| 45 # Doing this in R, since I can't find a way to modify a column in a data frame | |
| 46 # in rpy2 | |
| 47 robjects.r.assign('data', DATA) | |
| 48 robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') | |
| 49 DATA = robjects.r('data') | |
| 50 #index = data.names.index('MINOR.FREQ.PERC.') | |
| 51 ## .ro turns the returned object into one that can be operated on per-element | |
| 52 #DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100 | |
| 53 # Formula() creates a Python object representing the R object returned by x ~ y | |
| 54 formula = Formula('MINOR.FREQ.PERC.~SAMPLE') | |
| 55 # The "environment" in .getenvironment() is the entire R workspace in which the | |
| 56 # Formula object exists. The R workspace meaning all the defined variables. | |
| 57 # Here, the .getenvironment() method is being used to set some variables in the | |
| 58 # R workspace | |
| 59 # Note: .rx2() looks up a column by its label and returns it as a vector | |
| 60 formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') | |
| 61 formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE') | |
| 62 | |
| 63 # create boxplot - fill kwargs1 with the options for the boxplot function | |
| 64 kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} | |
| 65 p = graphics.boxplot(formula, **kwargs1) | |
| 66 | |
| 67 tab = base.table(DATA.rx2('SAMPLE')) | |
| 68 graphics.text(0.5,1,'N:',font=2) | |
| 69 for i in range(1,base.length(tab)[0]+1,1): | |
| 70 graphics.text(i,1,tab[i-1], font=2) | |
| 71 | |
| 72 graphlabels = base.names(tab) | |
| 73 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} | |
| 74 graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) | |
| 75 grdevices.dev_off() | |
| 76 | |
| 77 if not make_report: | |
| 78 sys.exit(0) | |
| 79 | |
| 80 #################################### | |
| 81 # GENERATE REPORT | |
| 82 # report should be something like: | |
| 83 # SAMPLE NoHET MEDIAN MAD TEST | |
| 84 # s1 7 10% n p/w/f | |
| 85 # n <= 5 pass | |
| 86 # 6 <= n <=10 warn | |
| 87 # n >= 11 fail | |
| 88 # MAD <= 2.0 fail | |
| 89 # MAD > 2.0 pass | |
| 90 ################################### | |
| 91 | |
| 92 SAMPLES=[] | |
| 93 for i in range(len(tab)): | |
| 94 SAMPLES.append(base.names(tab)[i]) | |
| 95 | |
| 96 def boxstats(data,sample): | |
| 97 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] | |
| 98 NoHET = len(VALUES) | |
| 99 MEDIAN = numpy.median(VALUES) | |
| 100 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) | |
| 101 return [NoHET,MEDIAN, MAD] | |
| 102 | |
| 103 boxreport = open(report, "w+") | |
| 104 boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n") | |
| 105 for sample in SAMPLES: | |
| 106 ENTRY = [sample] + boxstats(infile,sample) | |
| 107 if ENTRY[1] <= 5: | |
| 108 ENTRY.append('pass') | |
| 109 elif 6 <= ENTRY[1] <=10: | |
| 110 ENTRY.append('warn') | |
| 111 elif ENTRY[1] >= 11: | |
| 112 ENTRY.append('fail') | |
| 113 if ENTRY[3] <=2.0: | |
| 114 ENTRY.append('fail') | |
| 115 elif ENTRY[3] >2.0: | |
| 116 ENTRY.append('pass') | |
| 117 if len(set(ENTRY[4:6])) == 2: | |
| 118 ENTRY.append('warn') | |
| 119 else: | |
| 120 ENTRY.append(list(set(ENTRY[4:6]))[0]) | |
| 121 boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]])) | |
| 122 | |
| 123 boxreport.close() | |
| 124 | |
| 125 | |
| 126 | |
| 127 | |
| 128 | |
| 129 | |
| 130 ##################################### | |
| 131 #STUFF | |
| 132 # | |
| 133 #kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"} | |
| 134 #boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%") | |
| 135 #biglabels2=sort(as.vector(unique(run020$id))) | |
| 136 #biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5)) | |
| 137 #axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9) | |
| 138 #axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9) | |
| 139 #nbig=as.vector(table(run020[run020$freq>=0.02,1])) | |
| 140 #for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)} | |
| 141 |
