Mercurial > repos > nick > minor_variant_boxplot
changeset 2:3a1ce69571e5 draft
Allow commented lines, de-kludge data frame handling
author | nick |
---|---|
date | Tue, 28 May 2013 16:55:39 -0400 |
parents | a85dd9968af9 |
children | e9c34ef993d6 |
files | hetbox.py |
diffstat | 1 files changed, 35 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/hetbox.py Tue May 28 12:48:13 2013 -0400 +++ b/hetbox.py Tue May 28 16:55:39 2013 -0400 @@ -1,4 +1,7 @@ #!/usr/bin/env python +# New in this version: +# - handle commented-out header lines +# - did everything through the Rpy2 interface instead of inline R code import os,sys,numpy from rpy2.robjects import Formula @@ -27,8 +30,17 @@ elif make_report: fail('Error: No output report filename provided (as argument 3).') +# Check input file if not os.path.exists(infile): fail('Error: Input file '+infile+' could not be found.') +with open(infile, 'r') as lines: + line = lines.readline() + if not line: + fail('Error: Input file seems to be empty') + line = line.strip().lstrip('#') # rm whitespace, comment chars + labels = line.split("\t") + if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': + fail('Error: Input file does not seem to have a proper header line.') utils = importr('utils') @@ -41,42 +53,46 @@ # Read file into a data frame DATA = utils.read_delim(infile) +# Remove comment from header, if +labels = robjects.r.names(DATA) +if labels[0][0:2] == 'X.': + labels[0] = labels[0][2:] +#robjects.r.assign('data', DATA) +#robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') +#DATA = robjects.r('data') +#index = data.names.index('MINOR.FREQ.PERC.') # Multiply minor allele frequencies by 100 to get percentage -# Doing this in R, since I can't find a way to modify a column in a data frame -# in rpy2 -robjects.r.assign('data', DATA) -robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') -DATA = robjects.r('data') -#index = data.names.index('MINOR.FREQ.PERC.') -## .ro turns the returned object into one that can be operated on per-element -#DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100 +# .rx2() looks up a column by its label and returns it as a vector +# .ro turns the returned object into one that can be operated on per-element +minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 +samples = DATA.rx2('SAMPLE') # Formula() creates a Python object representing the R object returned by x ~ y -formula = Formula('MINOR.FREQ.PERC.~SAMPLE') +formula = Formula('minor_freq ~ samples') # The "environment" in .getenvironment() is the entire R workspace in which the # Formula object exists. The R workspace meaning all the defined variables. # Here, the .getenvironment() method is being used to set some variables in the # R workspace -# Note: .rx2() looks up a column by its label and returns it as a vector -formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') -formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE') +formula.getenvironment()['minor_freq'] = minor_freq +formula.getenvironment()['samples'] = samples # create boxplot - fill kwargs1 with the options for the boxplot function -kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} +kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} p = graphics.boxplot(formula, **kwargs1) -tab = base.table(DATA.rx2('SAMPLE')) -graphics.text(0.5,1,'N:',font=2) -for i in range(1,base.length(tab)[0]+1,1): - graphics.text(i,1,tab[i-1], font=2) +table = base.table(DATA.rx2('SAMPLE')) +graphics.text(0.5, 1, 'N:', font=2) +for i in range(1, base.length(table)[0]+1, 1): + graphics.text(i, 1, table[i-1], font=2) -graphlabels = base.names(tab) +graphlabels = base.names(table) kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} -graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) +graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3) grdevices.dev_off() if not make_report: sys.exit(0) + #################################### # GENERATE REPORT # report should be something like: