view hetbox.py @ 2:3a1ce69571e5 draft

Allow commented lines, de-kludge data frame handling
author nick
date Tue, 28 May 2013 16:55:39 -0400
parents 128db16c9399
children dfa2e75da6aa
line wrap: on
line source

#!/usr/bin/env python
# New in this version:
# - handle commented-out header lines
# - did everything through the Rpy2 interface instead of inline R code

import os,sys,numpy
from rpy2.robjects import Formula
from rpy2.robjects.packages import importr
from rpy2 import robjects

def fail(message):
  sys.stderr.write(message+'\n')
  sys.exit(1)

args = sys.argv[1:]
if '-r' in args:
  make_report = True
else:
  make_report = False
if len(args) >= 1:
  infile = args[0]
else:
  fail('Error: No input filename provided (as argument 1).')
if len(args) >= 2:
  outfile = args[1]
else:
  fail('Error: No output filename provided (as argument 2).')
if len(args) >= 3:
  report = args[2]
elif make_report:
  fail('Error: No output report filename provided (as argument 3).')

# Check input file
if not os.path.exists(infile):
  fail('Error: Input file '+infile+' could not be found.')
with open(infile, 'r') as lines:
  line = lines.readline()
  if not line:
    fail('Error: Input file seems to be empty')
  line = line.strip().lstrip('#') # rm whitespace, comment chars
  labels = line.split("\t")
  if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.':
    fail('Error: Input file does not seem to have a proper header line.')


utils     = importr('utils')
graphics  = importr('graphics')
base      = importr('base')
stats     = importr('stats')
rprint    = robjects.globalenv.get("print")
grdevices = importr('grDevices')
grdevices.png(file=outfile, width=1024, height=768)

# Read file into a data frame
DATA    = utils.read_delim(infile)
# Remove comment from header, if 
labels = robjects.r.names(DATA)
if labels[0][0:2] == 'X.':
  labels[0] = labels[0][2:]
#robjects.r.assign('data', DATA)
#robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100')
#DATA = robjects.r('data')
#index = data.names.index('MINOR.FREQ.PERC.')
# Multiply minor allele frequencies by 100 to get percentage
#  .rx2() looks up a column by its label and returns it as a vector
#  .ro turns the returned object into one that can be operated on per-element
minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
samples    = DATA.rx2('SAMPLE')
# Formula() creates a Python object representing the R object returned by x ~ y
formula = Formula('minor_freq ~ samples')
# The "environment" in .getenvironment() is the entire R workspace in which the
# Formula object exists. The R workspace meaning all the defined variables.
# Here, the .getenvironment() method is being used to set some variables in the
# R workspace
formula.getenvironment()['minor_freq'] = minor_freq
formula.getenvironment()['samples']    = samples

# create boxplot - fill kwargs1 with the options for the boxplot function
kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"}
p = graphics.boxplot(formula, **kwargs1)

table  = base.table(DATA.rx2('SAMPLE'))
graphics.text(0.5, 1, 'N:', font=2)
for i in range(1, base.length(table)[0]+1, 1):
    graphics.text(i, 1, table[i-1], font=2)

graphlabels = base.names(table)
kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"}
graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3)
grdevices.dev_off()

if not make_report:
  sys.exit(0)


####################################
# GENERATE REPORT
# report should be something like:
# SAMPLE NoHET MEDIAN MAD TEST 
# s1     7      10%   n   p/w/f
# n <= 5      pass
# 6 <= n <=10 warn
# n >= 11     fail
# MAD <= 2.0  fail
# MAD > 2.0    pass
###################################

SAMPLES=[]
for i in range(len(tab)):
    SAMPLES.append(base.names(tab)[i])

def boxstats(data,sample):
    VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
    NoHET  = len(VALUES)
    MEDIAN = numpy.median(VALUES)
    MAD    = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
    return [NoHET,MEDIAN, MAD]

boxreport = open(report, "w+")
boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
for sample in SAMPLES:
    ENTRY = [sample] + boxstats(infile,sample)
    if ENTRY[1] <= 5:
        ENTRY.append('pass')
    elif 6 <= ENTRY[1] <=10:
        ENTRY.append('warn')
    elif ENTRY[1] >= 11:
        ENTRY.append('fail')
    if ENTRY[3] <=2.0:
        ENTRY.append('fail')
    elif ENTRY[3] >2.0:
        ENTRY.append('pass')
    if len(set(ENTRY[4:6])) == 2:
        ENTRY.append('warn')
    else:
        ENTRY.append(list(set(ENTRY[4:6]))[0])
    boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]]))

boxreport.close()






#####################################
#STUFF
#
#kwargs = {'ylab':"Minor allele frequency (sites >= 2%)",'col':"blue", 'axes':"FALSE", 'outpch':"*", 'ylim':"c(-0.03,0.5)", 'main':"Minor allele frequencies run020 at 2%"}
#boxplot(freq~id,data=run020[run020$freq>=0.02,], col="blue", axes=FALSE, outpch="*", ylab="Minor allele frequency (sites >= 2%)", ylim=c(-0.03,0.5), main="Minor allele frequencies run020 at 2%")
#biglabels2=sort(as.vector(unique(run020$id)))
#biglabels=apply(as.matrix(biglabels2), 1,function(x) substr(x,1,5))
#axis(1, at=c(1:length(biglabels)),labels=biglabels, pos=-0.02, las=2, cex.axis=0.9)
#axis(2, at=c(seq(0,50,5)/100), pos=0,cex.axis=0.9)
#nbig=as.vector(table(run020[run020$freq>=0.02,1]))
#for (i in 1:length(nbig)){text(i,-0.01,nbig[i],cex=0.9, font=2)}