hetbox: hetbox.py comparison

comparison hetbox.py @ 0:153d1a6e8c5e draft

Uploaded script

author	boris
date	Thu, 13 Jun 2013 15:09:48 -0400
parents
children	57c5ea9c3c5c

comparison

equal deleted inserted replaced

--1:000000000000
+:153d1a6e8c5e
+#!/usr/bin/env python
+# Code by Boris Rebolledo-Jaramillo
+# (boris-at-bx.psu.edu)
+# Edited by Nick Stoler
+# (nick-at-bx.psu.edu)
+# New in this version:
+# - Add in proper header line if not present
+import os
+import sys
+import numpy
+from rpy2.robjects import Formula
+from rpy2.robjects.packages import importr
+from rpy2 import robjects
+def fail(message):
+sys.stderr.write(message+'\n')
+sys.exit(1)
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG',
+'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
+args = sys.argv[1:]
+if len(args) >= 1:
+infile = args[0]
+else:
+fail('Error: No input filename provided (as argument 1).')
+if len(args) >= 2:
+outfile = args[1]
+else:
+fail('Error: No output filename provided (as argument 2).')
+if len(args) >= 3:
+report = args[2]
+else:
+report = ''
+# Check input file
+add_header = False
+if not os.path.exists(infile):
+fail('Error: Input file '+infile+' could not be found.')
+with open(infile, 'r') as lines:
+line = lines.readline()
+if not line:
+fail('Error: Input file seems to be empty')
+line = line.strip().lstrip('#') # rm whitespace, comment chars
+labels = line.split("\t")
+if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.':
+sys.stderr.write("Error: Input file does not seem to have a proper header "
++"line.\nAdding an artificial header..")
+add_header = True
+base = importr('base')
+utils = importr('utils')
+stats = importr('stats')
+rprint = robjects.globalenv.get("print")
+graphics = importr('graphics')
+grdevices = importr('grDevices')
+grdevices.png(file=outfile, width=1024, height=768)
+# Read file into a data frame
+if add_header:
+# add header line manually if not present
+DATA = utils.read_delim(infile, header=False)
+labels = robjects.r.names(DATA)
+for i in range(len(labels)):
+try:
+labels[i] = COLUMN_LABELS[i]
+except IndexError, e:
+fail("Error in input file: Too many columns (does not match hardcoded "
++"column labels).")
+else:
+DATA = utils.read_delim(infile)
+# Remove comment from header, if present
+labels = robjects.r.names(DATA)
+if labels[0][0:2] == 'X.':
+labels[0] = labels[0][2:]
+# Multiply minor allele frequencies by 100 to get percentage
+#  .rx2() looks up a column by its label and returns it as a vector
+#  .ro turns the returned object into one that can be operated on per-element
+minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
+samples    = DATA.rx2('SAMPLE')
+# Formula() creates a Python object representing the R object returned by x ~ y
+formula = Formula('minor_freq ~ samples')
+# The "environment" in .getenvironment() is the entire R workspace in which the
+# Formula object exists. The R workspace meaning all the defined variables.
+# Here, the .getenvironment() method is being used to set some variables in the
+# R workspace
+formula.getenvironment()['minor_freq'] = minor_freq
+formula.getenvironment()['samples']    = samples
+# create boxplot - fill kwargs1 with the options for the boxplot function
+kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n",
+'outpch':"*",'main':"Distribution of minor allele frequencies",
+'cex.lab':"1.5"}
+p = graphics.boxplot(formula, **kwargs1)
+table  = base.table(DATA.rx2('SAMPLE'))
+graphics.text(0.5, 1, 'N:', font=2)
+for i in range(1, base.length(table)[0]+1, 1):
+graphics.text(i, 1, table[i-1], font=2)
+graphlabels = base.names(table)
+kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"}
+graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3)
+grdevices.dev_off()
+if not report:
+sys.exit(0)
+####################################
+# GENERATE REPORT
+# report should be something like:
+# SAMPLE NoHET MEDIAN MAD TEST
+# s1     7      10%   n   p/w/f
+# n <= 5      pass
+# 6 <= n <=10 warn
+# n >= 11     fail
+# MAD <= 2.0  fail
+# MAD > 2.0    pass
+###################################
+SAMPLES=[]
+for i in range(len(table)):
+SAMPLES.append(base.names(table)[i])
+def boxstats(data,sample):
+VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
+NoHET  = len(VALUES)
+MEDIAN = numpy.median(VALUES)
+MAD    = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
+return [NoHET,MEDIAN, MAD]
+boxreport = open(report, "w+")
+boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n")
+for sample in SAMPLES:
+ENTRY = [sample] + boxstats(infile,sample)
+if ENTRY[1] <= 5:
+ENTRY.append('pass')
+elif 6 <= ENTRY[1] <=10:
+ENTRY.append('warn')
+elif ENTRY[1] >= 11:
+ENTRY.append('fail')
+if ENTRY[3] <=2.0:
+ENTRY.append('fail')
+elif ENTRY[3] >2.0:
+ENTRY.append('pass')
+if len(set(ENTRY[4:6])) == 2:
+ENTRY.append('warn')
+else:
+ENTRY.append(list(set(ENTRY[4:6]))[0])
+boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]]))
+boxreport.close()

Mercurial > repos > boris > hetbox

comparison hetbox.py @ 0:153d1a6e8c5e draft