Mercurial > repos > boris > hetbox
changeset 0:153d1a6e8c5e draft
Uploaded script
author | boris |
---|---|
date | Thu, 13 Jun 2013 15:09:48 -0400 |
parents | |
children | b4f9a4f2f65d |
files | hetbox.py |
diffstat | 1 files changed, 160 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hetbox.py Thu Jun 13 15:09:48 2013 -0400 @@ -0,0 +1,160 @@ +#!/usr/bin/env python +# Code by Boris Rebolledo-Jaramillo +# (boris-at-bx.psu.edu) +# Edited by Nick Stoler +# (nick-at-bx.psu.edu) +# New in this version: +# - Add in proper header line if not present + +import os +import sys +import numpy +from rpy2.robjects import Formula +from rpy2.robjects.packages import importr +from rpy2 import robjects + +def fail(message): + sys.stderr.write(message+'\n') + sys.exit(1) + +COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', + 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] + +args = sys.argv[1:] +if len(args) >= 1: + infile = args[0] +else: + fail('Error: No input filename provided (as argument 1).') +if len(args) >= 2: + outfile = args[1] +else: + fail('Error: No output filename provided (as argument 2).') +if len(args) >= 3: + report = args[2] +else: + report = '' + +# Check input file +add_header = False +if not os.path.exists(infile): + fail('Error: Input file '+infile+' could not be found.') +with open(infile, 'r') as lines: + line = lines.readline() + if not line: + fail('Error: Input file seems to be empty') + line = line.strip().lstrip('#') # rm whitespace, comment chars + labels = line.split("\t") + if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': + sys.stderr.write("Error: Input file does not seem to have a proper header " + +"line.\nAdding an artificial header..") + add_header = True + +base = importr('base') +utils = importr('utils') +stats = importr('stats') +rprint = robjects.globalenv.get("print") +graphics = importr('graphics') +grdevices = importr('grDevices') +grdevices.png(file=outfile, width=1024, height=768) + +# Read file into a data frame +if add_header: + # add header line manually if not present + DATA = utils.read_delim(infile, header=False) + labels = robjects.r.names(DATA) + for i in range(len(labels)): + try: + labels[i] = COLUMN_LABELS[i] + except IndexError, e: + fail("Error in input file: Too many columns (does not match hardcoded " + +"column labels).") +else: + DATA = utils.read_delim(infile) + # Remove comment from header, if present + labels = robjects.r.names(DATA) + if labels[0][0:2] == 'X.': + labels[0] = labels[0][2:] + +# Multiply minor allele frequencies by 100 to get percentage +# .rx2() looks up a column by its label and returns it as a vector +# .ro turns the returned object into one that can be operated on per-element +minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 +samples = DATA.rx2('SAMPLE') + +# Formula() creates a Python object representing the R object returned by x ~ y +formula = Formula('minor_freq ~ samples') +# The "environment" in .getenvironment() is the entire R workspace in which the +# Formula object exists. The R workspace meaning all the defined variables. +# Here, the .getenvironment() method is being used to set some variables in the + +# R workspace +formula.getenvironment()['minor_freq'] = minor_freq +formula.getenvironment()['samples'] = samples + +# create boxplot - fill kwargs1 with the options for the boxplot function +kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", + 'outpch':"*",'main':"Distribution of minor allele frequencies", + 'cex.lab':"1.5"} +p = graphics.boxplot(formula, **kwargs1) +table = base.table(DATA.rx2('SAMPLE')) +graphics.text(0.5, 1, 'N:', font=2) +for i in range(1, base.length(table)[0]+1, 1): + graphics.text(i, 1, table[i-1], font=2) + +graphlabels = base.names(table) +kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} +graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3) +grdevices.dev_off() + +if not report: + sys.exit(0) + + +#################################### +# GENERATE REPORT +# report should be something like: +# SAMPLE NoHET MEDIAN MAD TEST +# s1 7 10% n p/w/f +# n <= 5 pass +# 6 <= n <=10 warn +# n >= 11 fail +# MAD <= 2.0 fail +# MAD > 2.0 pass +################################### + +SAMPLES=[] +for i in range(len(table)): + SAMPLES.append(base.names(table)[i]) + +def boxstats(data,sample): + VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] + NoHET = len(VALUES) + MEDIAN = numpy.median(VALUES) + MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) + return [NoHET,MEDIAN, MAD] + +boxreport = open(report, "w+") +boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n") +for sample in SAMPLES: + ENTRY = [sample] + boxstats(infile,sample) + if ENTRY[1] <= 5: + ENTRY.append('pass') + elif 6 <= ENTRY[1] <=10: + ENTRY.append('warn') + elif ENTRY[1] >= 11: + ENTRY.append('fail') + if ENTRY[3] <=2.0: + ENTRY.append('fail') + elif ENTRY[3] >2.0: + ENTRY.append('pass') + if len(set(ENTRY[4:6])) == 2: + ENTRY.append('warn') + else: + ENTRY.append(list(set(ENTRY[4:6]))[0]) + boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]])) + +boxreport.close() + + + +