Mercurial > repos > boris > hetbox
view hetbox.py @ 1:b4f9a4f2f65d draft
Uploaded xml
author | boris |
---|---|
date | Thu, 13 Jun 2013 15:10:09 -0400 |
parents | 153d1a6e8c5e |
children | 57c5ea9c3c5c |
line wrap: on
line source
#!/usr/bin/env python # Code by Boris Rebolledo-Jaramillo # (boris-at-bx.psu.edu) # Edited by Nick Stoler # (nick-at-bx.psu.edu) # New in this version: # - Add in proper header line if not present import os import sys import numpy from rpy2.robjects import Formula from rpy2.robjects.packages import importr from rpy2 import robjects def fail(message): sys.stderr.write(message+'\n') sys.exit(1) COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] args = sys.argv[1:] if len(args) >= 1: infile = args[0] else: fail('Error: No input filename provided (as argument 1).') if len(args) >= 2: outfile = args[1] else: fail('Error: No output filename provided (as argument 2).') if len(args) >= 3: report = args[2] else: report = '' # Check input file add_header = False if not os.path.exists(infile): fail('Error: Input file '+infile+' could not be found.') with open(infile, 'r') as lines: line = lines.readline() if not line: fail('Error: Input file seems to be empty') line = line.strip().lstrip('#') # rm whitespace, comment chars labels = line.split("\t") if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': sys.stderr.write("Error: Input file does not seem to have a proper header " +"line.\nAdding an artificial header..") add_header = True base = importr('base') utils = importr('utils') stats = importr('stats') rprint = robjects.globalenv.get("print") graphics = importr('graphics') grdevices = importr('grDevices') grdevices.png(file=outfile, width=1024, height=768) # Read file into a data frame if add_header: # add header line manually if not present DATA = utils.read_delim(infile, header=False) labels = robjects.r.names(DATA) for i in range(len(labels)): try: labels[i] = COLUMN_LABELS[i] except IndexError, e: fail("Error in input file: Too many columns (does not match hardcoded " +"column labels).") else: DATA = utils.read_delim(infile) # Remove comment from header, if present labels = robjects.r.names(DATA) if labels[0][0:2] == 'X.': labels[0] = labels[0][2:] # Multiply minor allele frequencies by 100 to get percentage # .rx2() looks up a column by its label and returns it as a vector # .ro turns the returned object into one that can be operated on per-element minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 samples = DATA.rx2('SAMPLE') # Formula() creates a Python object representing the R object returned by x ~ y formula = Formula('minor_freq ~ samples') # The "environment" in .getenvironment() is the entire R workspace in which the # Formula object exists. The R workspace meaning all the defined variables. # Here, the .getenvironment() method is being used to set some variables in the # R workspace formula.getenvironment()['minor_freq'] = minor_freq formula.getenvironment()['samples'] = samples # create boxplot - fill kwargs1 with the options for the boxplot function kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies", 'cex.lab':"1.5"} p = graphics.boxplot(formula, **kwargs1) table = base.table(DATA.rx2('SAMPLE')) graphics.text(0.5, 1, 'N:', font=2) for i in range(1, base.length(table)[0]+1, 1): graphics.text(i, 1, table[i-1], font=2) graphlabels = base.names(table) kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3) grdevices.dev_off() if not report: sys.exit(0) #################################### # GENERATE REPORT # report should be something like: # SAMPLE NoHET MEDIAN MAD TEST # s1 7 10% n p/w/f # n <= 5 pass # 6 <= n <=10 warn # n >= 11 fail # MAD <= 2.0 fail # MAD > 2.0 pass ################################### SAMPLES=[] for i in range(len(table)): SAMPLES.append(base.names(table)[i]) def boxstats(data,sample): VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] NoHET = len(VALUES) MEDIAN = numpy.median(VALUES) MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) return [NoHET,MEDIAN, MAD] boxreport = open(report, "w+") boxreport.write("SAMPLE\tTOTAL.SITES\tMEDIAN.FREQ.\tMAD.FREQ\tEVAL\n") for sample in SAMPLES: ENTRY = [sample] + boxstats(infile,sample) if ENTRY[1] <= 5: ENTRY.append('pass') elif 6 <= ENTRY[1] <=10: ENTRY.append('warn') elif ENTRY[1] >= 11: ENTRY.append('fail') if ENTRY[3] <=2.0: ENTRY.append('fail') elif ENTRY[3] >2.0: ENTRY.append('pass') if len(set(ENTRY[4:6])) == 2: ENTRY.append('warn') else: ENTRY.append(list(set(ENTRY[4:6]))[0]) boxreport.write ('%s\t%d\t%.1f\t%.1f\t%s\n' % tuple([ENTRY[i] for i in [0,1,2,3,6]])) boxreport.close()