Mercurial > repos > boris > hetbox
annotate hetbox.py @ 4:1c21e9f3104b draft default tip
updated script comments and remove "evaluation" column of the report
author | boris |
---|---|
date | Tue, 25 Jun 2013 00:51:35 -0400 |
parents | 57c5ea9c3c5c |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
4
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
2 |
0 | 3 # Code by Boris Rebolledo-Jaramillo |
4 # (boris-at-bx.psu.edu) | |
5 # Edited by Nick Stoler | |
6 # (nick-at-bx.psu.edu) | |
7 # New in this version: | |
8 # - Add in proper header line if not present | |
9 | |
4
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
10 |
0 | 11 import os |
12 import sys | |
2 | 13 import array |
0 | 14 import numpy |
15 from rpy2.robjects import Formula | |
16 from rpy2.robjects.packages import importr | |
17 from rpy2 import robjects | |
18 | |
19 def fail(message): | |
20 sys.stderr.write(message+'\n') | |
21 sys.exit(1) | |
22 | |
23 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', | |
24 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | |
25 | |
26 args = sys.argv[1:] | |
27 if len(args) >= 1: | |
28 infile = args[0] | |
29 else: | |
30 fail('Error: No input filename provided (as argument 1).') | |
31 if len(args) >= 2: | |
32 outfile = args[1] | |
33 else: | |
34 fail('Error: No output filename provided (as argument 2).') | |
35 if len(args) >= 3: | |
36 report = args[2] | |
37 else: | |
38 report = '' | |
39 | |
40 # Check input file | |
41 add_header = False | |
42 if not os.path.exists(infile): | |
43 fail('Error: Input file '+infile+' could not be found.') | |
44 with open(infile, 'r') as lines: | |
45 line = lines.readline() | |
46 if not line: | |
47 fail('Error: Input file seems to be empty') | |
48 line = line.strip().lstrip('#') # rm whitespace, comment chars | |
49 labels = line.split("\t") | |
50 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': | |
51 sys.stderr.write("Error: Input file does not seem to have a proper header " | |
52 +"line.\nAdding an artificial header..") | |
53 add_header = True | |
54 | |
2 | 55 r = robjects.r |
0 | 56 base = importr('base') |
57 utils = importr('utils') | |
58 stats = importr('stats') | |
59 rprint = robjects.globalenv.get("print") | |
60 graphics = importr('graphics') | |
61 grdevices = importr('grDevices') | |
62 grdevices.png(file=outfile, width=1024, height=768) | |
63 | |
64 # Read file into a data frame | |
65 if add_header: | |
66 # add header line manually if not present | |
67 DATA = utils.read_delim(infile, header=False) | |
68 labels = robjects.r.names(DATA) | |
69 for i in range(len(labels)): | |
70 try: | |
71 labels[i] = COLUMN_LABELS[i] | |
72 except IndexError, e: | |
73 fail("Error in input file: Too many columns (does not match hardcoded " | |
74 +"column labels).") | |
75 else: | |
76 DATA = utils.read_delim(infile) | |
77 # Remove comment from header, if present | |
78 labels = robjects.r.names(DATA) | |
79 if labels[0][0:2] == 'X.': | |
80 labels[0] = labels[0][2:] | |
81 | |
82 # Multiply minor allele frequencies by 100 to get percentage | |
83 # .rx2() looks up a column by its label and returns it as a vector | |
84 # .ro turns the returned object into one that can be operated on per-element | |
85 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 | |
86 samples = DATA.rx2('SAMPLE') | |
87 | |
88 # Formula() creates a Python object representing the R object returned by x ~ y | |
89 formula = Formula('minor_freq ~ samples') | |
90 # The "environment" in .getenvironment() is the entire R workspace in which the | |
91 # Formula object exists. The R workspace meaning all the defined variables. | |
92 # Here, the .getenvironment() method is being used to set some variables in the | |
2 | 93 # R workspace |
0 | 94 |
95 formula.getenvironment()['minor_freq'] = minor_freq | |
96 formula.getenvironment()['samples'] = samples | |
97 | |
2 | 98 |
99 r.par(oma=array.array('i', [0,0,0,0])) | |
100 r.par(mar=array.array('i', [10,4,4,2])) | |
101 ylimit = array.array('i',[-5,50]) | |
102 | |
0 | 103 # create boxplot - fill kwargs1 with the options for the boxplot function |
104 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", | |
105 'outpch':"*",'main':"Distribution of minor allele frequencies", | |
106 'cex.lab':"1.5"} | |
2 | 107 p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1) |
108 | |
109 table = base.table(DATA.rx2('SAMPLE')) | |
4
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
110 graphics.text(0, -1, 'N:', font=2) |
0 | 111 for i in range(1, base.length(table)[0]+1, 1): |
2 | 112 graphics.text(i, -1, table[i-1], font=2) |
0 | 113 |
114 graphlabels = base.names(table) | |
2 | 115 kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"} |
116 graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3) | |
117 graphics.axis(2,at=(range(0,60,10)),pos=0,font=2) | |
0 | 118 grdevices.dev_off() |
119 | |
120 if not report: | |
4
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
121 sys.exit(0) |
0 | 122 |
123 | |
124 SAMPLES=[] | |
125 for i in range(len(table)): | |
126 SAMPLES.append(base.names(table)[i]) | |
127 | |
128 def boxstats(data,sample): | |
129 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] | |
130 NoHET = len(VALUES) | |
131 MEDIAN = numpy.median(VALUES) | |
132 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) | |
133 return [NoHET,MEDIAN, MAD] | |
134 | |
135 boxreport = open(report, "w+") | |
4
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
136 boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n") |
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
137 |
0 | 138 for sample in SAMPLES: |
139 ENTRY = [sample] + boxstats(infile,sample) | |
4
1c21e9f3104b
updated script comments and remove "evaluation" column of the report
boris
parents:
2
diff
changeset
|
140 boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]])) |
0 | 141 boxreport.close() |
142 | |
143 | |
144 | |
145 |