comparison hetbox.py @ 2:3a1ce69571e5 draft

Allow commented lines, de-kludge data frame handling
author nick
date Tue, 28 May 2013 16:55:39 -0400
parents 128db16c9399
children dfa2e75da6aa
comparison
equal deleted inserted replaced
1:a85dd9968af9 2:3a1ce69571e5
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # New in this version:
3 # - handle commented-out header lines
4 # - did everything through the Rpy2 interface instead of inline R code
2 5
3 import os,sys,numpy 6 import os,sys,numpy
4 from rpy2.robjects import Formula 7 from rpy2.robjects import Formula
5 from rpy2.robjects.packages import importr 8 from rpy2.robjects.packages import importr
6 from rpy2 import robjects 9 from rpy2 import robjects
25 if len(args) >= 3: 28 if len(args) >= 3:
26 report = args[2] 29 report = args[2]
27 elif make_report: 30 elif make_report:
28 fail('Error: No output report filename provided (as argument 3).') 31 fail('Error: No output report filename provided (as argument 3).')
29 32
33 # Check input file
30 if not os.path.exists(infile): 34 if not os.path.exists(infile):
31 fail('Error: Input file '+infile+' could not be found.') 35 fail('Error: Input file '+infile+' could not be found.')
36 with open(infile, 'r') as lines:
37 line = lines.readline()
38 if not line:
39 fail('Error: Input file seems to be empty')
40 line = line.strip().lstrip('#') # rm whitespace, comment chars
41 labels = line.split("\t")
42 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.':
43 fail('Error: Input file does not seem to have a proper header line.')
32 44
33 45
34 utils = importr('utils') 46 utils = importr('utils')
35 graphics = importr('graphics') 47 graphics = importr('graphics')
36 base = importr('base') 48 base = importr('base')
39 grdevices = importr('grDevices') 51 grdevices = importr('grDevices')
40 grdevices.png(file=outfile, width=1024, height=768) 52 grdevices.png(file=outfile, width=1024, height=768)
41 53
42 # Read file into a data frame 54 # Read file into a data frame
43 DATA = utils.read_delim(infile) 55 DATA = utils.read_delim(infile)
56 # Remove comment from header, if
57 labels = robjects.r.names(DATA)
58 if labels[0][0:2] == 'X.':
59 labels[0] = labels[0][2:]
60 #robjects.r.assign('data', DATA)
61 #robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100')
62 #DATA = robjects.r('data')
63 #index = data.names.index('MINOR.FREQ.PERC.')
44 # Multiply minor allele frequencies by 100 to get percentage 64 # Multiply minor allele frequencies by 100 to get percentage
45 # Doing this in R, since I can't find a way to modify a column in a data frame 65 # .rx2() looks up a column by its label and returns it as a vector
46 # in rpy2 66 # .ro turns the returned object into one that can be operated on per-element
47 robjects.r.assign('data', DATA) 67 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
48 robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') 68 samples = DATA.rx2('SAMPLE')
49 DATA = robjects.r('data')
50 #index = data.names.index('MINOR.FREQ.PERC.')
51 ## .ro turns the returned object into one that can be operated on per-element
52 #DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100
53 # Formula() creates a Python object representing the R object returned by x ~ y 69 # Formula() creates a Python object representing the R object returned by x ~ y
54 formula = Formula('MINOR.FREQ.PERC.~SAMPLE') 70 formula = Formula('minor_freq ~ samples')
55 # The "environment" in .getenvironment() is the entire R workspace in which the 71 # The "environment" in .getenvironment() is the entire R workspace in which the
56 # Formula object exists. The R workspace meaning all the defined variables. 72 # Formula object exists. The R workspace meaning all the defined variables.
57 # Here, the .getenvironment() method is being used to set some variables in the 73 # Here, the .getenvironment() method is being used to set some variables in the
58 # R workspace 74 # R workspace
59 # Note: .rx2() looks up a column by its label and returns it as a vector 75 formula.getenvironment()['minor_freq'] = minor_freq
60 formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') 76 formula.getenvironment()['samples'] = samples
61 formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE')
62 77
63 # create boxplot - fill kwargs1 with the options for the boxplot function 78 # create boxplot - fill kwargs1 with the options for the boxplot function
64 kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} 79 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"}
65 p = graphics.boxplot(formula, **kwargs1) 80 p = graphics.boxplot(formula, **kwargs1)
66 81
67 tab = base.table(DATA.rx2('SAMPLE')) 82 table = base.table(DATA.rx2('SAMPLE'))
68 graphics.text(0.5,1,'N:',font=2) 83 graphics.text(0.5, 1, 'N:', font=2)
69 for i in range(1,base.length(tab)[0]+1,1): 84 for i in range(1, base.length(table)[0]+1, 1):
70 graphics.text(i,1,tab[i-1], font=2) 85 graphics.text(i, 1, table[i-1], font=2)
71 86
72 graphlabels = base.names(tab) 87 graphlabels = base.names(table)
73 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} 88 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"}
74 graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) 89 graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3)
75 grdevices.dev_off() 90 grdevices.dev_off()
76 91
77 if not make_report: 92 if not make_report:
78 sys.exit(0) 93 sys.exit(0)
94
79 95
80 #################################### 96 ####################################
81 # GENERATE REPORT 97 # GENERATE REPORT
82 # report should be something like: 98 # report should be something like:
83 # SAMPLE NoHET MEDIAN MAD TEST 99 # SAMPLE NoHET MEDIAN MAD TEST