Mercurial > repos > nick > minor_variant_boxplot
comparison hetbox.py @ 2:3a1ce69571e5 draft
Allow commented lines, de-kludge data frame handling
author | nick |
---|---|
date | Tue, 28 May 2013 16:55:39 -0400 |
parents | 128db16c9399 |
children | dfa2e75da6aa |
comparison
equal
deleted
inserted
replaced
1:a85dd9968af9 | 2:3a1ce69571e5 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # New in this version: | |
3 # - handle commented-out header lines | |
4 # - did everything through the Rpy2 interface instead of inline R code | |
2 | 5 |
3 import os,sys,numpy | 6 import os,sys,numpy |
4 from rpy2.robjects import Formula | 7 from rpy2.robjects import Formula |
5 from rpy2.robjects.packages import importr | 8 from rpy2.robjects.packages import importr |
6 from rpy2 import robjects | 9 from rpy2 import robjects |
25 if len(args) >= 3: | 28 if len(args) >= 3: |
26 report = args[2] | 29 report = args[2] |
27 elif make_report: | 30 elif make_report: |
28 fail('Error: No output report filename provided (as argument 3).') | 31 fail('Error: No output report filename provided (as argument 3).') |
29 | 32 |
33 # Check input file | |
30 if not os.path.exists(infile): | 34 if not os.path.exists(infile): |
31 fail('Error: Input file '+infile+' could not be found.') | 35 fail('Error: Input file '+infile+' could not be found.') |
36 with open(infile, 'r') as lines: | |
37 line = lines.readline() | |
38 if not line: | |
39 fail('Error: Input file seems to be empty') | |
40 line = line.strip().lstrip('#') # rm whitespace, comment chars | |
41 labels = line.split("\t") | |
42 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': | |
43 fail('Error: Input file does not seem to have a proper header line.') | |
32 | 44 |
33 | 45 |
34 utils = importr('utils') | 46 utils = importr('utils') |
35 graphics = importr('graphics') | 47 graphics = importr('graphics') |
36 base = importr('base') | 48 base = importr('base') |
39 grdevices = importr('grDevices') | 51 grdevices = importr('grDevices') |
40 grdevices.png(file=outfile, width=1024, height=768) | 52 grdevices.png(file=outfile, width=1024, height=768) |
41 | 53 |
42 # Read file into a data frame | 54 # Read file into a data frame |
43 DATA = utils.read_delim(infile) | 55 DATA = utils.read_delim(infile) |
56 # Remove comment from header, if | |
57 labels = robjects.r.names(DATA) | |
58 if labels[0][0:2] == 'X.': | |
59 labels[0] = labels[0][2:] | |
60 #robjects.r.assign('data', DATA) | |
61 #robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') | |
62 #DATA = robjects.r('data') | |
63 #index = data.names.index('MINOR.FREQ.PERC.') | |
44 # Multiply minor allele frequencies by 100 to get percentage | 64 # Multiply minor allele frequencies by 100 to get percentage |
45 # Doing this in R, since I can't find a way to modify a column in a data frame | 65 # .rx2() looks up a column by its label and returns it as a vector |
46 # in rpy2 | 66 # .ro turns the returned object into one that can be operated on per-element |
47 robjects.r.assign('data', DATA) | 67 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 |
48 robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') | 68 samples = DATA.rx2('SAMPLE') |
49 DATA = robjects.r('data') | |
50 #index = data.names.index('MINOR.FREQ.PERC.') | |
51 ## .ro turns the returned object into one that can be operated on per-element | |
52 #DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100 | |
53 # Formula() creates a Python object representing the R object returned by x ~ y | 69 # Formula() creates a Python object representing the R object returned by x ~ y |
54 formula = Formula('MINOR.FREQ.PERC.~SAMPLE') | 70 formula = Formula('minor_freq ~ samples') |
55 # The "environment" in .getenvironment() is the entire R workspace in which the | 71 # The "environment" in .getenvironment() is the entire R workspace in which the |
56 # Formula object exists. The R workspace meaning all the defined variables. | 72 # Formula object exists. The R workspace meaning all the defined variables. |
57 # Here, the .getenvironment() method is being used to set some variables in the | 73 # Here, the .getenvironment() method is being used to set some variables in the |
58 # R workspace | 74 # R workspace |
59 # Note: .rx2() looks up a column by its label and returns it as a vector | 75 formula.getenvironment()['minor_freq'] = minor_freq |
60 formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') | 76 formula.getenvironment()['samples'] = samples |
61 formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE') | |
62 | 77 |
63 # create boxplot - fill kwargs1 with the options for the boxplot function | 78 # create boxplot - fill kwargs1 with the options for the boxplot function |
64 kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} | 79 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} |
65 p = graphics.boxplot(formula, **kwargs1) | 80 p = graphics.boxplot(formula, **kwargs1) |
66 | 81 |
67 tab = base.table(DATA.rx2('SAMPLE')) | 82 table = base.table(DATA.rx2('SAMPLE')) |
68 graphics.text(0.5,1,'N:',font=2) | 83 graphics.text(0.5, 1, 'N:', font=2) |
69 for i in range(1,base.length(tab)[0]+1,1): | 84 for i in range(1, base.length(table)[0]+1, 1): |
70 graphics.text(i,1,tab[i-1], font=2) | 85 graphics.text(i, 1, table[i-1], font=2) |
71 | 86 |
72 graphlabels = base.names(tab) | 87 graphlabels = base.names(table) |
73 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} | 88 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} |
74 graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) | 89 graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3) |
75 grdevices.dev_off() | 90 grdevices.dev_off() |
76 | 91 |
77 if not make_report: | 92 if not make_report: |
78 sys.exit(0) | 93 sys.exit(0) |
94 | |
79 | 95 |
80 #################################### | 96 #################################### |
81 # GENERATE REPORT | 97 # GENERATE REPORT |
82 # report should be something like: | 98 # report should be something like: |
83 # SAMPLE NoHET MEDIAN MAD TEST | 99 # SAMPLE NoHET MEDIAN MAD TEST |