Mercurial > repos > nick > minor_variant_boxplot
comparison hetbox.py @ 2:3a1ce69571e5 draft
Allow commented lines, de-kludge data frame handling
| author | nick |
|---|---|
| date | Tue, 28 May 2013 16:55:39 -0400 |
| parents | 128db16c9399 |
| children | dfa2e75da6aa |
comparison
equal
deleted
inserted
replaced
| 1:a85dd9968af9 | 2:3a1ce69571e5 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # New in this version: | |
| 3 # - handle commented-out header lines | |
| 4 # - did everything through the Rpy2 interface instead of inline R code | |
| 2 | 5 |
| 3 import os,sys,numpy | 6 import os,sys,numpy |
| 4 from rpy2.robjects import Formula | 7 from rpy2.robjects import Formula |
| 5 from rpy2.robjects.packages import importr | 8 from rpy2.robjects.packages import importr |
| 6 from rpy2 import robjects | 9 from rpy2 import robjects |
| 25 if len(args) >= 3: | 28 if len(args) >= 3: |
| 26 report = args[2] | 29 report = args[2] |
| 27 elif make_report: | 30 elif make_report: |
| 28 fail('Error: No output report filename provided (as argument 3).') | 31 fail('Error: No output report filename provided (as argument 3).') |
| 29 | 32 |
| 33 # Check input file | |
| 30 if not os.path.exists(infile): | 34 if not os.path.exists(infile): |
| 31 fail('Error: Input file '+infile+' could not be found.') | 35 fail('Error: Input file '+infile+' could not be found.') |
| 36 with open(infile, 'r') as lines: | |
| 37 line = lines.readline() | |
| 38 if not line: | |
| 39 fail('Error: Input file seems to be empty') | |
| 40 line = line.strip().lstrip('#') # rm whitespace, comment chars | |
| 41 labels = line.split("\t") | |
| 42 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': | |
| 43 fail('Error: Input file does not seem to have a proper header line.') | |
| 32 | 44 |
| 33 | 45 |
| 34 utils = importr('utils') | 46 utils = importr('utils') |
| 35 graphics = importr('graphics') | 47 graphics = importr('graphics') |
| 36 base = importr('base') | 48 base = importr('base') |
| 39 grdevices = importr('grDevices') | 51 grdevices = importr('grDevices') |
| 40 grdevices.png(file=outfile, width=1024, height=768) | 52 grdevices.png(file=outfile, width=1024, height=768) |
| 41 | 53 |
| 42 # Read file into a data frame | 54 # Read file into a data frame |
| 43 DATA = utils.read_delim(infile) | 55 DATA = utils.read_delim(infile) |
| 56 # Remove comment from header, if | |
| 57 labels = robjects.r.names(DATA) | |
| 58 if labels[0][0:2] == 'X.': | |
| 59 labels[0] = labels[0][2:] | |
| 60 #robjects.r.assign('data', DATA) | |
| 61 #robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') | |
| 62 #DATA = robjects.r('data') | |
| 63 #index = data.names.index('MINOR.FREQ.PERC.') | |
| 44 # Multiply minor allele frequencies by 100 to get percentage | 64 # Multiply minor allele frequencies by 100 to get percentage |
| 45 # Doing this in R, since I can't find a way to modify a column in a data frame | 65 # .rx2() looks up a column by its label and returns it as a vector |
| 46 # in rpy2 | 66 # .ro turns the returned object into one that can be operated on per-element |
| 47 robjects.r.assign('data', DATA) | 67 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 |
| 48 robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') | 68 samples = DATA.rx2('SAMPLE') |
| 49 DATA = robjects.r('data') | |
| 50 #index = data.names.index('MINOR.FREQ.PERC.') | |
| 51 ## .ro turns the returned object into one that can be operated on per-element | |
| 52 #DATA.rx2('MINOR.FREQ.PERC.') = DATA.rx('MINOR.FREQ.PERC.').ro * 100 | |
| 53 # Formula() creates a Python object representing the R object returned by x ~ y | 69 # Formula() creates a Python object representing the R object returned by x ~ y |
| 54 formula = Formula('MINOR.FREQ.PERC.~SAMPLE') | 70 formula = Formula('minor_freq ~ samples') |
| 55 # The "environment" in .getenvironment() is the entire R workspace in which the | 71 # The "environment" in .getenvironment() is the entire R workspace in which the |
| 56 # Formula object exists. The R workspace meaning all the defined variables. | 72 # Formula object exists. The R workspace meaning all the defined variables. |
| 57 # Here, the .getenvironment() method is being used to set some variables in the | 73 # Here, the .getenvironment() method is being used to set some variables in the |
| 58 # R workspace | 74 # R workspace |
| 59 # Note: .rx2() looks up a column by its label and returns it as a vector | 75 formula.getenvironment()['minor_freq'] = minor_freq |
| 60 formula.getenvironment()['MINOR.FREQ.PERC.'] = DATA.rx2('MINOR.FREQ.PERC.') | 76 formula.getenvironment()['samples'] = samples |
| 61 formula.getenvironment()['SAMPLE'] = DATA.rx2('SAMPLE') | |
| 62 | 77 |
| 63 # create boxplot - fill kwargs1 with the options for the boxplot function | 78 # create boxplot - fill kwargs1 with the options for the boxplot function |
| 64 kwargs1 = {'ylab':"Minor allele frequency (%)",'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} | 79 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} |
| 65 p = graphics.boxplot(formula, **kwargs1) | 80 p = graphics.boxplot(formula, **kwargs1) |
| 66 | 81 |
| 67 tab = base.table(DATA.rx2('SAMPLE')) | 82 table = base.table(DATA.rx2('SAMPLE')) |
| 68 graphics.text(0.5,1,'N:',font=2) | 83 graphics.text(0.5, 1, 'N:', font=2) |
| 69 for i in range(1,base.length(tab)[0]+1,1): | 84 for i in range(1, base.length(table)[0]+1, 1): |
| 70 graphics.text(i,1,tab[i-1], font=2) | 85 graphics.text(i, 1, table[i-1], font=2) |
| 71 | 86 |
| 72 graphlabels = base.names(tab) | 87 graphlabels = base.names(table) |
| 73 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} | 88 kwargs3 = {'pos':"0", 'las':"2", 'cex.axis':"1"} |
| 74 graphics.axis(1, at=range(1,len(graphlabels)+1,1),labels=graphlabels, **kwargs3) | 89 graphics.axis(1, at=range(1, len(graphlabels)+1, 1), labels=graphlabels, **kwargs3) |
| 75 grdevices.dev_off() | 90 grdevices.dev_off() |
| 76 | 91 |
| 77 if not make_report: | 92 if not make_report: |
| 78 sys.exit(0) | 93 sys.exit(0) |
| 94 | |
| 79 | 95 |
| 80 #################################### | 96 #################################### |
| 81 # GENERATE REPORT | 97 # GENERATE REPORT |
| 82 # report should be something like: | 98 # report should be something like: |
| 83 # SAMPLE NoHET MEDIAN MAD TEST | 99 # SAMPLE NoHET MEDIAN MAD TEST |
