Mercurial > repos > nick > minor_variant_boxplot
comparison hetbox.py @ 6:99cda2ff12b8 draft
Current version - maybe ready to release
| author | nick |
|---|---|
| date | Tue, 04 Jun 2013 00:06:36 -0400 |
| parents | dfa2e75da6aa |
| children |
comparison
equal
deleted
inserted
replaced
| 5:d38113d9c4d6 | 6:99cda2ff12b8 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # New in this version: | 2 # New in this version: |
| 3 # - option to generate report is triggered simply by including report filename | 3 # - Add in proper header line if not present |
| 4 # as third argument | |
| 5 | 4 |
| 6 import os,sys,numpy | 5 import os,sys,numpy |
| 7 from rpy2.robjects import Formula | 6 from rpy2.robjects import Formula |
| 8 from rpy2.robjects.packages import importr | 7 from rpy2.robjects.packages import importr |
| 9 from rpy2 import robjects | 8 from rpy2 import robjects |
| 10 | 9 |
| 11 def fail(message): | 10 def fail(message): |
| 12 sys.stderr.write(message+'\n') | 11 sys.stderr.write(message+'\n') |
| 13 sys.exit(1) | 12 sys.exit(1) |
| 13 | |
| 14 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | |
| 14 | 15 |
| 15 args = sys.argv[1:] | 16 args = sys.argv[1:] |
| 16 if len(args) >= 1: | 17 if len(args) >= 1: |
| 17 infile = args[0] | 18 infile = args[0] |
| 18 else: | 19 else: |
| 25 report = args[2] | 26 report = args[2] |
| 26 else: | 27 else: |
| 27 report = '' | 28 report = '' |
| 28 | 29 |
| 29 # Check input file | 30 # Check input file |
| 31 add_header = False | |
| 30 if not os.path.exists(infile): | 32 if not os.path.exists(infile): |
| 31 fail('Error: Input file '+infile+' could not be found.') | 33 fail('Error: Input file '+infile+' could not be found.') |
| 32 with open(infile, 'r') as lines: | 34 with open(infile, 'r') as lines: |
| 33 line = lines.readline() | 35 line = lines.readline() |
| 34 if not line: | 36 if not line: |
| 35 fail('Error: Input file seems to be empty') | 37 fail('Error: Input file seems to be empty') |
| 36 line = line.strip().lstrip('#') # rm whitespace, comment chars | 38 line = line.strip().lstrip('#') # rm whitespace, comment chars |
| 37 labels = line.split("\t") | 39 labels = line.split("\t") |
| 38 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': | 40 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': |
| 39 fail('Error: Input file does not seem to have a proper header line.') | 41 sys.stderr.write("Error: Input file does not seem to have a proper header " |
| 42 +"line.\nAdding an artificial header..") | |
| 43 add_header = True | |
| 44 | |
| 40 | 45 |
| 41 | 46 |
| 42 utils = importr('utils') | 47 utils = importr('utils') |
| 43 graphics = importr('graphics') | 48 graphics = importr('graphics') |
| 44 base = importr('base') | 49 base = importr('base') |
| 46 rprint = robjects.globalenv.get("print") | 51 rprint = robjects.globalenv.get("print") |
| 47 grdevices = importr('grDevices') | 52 grdevices = importr('grDevices') |
| 48 grdevices.png(file=outfile, width=1024, height=768) | 53 grdevices.png(file=outfile, width=1024, height=768) |
| 49 | 54 |
| 50 # Read file into a data frame | 55 # Read file into a data frame |
| 51 DATA = utils.read_delim(infile) | 56 if add_header: |
| 52 # Remove comment from header, if | 57 # add header line manually if not present |
| 53 labels = robjects.r.names(DATA) | 58 DATA = utils.read_delim(infile, header=False) |
| 54 if labels[0][0:2] == 'X.': | 59 labels = robjects.r.names(DATA) |
| 55 labels[0] = labels[0][2:] | 60 for i in range(len(labels)): |
| 56 #robjects.r.assign('data', DATA) | 61 try: |
| 57 #robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') | 62 labels[i] = COLUMN_LABELS[i] |
| 58 #DATA = robjects.r('data') | 63 except IndexError, e: |
| 59 #index = data.names.index('MINOR.FREQ.PERC.') | 64 fail("Error in input file: Too many columns (does not match hardcoded " |
| 65 +"column labels).") | |
| 66 else: | |
| 67 DATA = utils.read_delim(infile) | |
| 68 # Remove comment from header, if present | |
| 69 labels = robjects.r.names(DATA) | |
| 70 if labels[0][0:2] == 'X.': | |
| 71 labels[0] = labels[0][2:] | |
| 60 # Multiply minor allele frequencies by 100 to get percentage | 72 # Multiply minor allele frequencies by 100 to get percentage |
| 61 # .rx2() looks up a column by its label and returns it as a vector | 73 # .rx2() looks up a column by its label and returns it as a vector |
| 62 # .ro turns the returned object into one that can be operated on per-element | 74 # .ro turns the returned object into one that can be operated on per-element |
| 63 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 | 75 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 |
| 64 samples = DATA.rx2('SAMPLE') | 76 samples = DATA.rx2('SAMPLE') |
| 70 # R workspace | 82 # R workspace |
| 71 formula.getenvironment()['minor_freq'] = minor_freq | 83 formula.getenvironment()['minor_freq'] = minor_freq |
| 72 formula.getenvironment()['samples'] = samples | 84 formula.getenvironment()['samples'] = samples |
| 73 | 85 |
| 74 # create boxplot - fill kwargs1 with the options for the boxplot function | 86 # create boxplot - fill kwargs1 with the options for the boxplot function |
| 75 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} | 87 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies", 'cex.lab':"1.5"} |
| 76 p = graphics.boxplot(formula, **kwargs1) | 88 p = graphics.boxplot(formula, **kwargs1) |
| 77 | 89 |
| 78 table = base.table(DATA.rx2('SAMPLE')) | 90 table = base.table(DATA.rx2('SAMPLE')) |
| 79 graphics.text(0.5, 1, 'N:', font=2) | 91 graphics.text(0.5, 1, 'N:', font=2) |
| 80 for i in range(1, base.length(table)[0]+1, 1): | 92 for i in range(1, base.length(table)[0]+1, 1): |
