comparison hetbox.py @ 6:99cda2ff12b8 draft

Current version - maybe ready to release
author nick
date Tue, 04 Jun 2013 00:06:36 -0400
parents dfa2e75da6aa
children
comparison
equal deleted inserted replaced
5:d38113d9c4d6 6:99cda2ff12b8
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # New in this version: 2 # New in this version:
3 # - option to generate report is triggered simply by including report filename 3 # - Add in proper header line if not present
4 # as third argument
5 4
6 import os,sys,numpy 5 import os,sys,numpy
7 from rpy2.robjects import Formula 6 from rpy2.robjects import Formula
8 from rpy2.robjects.packages import importr 7 from rpy2.robjects.packages import importr
9 from rpy2 import robjects 8 from rpy2 import robjects
10 9
11 def fail(message): 10 def fail(message):
12 sys.stderr.write(message+'\n') 11 sys.stderr.write(message+'\n')
13 sys.exit(1) 12 sys.exit(1)
13
14 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
14 15
15 args = sys.argv[1:] 16 args = sys.argv[1:]
16 if len(args) >= 1: 17 if len(args) >= 1:
17 infile = args[0] 18 infile = args[0]
18 else: 19 else:
25 report = args[2] 26 report = args[2]
26 else: 27 else:
27 report = '' 28 report = ''
28 29
29 # Check input file 30 # Check input file
31 add_header = False
30 if not os.path.exists(infile): 32 if not os.path.exists(infile):
31 fail('Error: Input file '+infile+' could not be found.') 33 fail('Error: Input file '+infile+' could not be found.')
32 with open(infile, 'r') as lines: 34 with open(infile, 'r') as lines:
33 line = lines.readline() 35 line = lines.readline()
34 if not line: 36 if not line:
35 fail('Error: Input file seems to be empty') 37 fail('Error: Input file seems to be empty')
36 line = line.strip().lstrip('#') # rm whitespace, comment chars 38 line = line.strip().lstrip('#') # rm whitespace, comment chars
37 labels = line.split("\t") 39 labels = line.split("\t")
38 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.': 40 if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.':
39 fail('Error: Input file does not seem to have a proper header line.') 41 sys.stderr.write("Error: Input file does not seem to have a proper header "
42 +"line.\nAdding an artificial header..")
43 add_header = True
44
40 45
41 46
42 utils = importr('utils') 47 utils = importr('utils')
43 graphics = importr('graphics') 48 graphics = importr('graphics')
44 base = importr('base') 49 base = importr('base')
46 rprint = robjects.globalenv.get("print") 51 rprint = robjects.globalenv.get("print")
47 grdevices = importr('grDevices') 52 grdevices = importr('grDevices')
48 grdevices.png(file=outfile, width=1024, height=768) 53 grdevices.png(file=outfile, width=1024, height=768)
49 54
50 # Read file into a data frame 55 # Read file into a data frame
51 DATA = utils.read_delim(infile) 56 if add_header:
52 # Remove comment from header, if 57 # add header line manually if not present
53 labels = robjects.r.names(DATA) 58 DATA = utils.read_delim(infile, header=False)
54 if labels[0][0:2] == 'X.': 59 labels = robjects.r.names(DATA)
55 labels[0] = labels[0][2:] 60 for i in range(len(labels)):
56 #robjects.r.assign('data', DATA) 61 try:
57 #robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100') 62 labels[i] = COLUMN_LABELS[i]
58 #DATA = robjects.r('data') 63 except IndexError, e:
59 #index = data.names.index('MINOR.FREQ.PERC.') 64 fail("Error in input file: Too many columns (does not match hardcoded "
65 +"column labels).")
66 else:
67 DATA = utils.read_delim(infile)
68 # Remove comment from header, if present
69 labels = robjects.r.names(DATA)
70 if labels[0][0:2] == 'X.':
71 labels[0] = labels[0][2:]
60 # Multiply minor allele frequencies by 100 to get percentage 72 # Multiply minor allele frequencies by 100 to get percentage
61 # .rx2() looks up a column by its label and returns it as a vector 73 # .rx2() looks up a column by its label and returns it as a vector
62 # .ro turns the returned object into one that can be operated on per-element 74 # .ro turns the returned object into one that can be operated on per-element
63 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 75 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
64 samples = DATA.rx2('SAMPLE') 76 samples = DATA.rx2('SAMPLE')
70 # R workspace 82 # R workspace
71 formula.getenvironment()['minor_freq'] = minor_freq 83 formula.getenvironment()['minor_freq'] = minor_freq
72 formula.getenvironment()['samples'] = samples 84 formula.getenvironment()['samples'] = samples
73 85
74 # create boxplot - fill kwargs1 with the options for the boxplot function 86 # create boxplot - fill kwargs1 with the options for the boxplot function
75 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"} 87 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies", 'cex.lab':"1.5"}
76 p = graphics.boxplot(formula, **kwargs1) 88 p = graphics.boxplot(formula, **kwargs1)
77 89
78 table = base.table(DATA.rx2('SAMPLE')) 90 table = base.table(DATA.rx2('SAMPLE'))
79 graphics.text(0.5, 1, 'N:', font=2) 91 graphics.text(0.5, 1, 'N:', font=2)
80 for i in range(1, base.length(table)[0]+1, 1): 92 for i in range(1, base.length(table)[0]+1, 1):