Mercurial > repos > nick > minor_variant_boxplot

--- a/hetbox.py	Thu May 30 15:41:55 2013 -0400
+++ b/hetbox.py	Tue Jun 04 00:06:36 2013 -0400
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # New in this version:
-# - option to generate report is triggered simply by including report filename
-#   as third argument
+# - Add in proper header line if not present

 import os,sys,numpy
 from rpy2.robjects import Formula
@@ -12,6 +11,8 @@
   sys.stderr.write(message+'\n')
   sys.exit(1)

+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
+
 args = sys.argv[1:]
 if len(args) >= 1:
   infile = args[0]
@@ -27,6 +28,7 @@
   report = ''

 # Check input file
+add_header = False
 if not os.path.exists(infile):
   fail('Error: Input file '+infile+' could not be found.')
 with open(infile, 'r') as lines:
@@ -36,7 +38,10 @@
   line = line.strip().lstrip('#') # rm whitespace, comment chars
   labels = line.split("\t")
   if 'SAMPLE' not in labels or labels[11] != 'MINOR.FREQ.PERC.':
-    fail('Error: Input file does not seem to have a proper header line.')
+    sys.stderr.write("Error: Input file does not seem to have a proper header "
+      +"line.\nAdding an artificial header..")
+    add_header = True
+


 utils     = importr('utils')
@@ -48,15 +53,22 @@
 grdevices.png(file=outfile, width=1024, height=768)

 # Read file into a data frame
-DATA    = utils.read_delim(infile)
-# Remove comment from header, if
-labels = robjects.r.names(DATA)
-if labels[0][0:2] == 'X.':
-  labels[0] = labels[0][2:]
-#robjects.r.assign('data', DATA)
-#robjects.r('data$MINOR.FREQ.PERC. = data$MINOR.FREQ.PERC. * 100')
-#DATA = robjects.r('data')
-#index = data.names.index('MINOR.FREQ.PERC.')
+if add_header:
+  # add header line manually if not present
+  DATA = utils.read_delim(infile, header=False)
+  labels = robjects.r.names(DATA)
+  for i in range(len(labels)):
+    try:
+      labels[i] = COLUMN_LABELS[i]
+    except IndexError, e:
+      fail("Error in input file: Too many columns (does not match hardcoded "
+        +"column labels).")
+else:
+  DATA = utils.read_delim(infile)
+  # Remove comment from header, if present
+  labels = robjects.r.names(DATA)
+  if labels[0][0:2] == 'X.':
+    labels[0] = labels[0][2:]
 # Multiply minor allele frequencies by 100 to get percentage
 #  .rx2() looks up a column by its label and returns it as a vector
 #  .ro turns the returned object into one that can be operated on per-element
@@ -72,7 +84,7 @@
 formula.getenvironment()['samples']    = samples

 # create boxplot - fill kwargs1 with the options for the boxplot function
-kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies >= 2%", 'cex.lab':"1.5"}
+kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", 'outpch':"*",'main':"Distribution of minor allele frequencies", 'cex.lab':"1.5"}
 p = graphics.boxplot(formula, **kwargs1)

 table  = base.table(DATA.rx2('SAMPLE'))