changeset 10:db6f217dc45a

Uploaded tarball. Version 1.1. - Added stranded output. - Added, tested no-filter mode. - now prints all sites. - Changed handling of minor allele ties. - Added test datasets. - Revised help text.
author nick
date Fri, 06 Sep 2013 20:37:07 -0400
parents eca1ea054d0d
children 566e42c97532
files allele-counts.py allele-counts.xml tests/artificial-nofilt.csv.out tests/artificial-nofilt.vcf.in tests/artificial-samples.csv.out tests/artificial-samples.vcf.in tests/artificial.csv.out tests/artificial.vcf.in tests/compute-site.py tests/errors.vcf.in tests/real-mit-s.csv.out tests/real-mit-s.vcf.in tests/real-mit.csv.out tests/real-mit.vcf.in tests/real-nofilt.csv.out tests/real-nofilt.vcf.in tests/real.csv.out tests/real.vcf.in tests/run-tests.py
diffstat 19 files changed, 837 insertions(+), 136 deletions(-) [+]
line wrap: on
line diff
--- a/allele-counts.py	Tue Jun 25 11:26:49 2013 -0400
+++ b/allele-counts.py	Fri Sep 06 20:37:07 2013 -0400
@@ -1,25 +1,25 @@
 #!/usr/bin/python
 # This parses the output of Dan's "Naive Variant Detector" (previously,
 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
+# Or, to put it briefly,
+# cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
 #
 # New in this version:
 #   Made header line customizable
 #     - separate from internal column labels, which are used as dict keys
-#
-# TODO:
-# - test handling of -c 0 (and -f 0?)
-# - should it technically handle data lines that start with a '#'?
 import os
 import sys
+import random
 from optparse import OptionParser
 
 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
 COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
-USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
-       %prog [options] -i variants.vcf -o alleles.csv"""
+USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
+       cat variants.vcf | %prog [options] > alleles.csv"""
 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
-  'print_header':False, 'stdin':False}
+  'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
+  'debug_loc':'', 'seed':''}
 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
 EPILOG = """Requirements:
 The input VCF must report the variants for each strand.
@@ -40,27 +40,35 @@
     help='Print output data to this file instead of stdout.')
   parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
     default=defaults.get('freq_thres'),
-    help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
+    help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
+      +'= 1% frequency. Default is %default%.'))
   parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
     default=defaults.get('covg_thres'),
-    help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
+    help=('Coverage threshold. Each site must be supported by at least this '
+      +'many reads on each strand. Otherwise the site will not be printed in '
+      +'the output. The default is %default reads per strand.'))
+  parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const',
+    const=not(defaults.get('no_filter')), default=defaults.get('no_filter'),
+    help=('Operate without a frequency threshold or coverage threshold. '
+      +'Equivalent to "-c 0 -f 0".'))
   parser.add_option('-H', '--header', dest='print_header', action='store_const',
     const=not(defaults.get('print_header')), default=defaults.get('print_header'),
-    help='Print header line. This is a #-commented line with the column labels. Off by default.')
-  parser.add_option('-d', '--debug', dest='debug', action='store_true',
-    default=False,
-    help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
+    help=('Print header line. This is a #-commented line with the column '
+      +'labels. Off by default.'))
+  parser.add_option('-s', '--stranded', dest='stranded', action='store_const',
+    const=not(defaults.get('stranded')), default=defaults.get('stranded'),
+    help='Report variant counts by strand, in separate columns. Off by default.')
+  parser.add_option('-r', '--rand-seed', dest='seed',
+    default=defaults.get('seed'), help=('Seed for random number generator.'))
+  parser.add_option('-d', '--debug', dest='debug_loc',
+    default=defaults.get('debug_loc'),
+    help=('Turn on debug mode and specify a single site to process using UCSC '
+      +'coordinate format. You can also append a sample ID after another ":" '
+      +'to restrict it further.'))
 
   (options, args) = parser.parse_args()
 
-  # read in positional arguments
-  arguments = {}
-  if options.debug:
-    if len(args) >= 1:
-      arguments['print_loc'] = args[0]
-      args.remove(args[0])
-
-  return (options, arguments)
+  return (options, args)
 
 
 def main():
@@ -72,19 +80,26 @@
   print_header = options.print_header
   freq_thres = options.freq_thres / 100.0
   covg_thres = options.covg_thres
-  debug = options.debug
+  stranded = options.stranded
+  debug_loc = options.debug_loc
+  seed = options.seed
+  
+  if options.no_filter:
+    freq_thres = 0
+    covg_thres = 0
 
-  if debug:
-    print_loc = args.get('print_loc')
-    if print_loc:
-      if ':' in print_loc:
-        (print_chr, print_pos) = print_loc.split(':')
-      else:
-        print_pos = print_loc
-    else:
-      sys.stderr.write("Warning: No site coordinate found in arguments. "
-        +"Turning off debug mode.\n")
-      debug = False
+  if seed:
+    random.seed(seed)
+
+  debug = False
+  print_sample = ''
+  if debug_loc:
+    debug = True
+    coords = debug_loc.split(':')
+    print_chr = coords[0]
+    print_pos = ''
+    if len(coords) > 1: print_pos = coords[1]
+    if len(coords) > 2: print_sample = coords[2]
 
   # set infile_handle to either stdin or the input file
   if infile == OPT_DEFAULTS.get('infile'):
@@ -105,12 +120,19 @@
     except IOError, e:
       fail('Error: The given output filename '+outfile+' could not be opened.')
 
+  # Take care of column names, print header
   if len(COLUMNS) != len(COLUMN_LABELS):
-    fail('Error: Internal column names do not match column labels.')
+    fail('Error: Internal column names list do not match column labels list.')
+  if stranded:
+    COLUMNS[3:7]       = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
+    COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
   if print_header:
     outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
 
-  # main loop: process and print one line at a time
+  # main loop
+  # each iteration processes one VCF line and prints one or more output lines
+  # one VCF line    = one site, one or more samples
+  # one output line = one site, one sample
   sample_names = []
   for line in infile_handle:
     line = line.rstrip('\r\n')
@@ -128,19 +150,24 @@
     site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
 
     if debug:
-      if site_data['pos'] != print_pos:
+      if print_pos != site_data['pos']:
+        continue
+      if print_chr != site_data['chr'] and print_chr != '':
         continue
-      try:
-        if site_data['chr'] != print_chr:
-          continue
-      except NameError, e:
-        pass  # No chr specified. Just go ahead and print the line.
+      if print_sample != '':
+        for sample in site_data['samples'].keys():
+          if sample.lower() != print_sample.lower():
+            site_data['samples'].pop(sample, None)
+        if len(site_data['samples']) == 0:
+          sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
+          sys.exit(1)
+
 
     site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
-      freq_thres, covg_thres, debug=debug)
+      freq_thres, covg_thres, stranded, debug=debug)
 
     if debug and site_summary[0]['print']:
-      print line.split('\t')[9].split(':')[-1]
+        print line.split('\t')[9].split(':')[-1]
 
     print_site(outfile_handle, site_summary, COLUMNS)
 
@@ -158,8 +185,27 @@
 def read_site(line, sample_names, canonical):
   """Read in a line, parse the variants into a data structure, and return it.
   The line should be actual site data, not a header line, so check beforehand.
-  Notes:
-  - The line is assumed to have been chomped."""
+  Only the variants in 'canonical' will be read; all others are ignored.
+  Note: the line is assumed to have been chomped.
+  The returned data is stored in a dict, with the following structure:
+  {
+    'chr': 'chr1',
+    'pos': '2617',
+    'samples': {
+      'THYROID': {
+        '+A': 32,
+        '-A': 45,
+        '-G': 1,
+      },
+      'BLOOD': {
+        '+A': 2,
+        '-C': 1,
+        '+G': 37,
+        '-G': 42,
+      },
+    },
+  }
+  """
   
   site = {}
   fields = line.split('\t')
@@ -212,7 +258,7 @@
 
 
 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
-  debug=False):
+  stranded=False, debug=False):
   """Take the raw data from the VCF line and transform it into the summary data
   to be printed in the output format."""
 
@@ -221,9 +267,6 @@
 
     sample = {'print':False}
     variants = site['samples'].get(sample_name)
-    if not variants:
-      site_summary.append(sample)
-      continue
 
     sample['sample'] = sample_name
     sample['chr']    = site['chr']
@@ -240,31 +283,49 @@
       elif variant[0] == '-':
         covg_minus += variants[variant]
     # stranded coverage threshold
-    if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
+    if covg_plus < covg_thres or covg_minus < covg_thres:
       site_summary.append(sample)
       continue
     else:
       sample['print'] = True
 
-    # get an ordered list of read counts for all variants (either strand)
-    ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
+    # get an ordered list of read counts for all variants (both strands)
+    bases = get_read_counts(variants, '+-')
+    ranked_bases = process_read_counts(bases, sort=True, debug=debug)
+
+    # prepare stranded or unstranded lists of base counts
+    base_count_lists = []
+    if stranded:
+      strands = ['+', '-']
+      base_count_lists.append(get_read_counts(variants, '+'))
+      base_count_lists.append(get_read_counts(variants, '-'))
+    else:
+      strands = ['']
+      base_count_lists.append(ranked_bases)
 
-    # record read counts into dict for this sample
-    for base in ranked_bases:
-      sample[base[0]] = base[1]
-    # fill in any zeros
-    for variant in canonical:
-      if not sample.has_key(variant):
-        sample[variant] = 0
+    # record read counts into output dict
+    # If stranded, this will loop twice, once for each strand, and prepend '+'
+    # or '-' to the base name. If not stranded, it will loop once, and prepend
+    # nothing ('').
+    for (strand, base_count_list) in zip(strands, base_count_lists):
+      for base_count in base_count_list:
+        sample[strand+base_count[0]] = base_count[1]
+      # fill in any zeros
+      for base in canonical:
+        if not sample.has_key(strand+base):
+          sample[strand+base] = 0
 
-    sample['alleles']  = count_alleles(variants, freq_thres, debug=debug)
+    sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
 
-    # set minor allele to N if there's a tie for 2nd
+    # If there's a tie for 2nd, randomly choose one to be 2nd
     if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
-      ranked_bases[1] = ('N', 0)
-      sample['alleles'] = 1 if sample['alleles'] else 0
+      swap = random.choice([True,False])
+      if swap:
+        tmp_base = ranked_bases[1]
+        ranked_bases[1] = ranked_bases[2]
+        ranked_bases[2] = tmp_base
 
-    if debug: print ranked_bases
+    if debug: print "ranked +-: "+str(ranked_bases)
 
     sample['coverage'] = coverage
     try:
@@ -283,67 +344,63 @@
   return site_summary
 
 
-def print_site(filehandle, site, columns):
-  """Print the output lines for one site (one per sample).
-  filehandle must be open."""
-  for sample in site:
-    if sample['print']:
-      fields = [str(sample.get(column)) for column in columns]
-      filehandle.write('\t'.join(fields)+"\n")
+def get_read_counts(stranded_counts, strands='+-'):
+  """Do a simple sum of the read counts per variant, on the specified strands.
+      Arguments:
+  stranded_counts: Dict of the stranded variants (keys) and their read counts
+    (values).
+  strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
+      Return value:
+  summed_counts: A list of the alleles and their read counts. The elements are
+    tuples (variant, read count)."""
+
+  variants = stranded_counts.keys()
+
+  summed_counts = {}
+  for variant in variants:
+    strand = variant[0]
+    base = variant[1:]
+    if strand in strands:
+      summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
+
+  return summed_counts.items()
 
 
-def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
-  """Count the number of reads for each base, and create a ranked list of
-  alleles passing the frequency threshold.
+def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
+  """Process a list of read counts by frequency filtering and/or sorting.
       Arguments:
-  variant_counts: Dict of the stranded variants (keys) and their read counts (values).
+  variant_counts: List of the non-stranded variants and their read counts. The
+    elements are tuples (variant, read count).
   freq_thres: The frequency threshold each allele needs to pass to be included.
-  strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
-  variants: A list of the variants of interest. Other types of variants will not
-    be included in the returned list. If no list is given, all variants found in
-    the variant_counts will be used.
+  sort: Whether to sort the list in descending order of read counts.
       Return value:
-  ranked_bases: A list of the alleles and their read counts. The elements are
-    tuples (base, read count). The alleles are listed in descending order of
-    frequency, and only those passing the threshold are included."""
-
-  # Get list of all variants from variant_counts list
-  variants = [variant[1:] for variant in variant_counts]
-  # deduplicate via a dict
-  variant_dict = dict((variant, 1) for variant in variants)
-  variants = variant_dict.keys()
-
-  ranked_bases = []
-  for variant in variants:
-    reads = 0
-    for strand in strands:
-      reads += variant_counts.get(strand+variant, 0)
-    ranked_bases.append((variant, reads))
+  variant_counts: A list of the processed alleles and their read counts. The
+    elements are tuples (variant, read count)."""
 
   # get coverage for the specified strands
   coverage = 0
   for variant in variant_counts:
-    if variant[0] in strands:
-      coverage += variant_counts.get(variant, 0)
-  # if debug: print "strands: "+strands+', covg: '+str(coverage)
+    coverage += variant[1]
 
-  if coverage < 1:
+  if coverage <= 0:
     return []
 
   # sort the list of alleles by read count
-  ranked_bases.sort(reverse=True, key=lambda base: base[1])
+  if sort:
+    variant_counts.sort(reverse=True, key=lambda variant: variant[1])
 
   if debug:
-    print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
-    for base in ranked_bases:
-      print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
-        str(base[1]/float(coverage)))
+    print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
+    for variant in variant_counts:
+      print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
+        str(variant[1]/float(coverage)))
 
   # remove bases below the frequency threshold
-  ranked_bases = [base for base in ranked_bases
-    if base[1]/float(coverage) >= freq_thres]
+  if freq_thres > 0:
+    variant_counts = [variant for variant in variant_counts
+      if variant[1]/float(coverage) >= freq_thres]
 
-  return ranked_bases
+  return variant_counts
 
 
 def count_alleles(variant_counts, freq_thres, debug=False):
@@ -354,16 +411,19 @@
   is zero."""
   allele_count = 0
 
-  alleles_plus  = get_read_counts(variant_counts, freq_thres, debug=debug,
-    strands='+')
-  alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
-    strands='-')
+  alleles_plus  = get_read_counts(variant_counts, '+')
+  alleles_plus  = process_read_counts(alleles_plus, freq_thres=freq_thres,
+    sort=False, debug=debug)
+  alleles_minus = get_read_counts(variant_counts, '-')
+  alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
+    sort=False, debug=debug)
 
   if debug:
     print '+ '+str(alleles_plus)
     print '- '+str(alleles_minus)
 
-  # check if each strand reports the same set of alleles
+  # Check if each strand reports the same set of alleles.
+  # Sorting by base is to compare lists without regard to order (as sets).
   alleles_plus_sorted  = sorted([base[0] for base in alleles_plus if base[1]])
   alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
   if alleles_plus_sorted == alleles_minus_sorted:
@@ -372,9 +432,19 @@
   return allele_count
 
 
+def print_site(filehandle, site, columns):
+  """Print the output lines for one site (one per sample).
+  filehandle must be open."""
+  for sample in site:
+    if sample['print']:
+      fields = [str(sample.get(column)) for column in columns]
+      filehandle.write('\t'.join(fields)+"\n")
+
+
 def fail(message):
   sys.stderr.write(message+'\n')
   sys.exit(1)
 
+
 if __name__ == "__main__":
   main()
\ No newline at end of file
--- a/allele-counts.xml	Tue Jun 25 11:26:49 2013 -0400
+++ b/allele-counts.xml	Fri Sep 06 20:37:07 2013 -0400
@@ -1,10 +1,12 @@
-<tool id="allele_counts_1" version="1.0" name="Variant Annotator">
-  <description></description>
-  <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header</command>
+<tool id="allele_counts_1" version="1.1" name="Variant Annotator">
+  <description> process variant counts</description>
+  <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt</command>
   <inputs>
     <param name="input" type="data" format="vcf" label="Input variants from Naive Variants Detector"/>
     <param name="freq" type="float" value="1.0" min="0" max="100" label="Minor allele frequency threshold (in percent)"/>
-    <param name="covg" type="integer" value="10" min="0" label="Coverage threshold (per strand)"/>
+    <param name="covg" type="integer" value="10" min="0" label="Coverage threshold (in reads per strand)"/>
+    <param name="nofilt" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Do not filter sites or alleles" />
+    <param name="stranded" type="boolean" truevalue="-s" falsevalue="" checked="False" label="Output stranded base counts" />
     <param name="header" type="boolean" truevalue="-H" falsevalue="" checked="True" label="Write header line" />
   </inputs>
   <outputs>
@@ -21,40 +23,51 @@
 
 **What it does**
 
-This tool parses variant counts from a special VCF file (normally the output of the **Naive Variant Detector** tool). It counts simple (ACGT) variants, calculates numbers of alleles, and calculates minor allele frequency. It applies filters based on coverage, strand bias, and minor allele frequency cutoffs.
+This tool parses variant counts from a special VCF file. It counts simple variants, calculates numbers of alleles, and calculates minor allele frequency. It can apply filters based on coverage, strand bias, and minor allele frequency cutoffs.
 
 -----
 
+.. class:: infomark
+
+**Input Format**
+
 .. class:: warningmark
 
-**Note**
+**Note:** variants that are not A/C/G/T SNVs will be ignored!
 
-The VCF must have a certain genotype field in the sample columns, giving the read count of each type of variant. Also, the variant data **must be stranded**. The **Naive Variant Detector** tool produces this type of VCF.
+The input VCF should be like the output of the **Naive Variant Detector** tool (using the stranded option). The sample column(s) must give the read count for each variant **on each strand**. Below is an example of a valid sample column entry (the important part is after the last colon)::
+
+    0/0:1:0.02:+T=27,+G=1,-T=22,
 
 -----
 
 .. class:: infomark
 
-**Output columns**
+**Output**
 
-Each row represents one site in one sample. 12 fields give information about that site::
+Each row represents one site in one sample. For unstranded output, 12 fields give information about that site::
 
-    1.  SAMPLE  - Sample names (from VCF sample column labels)
+    1.  SAMPLE  - Sample name (from VCF sample column labels)
     2.  CHR     - Chromosome of the site
     3.  POS     - Chromosomal coordinate of the site
     4.  A       - Number of reads supporting an 'A'
-    5.  C       - ditto, for 'C'
-    6.  G       - ditto, for 'G'
-    7.  T       - ditto, for 'T'
+    5.  C       - 'C' reads
+    6.  G       - 'G' reads
+    7.  T       - 'T' reads
     8.  CVRG    - Total (number of reads supporting one of the four bases above)
     9.  ALLELES - Number of qualifying alleles
-    10. MAJOR   - Major allele base
-    11. MINOR   - Minor allele base (2nd most prevalent variant)
+    10. MAJOR   - Major allele
+    11. MINOR   - Minor allele (2nd most prevalent variant)
     12. MINOR.FREQ.PERC. - Frequency of minor allele
 
+For stranded output, instead of using 4 columns to report read counts per base, 8 are used to report the stranded counts per base::
+
+    1       2   3   4  5  6  7  8  9 10 11  12    13     14    15         16
+    SAMPLE CHR POS +A +C +G +T -A -C -G -T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC.
+
 **Example**
 
-This is the header line, followed by some example data lines. Note that some samples and/or sites will not be included in the output, if they fall below the coverage threshold::
+Below is a header line, followed by some example data lines. Since the input contained three samples, the data for each site is reported on three consecutive lines. However, if a sample fell below the coverage threshold at that site, the line will be omitted::
 
     #SAMPLE  CHR    POS  A   C    G    T  CVRG  ALLELES  MAJOR  MINOR  MINOR.FREQ.PERC.
     BLOOD_1  chr20  99   0   101  1    2  104   1        C      T      0.01923
@@ -69,19 +82,17 @@
 
 **Site printing and allele tallying requirements**
 
-Each line is printed only when the site is covered by the threshold number of reads **on each strand**. If coverage of either strand is below the threshold, the line (sample + site combination) is omitted.
+Coverage threshold:
 
-**N.B.**: This means the total coverage for each printed site will be at least twice the number you give in the "coverage threshold" option.
+If a coverage threshold is used, the number of reads **on each strand** must be at or above the threshold. If either strand is below the threshold, the line will be omitted. **N.B.** this means the total coverage for each printed site will be at least twice the number you give in the "coverage threshold" option. Also, since only simple variants are counted, a site with 100 reads, all supporting a deletion variant, would not be printed.
 
-Also, reads supporting a variant outside the canonical 4 nucleotides will not count towards the coverage requirement. For instance, a site/sample line with 100x coverage, all of which support a deletion variant, will not be printed.
+Frequency threshold:
 
-Alleles are only counted (in column 9) if they meet or exceed the minor allele frequency threshold. So a site/sample line with types of variants, 96% A, 3.3% C, and 0.7% G, will count as 2 alleles (at 1% threshold).
-
-Strand bias: the alleles passing the threshold on each strand have to match (though not in order). Otherwise, the allele count will be 0. So a site/sample line whose + strand shows 70% A, 27% C, and 3% G, and - strand shows 70% A and 30% C will have an allele count of 0. The minor allele and minor allele frequency, though, will always be reported\*.
+If a frequency threshold is used, alleles are only counted (in the ALLELES column) if they meet or exceed this minor allele frequency threshold.
 
-But in this version, there is no requirement that the strands show similar allele frequencies, as long as they both pass the threshold.
+Strand bias:
 
-\*One specific case will actually affect the reported minor allele identity and frequency. If there is a tie for the minor allele (between the 2nd and 3rd most common alleles), the minor allele will be reporated as 'N', and the frequency as 0.0.
+The alleles passing the threshold on each strand must match (though not in order), or the allele count will be 0. So a site with A, C, G on the plus strand and A, G on the minus strand will get an allele count of zero, though the (strand-independent) major allele, minor allele, and minor allele frequency will still be reported. If there is a tie for the minor allele, one will be randomly chosen.
 
   </help>
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/artificial-nofilt.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,27 @@
+#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
+THYROID	chr1	0	30	0	0	0	30	1	A	.	0.0
+THYROID	chr1	10	30	0	2	0	32	2	A	G	0.0625
+THYROID	chr1	20	31	0	1	0	32	0	A	G	0.03125
+THYROID	chr1	30	21	0	4	0	25	2	A	G	0.16
+THYROID	chr1	40	22	0	3	0	25	0	A	G	0.12
+THYROID	chr1	50	3	0	0	0	3	1	A	.	0.0
+THYROID	chr1	60	2	0	2	0	4	2	A	G	0.5
+THYROID	chr1	70	1	0	3	0	4	0	G	A	0.25
+THYROID	chr1	80	104	0	3	0	107	0	A	G	0.02804
+THYROID	chr1	90	100	2	11	0	113	3	A	G	0.09735
+THYROID	chr1	100	100	1	11	0	112	0	A	G	0.09821
+THYROID	chr1	120	0	0	0	0	0	0	.	.	0.0
+THYROID	chr1	130	0	0	2	0	2	1	G	.	0.0
+THYROID	chr1	140	0	0	1	0	1	0	G	.	0.0
+THYROID	chr1	150	0	0	4	0	4	1	G	.	0.0
+THYROID	chr1	160	0	0	3	0	3	0	G	.	0.0
+THYROID	chr1	260	106	0	14	0	120	2	A	G	0.11667
+THYROID	chr1	300	2	0	2	76	80	3	T	G	0.025
+THYROID	chr1	310	12	0	12	76	100	3	T	G	0.12
+THYROID	chr1	320	12	0	12	56	80	3	T	A	0.15
+THYROID	chr1	330	7	0	7	66	80	3	T	G	0.0875
+THYROID	chr1	340	1	0	1	98	100	0	T	G	0.01
+THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11
+THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2
+THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02
+THYROID	chr1	420	104	0	0	0	104	1	A	.	0.0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/artificial-nofilt.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,50 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-H -r 1 -f 0 -c 0"
+##comment="This is a test set of made-up sites, each created in order to test certain functionality. It's meant to be run with -f 10 -c 10"
+##fileDate=19700101
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	THYROID
+# General note: the only data made consistent is the CHROM, POS, REF, ALT, and the variant data (after the ':'). The other stuff isn't supposed to be consistent.
+# Simplest case, but POS 0 and no minor allele
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,
+# Simple, normal cases of A/G variants: above/below threshold x strand bias/no strand bias (2 x 2 = 4 cases)
+chr1	10	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,-A=15,-G=1,
+chr1	20	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,-A=16,
+chr1	30	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=10,+G=2,-A=11,-G=2,
+chr1	40	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+G=3,-A=11,
+# Really low coverage
+chr1	50	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=2,-A=1,
+chr1	60	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=1,+G=1,-A=1,-G=1
+chr1	70	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=1,+G=1,-G=2,
+chr1	80	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=102,+G=3,-A=2,
+# Very low frequency tertiary allele
+chr1	90	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=50,+G=5,+C=1,-A=50,-G=6,-C=1,
+chr1	100	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=50,+G=5,-A=50,-G=6,-C=1,
+# Only non-canonical alleles
+chr1	120	.	d1	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,-d1=15,
+# Same 4 cases, but with non-canonical major allele (d1)
+chr1	130	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,+G=1,-d1=15,-G=1,
+chr1	140	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,+G=1,-d1=16,
+chr1	150	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=10,+G=2,-d1=11,-G=2,
+chr1	160	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=11,+G=3,-d1=11,
+# Test case where minor allele is above threshold on only one strand because of different coverage. Also, a long decimal minor allele frequency.
+chr1	260	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=13,+G=7,-A=93,-G=7,
+# Test case where minor alleles have equal frequency: Above/below threshold, +/- strand bias
+chr1	300	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=1,+G=1,+T=38,-A=1,-G=1,-T=38,
+chr1	310	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=38,
+chr1	320	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=18,
+chr1	330	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=6,+G=6,+T=38,-A=1,-G=1,-T=28,
+chr1	340	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=1,+T=80,-G=1,-T=18,
+chr1	350	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+T=60,-G=11,-T=18,
+# Case where + and - variants are interleaved with each other. Also, a long decimal result for the minor allele frequency.
+chr1	400	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=16,-A=16,+G=4,-G=4,
+# Test complex data in the ALT, INFO, and sample (before the ':') columns
+chr1	410	.	T	G,A	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=16,-G=2,
+chr1	420	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:::+A=82,-A=22,
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/artificial-samples.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,13 @@
+BRAIN	chr1	0	30	0	0	0	30	1	A	.	0.0
+ARTERY	chr1	0	0	0	30	0	30	1	G	.	0.0
+THYROID	chr1	0	0	30	0	0	30	1	C	.	0.0
+BRAIN	chr1	10	30	0	0	0	30	1	A	.	0.0
+ARTERY	chr1	10	30	0	2	0	32	1	A	G	0.0625
+THYROID	chr1	10	31	0	1	0	32	1	A	G	0.03125
+BRAIN	chr1	20	30	0	2	0	32	1	A	G	0.0625
+ARTERY	chr1	20	34	0	6	0	40	2	A	G	0.15
+THYROID	chr1	20	30	0	2	0	32	0	A	G	0.0625
+BRAIN	chr1	30	30	0	0	0	30	1	A	.	0.0
+BRAIN	chr1	40	0	0	0	30	30	1	T	.	0.0
+ARTERY	chr1	40	1	0	2	97	100	0	T	G	0.02
+THYROID	chr1	40	0	69	0	31	100	0	C	T	0.31
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/artificial-samples.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,23 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-f 10 -c 10"
+##comment="This is a test set of made-up sites, in this case to test the handling of multiple read groups. It's meant to be run with -f 10 -c 10"
+##fileDate=19700101
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	BRAIN	ARTERY	THYROID
+# naive case
+chr1	0	.	A	G,C	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,	0/0:1:1:+G=15,-G=15,	0/0:1:1:+C=15,-C=15,
+# cases all with one allele, but different minor allele numbers below threshold
+chr1	10	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,	0/0:1:1:+A=15,+G=1,-A=15,-G=1,	0/0:1:1:+A=15,+G=1,-A=16,
+# cases with different numbers of alleles
+chr1	20	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,-A=15,-G=1,	0/0:1:1:+A=17,+G=3,-A=17,-G=3,	0/0:1:1:+A=15,+G=2,-A=15,
+# cases where some shouldn't be printed
+chr1	30	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,	0/0:1:1:+A=9,-A=8,	0/0:1:1:+A=15,-A=9,
+# cases with some complex pre-':' data
+chr1	40	.	T	A,G,C	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1:1:+T=15,-T=15,	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=16,-G=2,	0/0:3,1:0.0394736842105,0.0131578947368:+C=52,+T=30,-C=17,-T=1,
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/artificial.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,35 @@
+THYROID	chr1	0	30	0	0	0	30	1	A	.	0.0
+THYROID	chr1	10	30	0	2	0	32	1	A	G	0.0625
+THYROID	chr1	20	31	0	1	0	32	1	A	G	0.03125
+THYROID	chr1	30	21	0	4	0	25	2	A	G	0.16
+THYROID	chr1	40	22	0	3	0	25	0	A	G	0.12
+THYROID	chr1	50	30	0	0	0	30	1	A	.	0.0
+THYROID	chr1	60	31	0	0	0	31	1	A	.	0.0
+THYROID	chr1	70	21	0	0	0	21	1	A	.	0.0
+THYROID	chr1	80	22	0	0	0	22	1	A	.	0.0
+THYROID	chr1	82	30	0	2	0	32	1	A	G	0.0625
+THYROID	chr1	84	31	0	1	0	32	1	A	G	0.03125
+THYROID	chr1	86	21	0	4	0	25	2	A	G	0.16
+THYROID	chr1	88	22	0	3	0	25	0	A	G	0.12
+THYROID	chr1	90	30	0	0	0	30	1	A	.	0.0
+THYROID	chr1	100	31	0	0	0	31	1	A	.	0.0
+THYROID	chr1	110	21	0	0	0	21	1	A	.	0.0
+THYROID	chr1	120	22	0	0	0	22	1	A	.	0.0
+THYROID	chr1	210	20	0	0	0	20	1	A	.	0.0
+THYROID	chr1	220	22	0	0	0	22	1	A	.	0.0
+THYROID	chr1	230	182	0	18	0	200	1	A	G	0.09
+THYROID	chr1	240	180	0	20	0	200	2	A	G	0.1
+THYROID	chr1	250	178	0	22	0	200	2	A	G	0.11
+THYROID	chr1	260	106	0	14	0	120	0	A	G	0.11667
+THYROID	chr1	300	2	0	2	76	80	1	T	G	0.025
+THYROID	chr1	310	12	0	12	76	100	3	T	G	0.12
+THYROID	chr1	320	12	0	12	56	80	3	T	A	0.15
+THYROID	chr1	330	7	0	7	66	80	0	T	G	0.0875
+THYROID	chr1	340	1	0	1	98	100	1	T	G	0.01
+THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11
+THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2
+THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02
+THYROID	chr1	420	104	0	0	0	104	1	A	.	0.0
+THYROID	chr1	430	30	0	0	0	30	1	A	.	0.0
+THYROID	chr1	440	30	0	0	0	30	1	A	.	0.0
+THYROID	27	1234567890	29	0	0	0	29	1	A	.	0.0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/artificial.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,72 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-r 1 -f 10 -c 10"
+##comment="This is a test set of made-up sites, each created in order to test certain functionality. It's meant to be run with -f 10 -c 10"
+##fileDate=19700101
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	THYROID
+# General note: the only data made consistent is the CHROM, POS, REF, ALT, and the variant data (after the ':'). The other stuff isn't supposed to be consistent.
+# Simplest case, but POS 0 and no minor allele
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,
+# Simple, normal cases of A/G variants: above/below threshold x strand bias/no strand bias (2 x 2 = 4 cases)
+chr1	10	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,-A=15,-G=1,
+chr1	20	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,-A=16,
+chr1	30	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=10,+G=2,-A=11,-G=2,
+chr1	40	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+G=3,-A=11,
+# Same 4 cases, but with minor allele = N
+chr1	50	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+N=1,-A=15,-N=1,
+chr1	60	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+N=1,-A=16,
+chr1	70	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=10,+N=2,-A=11,-N=2,
+chr1	80	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+N=3,-A=11,
+# Same 4 cases, but with an additional noncanonical minor allele d1
+chr1	82	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,+d1=1,-A=15,-G=1,-d1=1,
+chr1	84	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,+d1=1,-A=16,
+chr1	86	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=10,+G=2,+d1=1,-A=11,-G=2,-d1=1,
+chr1	88	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+G=3,+d1=1,-A=11,
+# Same 4 cases, but with minor allele = d1 (non-canonical)
+chr1	90	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+d1=1,-A=15,-d1=1,
+chr1	100	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+d1=1,-A=16,
+chr1	110	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=10,+d1=2,-A=11,-d1=2,
+chr1	120	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+d1=3,-A=11,
+# Same 4 cases, but with MAJOR allele = d1
+chr1	130	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,+G=1,-d1=15,-G=1,
+chr1	140	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,+G=1,-d1=16,
+chr1	150	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=10,+G=2,-d1=11,-G=2,
+chr1	160	.	d1	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=11,+G=3,-d1=11,
+# Test edge cases where freq == freq_thres and/or covg == covg_thres
+chr1	200	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=9,-A=9,
+chr1	210	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=10,-A=10,
+chr1	220	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,-A=11,
+chr1	230	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=91,+G=9,-A=91,-G=9,
+chr1	240	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=90,+G=10,-A=90,-G=10,
+chr1	250	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=89,+G=11,-A=89,-G=11,
+# Test case where minor allele is above threshold on only one strand because of different coverage. Also, a long decimal minor allele frequency.
+chr1	260	.	A	G	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=13,+G=7,-A=93,-G=7,
+# Test case where minor alleles have equal frequency: Above/below threshold, +/- strand bias
+chr1	300	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=1,+G=1,+T=38,-A=1,-G=1,-T=38,
+chr1	310	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=38,
+chr1	320	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=18,
+chr1	330	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=6,+G=6,+T=38,-A=1,-G=1,-T=28,
+chr1	340	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=1,+T=80,-G=1,-T=18,
+chr1	350	.	T	G,A	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=11,+T=60,-G=11,-T=18,
+# Case where + and - variants are interleaved with each other. Also, a long decimal result for the minor allele frequency.
+chr1	400	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=16,-A=16,+G=4,-G=4,
+# Test complex data in the ALT, INFO, and sample (before the ':') columns
+chr1	410	.	T	G,A	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=16,-G=2,
+chr1	420	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:::+A=82,-A=22,
+# Test some other types of noncanonical variants (tie for 2nd and not)
+chr1	430	.	A	N,GAA,d2	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+N=1,+d2=1,-A=15,-GAA=2
+chr1	440	.	A	N,GAA,d2	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+N=1,+d2=1,-A=15,-GAA=1
+# No canonical variants present
+chr1	450	.	A	d1	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,-d1=20,
+chr1	460	.	A	d1,N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+d1=15,+N=2,-d1=20,-N=2,
+# Catch some divide by zero errors
+chr1	470	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=0,
+# Test an unusual CHROM value and a long POS value
+27	1234567890	.	A	N	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+N=1,-A=14,
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/compute-site.py	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,161 @@
+#!/usr/bin/python
+import os
+import sys
+from optparse import OptionParser
+
+OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0}
+BASES = ['A', 'C', 'G', 'T']
+USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n"
+    +"       cat variants.txt > %prog (options)")
+
+def main():
+
+  parser = OptionParser(usage=USAGE)
+  parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
+    default=OPT_DEFAULTS.get('freq_thres'),
+    help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
+      +'= 1% frequency. Default is %default%.'))
+  parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
+    default=OPT_DEFAULTS.get('covg_thres'),
+    help=('Coverage threshold. Each site must be supported by at least this '
+      +'many reads on each strand. Otherwise the site will not be printed in '
+      +'the output. The default is %default reads per strand.'))
+  (options, args) = parser.parse_args()
+  freq_thres = options.freq_thres
+  covg_thres = options.covg_thres
+
+  if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]:
+    script_name = os.path.basename(sys.argv[0])
+    print """USAGE:
+  $ """+script_name+""" [sample column text]
+  $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,'
+  $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
+  $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
+Or invoke with no arguments to use interactively. It will read from stdin, so
+just paste one sample per line."""
+    sys.exit(0)
+  
+  if len(args) > 0:
+    stdin = False
+    samples = args
+  else:
+    stdin = True
+    samples = sys.stdin
+    print "Reading from standard input.."
+
+  for sample in samples:
+    print ''
+    sample = sample.split(':')[-1]
+    if not sample:
+      continue
+    print sample
+    counts_dict = parse_counts(sample)
+    compute_stats(counts_dict, freq_thres, covg_thres)
+  
+
+def parse_counts(sample_str):
+  counts_dict = {}
+  counts = sample_str.split(',')
+  for count in counts:
+    if '=' not in count:
+      continue
+    (var, reads) = count.split('=')
+    if var[1:] in BASES:
+      counts_dict[var] = int(reads)
+  return counts_dict
+
+
+def compute_stats(counts_dict, freq_thres, covg_thres):
+  # totals for A, C, G, T
+  counts_unstranded = {}
+  for base in BASES:
+    counts_unstranded[base] = 0
+    for strand in '+-':
+      counts_unstranded[base] += counts_dict.get(strand+base, 0)
+  print '+- '+str(counts_unstranded)
+
+  # get counts for each strand
+  plus_counts  = get_stranded_counts(counts_dict, '+')
+  minus_counts = get_stranded_counts(counts_dict, '-')
+  counts_lists = {'+':plus_counts, '-':minus_counts}
+  print ' + '+str(plus_counts)
+  print ' - '+str(minus_counts)
+
+  # stranded coverage threshold
+  coverages = {}
+  failing_strands = {}
+  for strand in '+-':
+    coverages[strand] = 0
+    for count in counts_lists[strand].values():
+      coverages[strand] += count
+    if coverages[strand] < covg_thres:
+      failing_strands[strand] = coverages[strand]
+    sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t")
+  coverages['+-'] = coverages['+'] + coverages['-']
+  sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n")
+
+  if failing_strands:
+    for strand in failing_strands:
+      print ('coverage on '+strand+' strand too low ('
+        +str(failing_strands[strand])+' < '+str(covg_thres)+")")
+    return
+
+  # apply frequency threshold
+  for strand in counts_lists:
+    strand_counts = counts_lists[strand]
+    for variant in strand_counts.keys():
+      # print (variant+" freq: "+str(strand_counts[variant])+"/"
+      #   +str(coverages[strand])+" = "
+      #   +str(strand_counts[variant]/float(coverages[strand])))
+      if strand_counts[variant]/float(coverages[strand]) < freq_thres:
+        strand_counts.pop(variant)
+  plus_variants  = sorted(plus_counts.keys())
+  minus_variants = sorted(minus_counts.keys())
+  if plus_variants == minus_variants:
+    strand_bias = False
+    print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants)
+    sys.stdout.write("alleles: "+str(len(plus_variants))+"\t")
+  else:
+    strand_bias = True
+    print "   strand bias: +"+str(plus_variants)+" != -"+str(minus_variants)
+    sys.stdout.write("alleles: 0\t")
+
+  variants_sorted = sort_variants(counts_unstranded)
+  if len(variants_sorted) >= 1:
+    sys.stdout.write("major: "+variants_sorted[0]+"\t")
+  minor = '.'
+  if len(variants_sorted) == 2:
+    minor = variants_sorted[1]
+  elif len(variants_sorted) > 2:
+    if (counts_unstranded.get(variants_sorted[1]) ==
+        counts_unstranded.get(variants_sorted[2])):
+      minor = 'N'
+    else:
+      minor = variants_sorted[1]
+
+  sys.stdout.write("minor: "+minor+"\tfreq: ")
+  print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5)
+
+
+def get_stranded_counts(unstranded_counts, strand):
+  stranded_counts = {}
+  for variant in unstranded_counts:
+    if variant[0] == strand:
+      stranded_counts[variant[1:]] = unstranded_counts[variant]
+  return stranded_counts
+
+def sort_variants(variant_counts):
+  """Sort the list of variants based on their counts. Returns a list of just
+  the variants, no counts."""
+  variants = variant_counts.keys()
+  var_del = []
+  for variant in variants:
+    if variant_counts.get(variant) == 0:
+      var_del.append(variant)
+  for variant in var_del:
+    variants.remove(variant)
+  variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0))
+  return variants
+
+if __name__ == "__main__":
+  main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/errors.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,25 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-f 10 -c 10"
+##comment="This is a test set of made-up sites, each created in order to test a certain type of error handling. It's meant to be run with -f 10 -c 10"
+##fileDate=19700101
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	THYROID
+# A correct control
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,
+# List some bases more than once
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,+A=12,
+# Comma omissions
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15-A=15,
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1,+C=1,-A=15,-G=1,
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,+G=1+C=1,-A=15,-G=1,
+# Wrong number of sample columns
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:1:1:+A=15,-A=15,	0/0:1:1:+A=15,-A=15,
+chr1	0	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real-mit-s.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,12 @@
+#SAMPLE	CHR	POS	+A	+C	+G	+T	-A	-C	-G	-T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
+S1	chrM	2000	1	9095	1	0	7	5808	0	1	14913	1	C	A	0.00054
+S3	chrM	2000	0	7933	0	4	10	5242	1	2	13192	1	C	A	0.00076
+S1	chrM	3000	17399	7	22	8	10567	35	22	4	28064	0	A	G	0.00157
+S2	chrM	3000	12535	3	24	2	7937	13	12	2	20528	1	A	G	0.00175
+S3	chrM	3000	18981	7	29	6	11286	33	17	4	30363	0	A	G	0.00152
+S4	chrM	3000	9254	1	15	2	6240	16	14	1	15543	0	A	G	0.00187
+S1	chrM	4000	6134	2	1	3	6124	1	1	1	12267	1	A	T	0.00033
+S1	chrM	7000	0	17	1	6216	4	9	2	7529	13778	0	T	C	0.00189
+S2	chrM	7000	0	7	2	5104	0	9	4	6288	11414	1	T	C	0.0014
+S3	chrM	7000	0	9	0	6446	4	4	10	7506	13979	1	T	C	0.00093
+S3	chrM	8000	3	1	5023	1	1	0	5043	2	10074	1	G	A	0.0004
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real-mit-s.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,28 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-f 0.2 -c 5000 -H -s"
+##fileDate=20130521
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_2536.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	S1	S2	S3	S4
+chrM	1000	.	ACTC	TCTC,CCTC,GCTC,ATC,A	.	.	AC=188032,682,8,8,1;AF=0.995310134556,0.00361003186568,4.23464148467e-05,4.23464148467e-05,5.29330185583e-06	GT:AC:AF:NC	1/1:12546,51,0,0,0:0.994688020297,0.00404344723698,0,0,0:+A=2,+C=20,+T=8020,-A=14,-C=31,-T=4526,	1/1:9970,38,1,1,0:0.995208624476,0.00379317228988,9.98203234178e-05,9.98203234178e-05,0:+A=1,+d1=1,+C=21,+T=6546,+G=1,-A=7,-C=17,-T=3424,	1/1:11372,35,0,2,0:0.995448179272,0.0030637254902,0,0.000175070028011,0:+d1=2,+C=10,+T=7199,-A=15,-C=25,-T=4173,	1/1:9115,39,0,0,0:0.995304651671,0.00425857174055,0,0,0:+C=13,+T=5817,-A=4,-C=26,-T=3298,
+chrM	2000	.	TAC	CAC,AAC,GAC,T,TC,TGAC	.	.	AC=209371,114,8,2,2,1;AF=0.999083807733,0.000543989158392,3.81746777819e-05,9.54366944547e-06,9.54366944547e-06,4.77183472273e-06	GT:AC:AF:NC	1/1:14903,8,1,0,0,0:0.999329444109,0.000536444712667,6.70555890834e-05,0,0,0:+A=1,+C=9095,+G=1,-A=7,-C=5808,-T=1,	1/1:11838,3,0,0,0,0:0.999324666554,0.000253250042208,0,0,0,0:+C=7242,+T=4,-A=3,-C=4596,-T=1,	1/1:13175,10,1,0,0,0:0.998711340206,0.000758035172832,7.58035172832e-05,0,0,0:+C=7933,+T=4,-A=10,-C=5242,-T=2,-G=1,	1/1:10921,3,2,0,0,1:0.999176578225,0.000274473924977,0.000182982616651,0,0,9.14913083257e-05:+A=1,+TG=1,+C=6732,+T=1,-A=2,-C=4189,-T=2,-G=2,
+chrM	3000	.	TC	AC,GC,CC,T,TGC	.	.	AC=386813,649,441,35,33;AF=0.996740346013,0.00167234421946,0.00113636949273,9.01880549786e-05,8.5034451837e-05	GT:AC:AF:NC	1/1:27966,44,42,5,6:0.996117542297,0.00156723063224,0.00149599287622,0.000178094390027,0.000213713268032:+A=17399,+d1=1,+C=7,+T=8,+G=22,-A=10567,-d1=4,-C=35,-G=22,-T=4,-TG=6,	1/1:20472,36,16,0,0:0.997272018706,0.00175370226033,0.000779423226812,0,0:+A=12535,+C=3,+T=2,+G=24,-A=7937,-C=13,-T=2,-G=12,	1/1:30267,46,40,5,3:0.996575680748,0.00151460274604,0.00131704586612,0.000164630733265,9.87784399592e-05:+A=18981,+d1=2,+C=7,+T=6,+G=29,-A=11286,-d1=3,-C=33,-G=17,-T=4,-TG=3,	1/1:15494,29,17,2,3:0.996526884487,0.00186519166452,0.00109338821713,0.000128633907898,0.000192950861847:+A=9254,+d1=2,+C=1,+T=2,+G=15,-A=6240,-TG=3,-C=16,-T=1,-G=14,
+chrM	4000	.	TA	AA,GA,CA,T,TCA,TTA	.	.	AC=138825,77,37,9,8,3;AF=0.998511134127,0.000553829334254,0.000266125783992,6.47332988089e-05,5.75407100524e-05,2.15777662696e-05	GT:AC:AF:NC	1/1:12258,2,3,1,1,2:0.998940591639,0.00016298590172,0.000244478852579,8.14929508598e-05,8.14929508598e-05,0.00016298590172:+A=6134,+d1=1,+C=2,+G=1,+TT=1,+T=3,+TC=1,-A=6124,-C=1,-TT=1,-G=1,-T=1,	1/1:8934,6,3,0,0,0:0.998546998994,0.000670615848888,0.000335307924444,0,0,0:+A=4382,+T=3,+G=5,-A=4552,-C=3,-T=1,-G=1,	1/1:10135,4,1,2,1,0:0.999014292755,0.000394282897979,9.85707244948e-05,0.00019714144899,9.85707244948e-05,0:+A=4966,+d1=1,+TC=1,+G=3,-A=5169,-d1=1,-C=1,-T=2,-G=1,	1/1:7342,4,1,0,0,0:0.999183451279,0.000544365813827,0.000136091453457,0,0,0:+A=3711,+T=1,-A=3631,-C=1,-G=4,
+chrM	5000	.	A	G,T,C	.	.	AC=28,27,10;AF=0.000251195421066,0.000242224156028,8.97126503808e-05	GT:AC:AF:NC	0/0:2,1,0:0.000190912562047,9.54562810233e-05,0:+A=3664,+T=1,-A=6809,-G=2,	0/0:7,1,2:0.000987724001693,0.000141103428813,0.000282206857627:+A=2337,+C=1,+T=1,+G=2,-A=4740,-C=1,-G=5,	0/0:2,3,0:0.00022411474675,0.000336172120126,0:+A=3194,+T=1,-A=5725,-T=2,-G=2,	0/0:1,1,0:0.000169865806013,0.000169865806013,0:+A=2022,+T=1,-A=3863,-G=1,
+chrM	6000	.	T	C,A,TT,TCTTA,G	.	.	AC=118279,31,7,1,1;AF=0.999459199108,0.000261950432218,5.91500975977e-05,8.45001394252e-06,8.45001394252e-06	GT:AC:AF:NC	1/1:9612,2,0,0,1:0.999168399168,0.0002079002079,0,0,0.00010395010395:+C=4845,+T=3,-A=2,-C=4767,-T=2,-G=1,	1/1:7771,0,1,0,0:0.99987133299,0,0.000128667009779,0,0:+C=3682,-C=4089,-TT=1,	1/1:9152,1,0,0,0:0.999563128003,0.000109217999126,0,0,0:+C=4640,+T=1,-A=1,-C=4512,-T=2,	1/1:5500,0,1,0,0:0.99981821487,0,0.000181785129976,0,0:+C=2707,-C=2793,-TT=1,
+chrM	7000	.	GTA	TTA,CTA,ATA,GA,GACTACTA,G	.	.	AC=177915,270,72,20,3,1;AF=0.997370840434,0.00151358866266,0.000403623643376,0.000112117678716,1.68176518073e-05,5.60588393578e-06	GT:AC:AF:NC	1/1:13745,26,4,1,0,0:0.997532476958,0.0018869293853,0.000290296828507,7.25742071268e-05,0,0:+d1=1,+C=17,+T=6216,+G=1,-A=4,-C=9,-T=7529,-G=2,	1/1:11392,16,0,1,0,0:0.997985107315,0.00140166447657,0,8.76040297854e-05,0,0:+d1=1,+C=7,+T=5104,+G=2,-C=9,-T=6288,-G=4,	1/1:13952,13,4,0,0,0:0.998068531368,0.000929966378139,0.000286143500966,0,0,0:+C=9,+T=6446,-A=4,-C=4,-T=7506,-G=10,	1/1:7740,12,1,0,0,0:0.998194480268,0.0015475883415,0.000128965695125,0,0,0:+C=8,+T=3471,-A=1,-C=4,-T=4269,-G=1,
+chrM	8000	.	TGA	GGA,AGA,CGA,T	.	.	AC=132300,33,7,2;AF=0.999350384482,0.000249271070959,5.28756817186e-05,1.51073376339e-05	GT:AC:AF:NC	1/1:10337,2,0,2:0.999226679555,0.000193330111165,0,0.000193330111165:+A=1,+d2=1,+T=1,+G=4832,-A=1,-d2=1,-T=3,-G=5505,	1/1:8347,3,0,0:0.999640718563,0.000359281437126,0,0:+A=2,+G=4258,-A=1,-G=4089,	1/1:10066,4,1,0:0.999205876514,0.000397061743101,9.92654357753e-05,0:+A=3,+C=1,+T=1,+G=5023,-A=1,-T=2,-G=5043,	1/1:4891,2,3,0:0.99897875817,0.000408496732026,0.000612745098039,0:+A=1,+C=1,+G=1746,-A=1,-C=2,-G=3145,
+chrM	9000	.	TACGC	AACGC,GACGC,CACGC,T,TCGACGC	.	.	AC=125089,77,46,8,2;AF=0.998842167463,0.000614849002667,0.000367312391204,6.38804158615e-05,1.59701039654e-05	GT:AC:AF:NC	1/1:10775,9,6,1,0:0.998517282921,0.000834028356964,0.000556018904643,9.26698174405e-05,0:+A=6306,+G=1,-A=4469,-C=6,-d4=1,-G=8,	1/1:8081,7,2,0,0:0.998764058831,0.00086515881844,0.00024718823384,0,0:+A=4777,+T=1,+G=3,-A=3304,-C=2,-G=4,	1/1:9475,4,3,2,0:0.998735111205,0.000421629598398,0.000316222198798,0.000210814799199,0:+A=5435,+G=1,-A=4040,-C=3,-d4=2,-T=3,-G=3,	1/1:5709,3,1,0,0:0.999299842465,0.000525118151584,0.000175039383861,0,0:+A=3234,+G=1,-A=2475,-C=1,-G=2,
+#chrM	10000	.	AGT	GGT,TGT,AAGT,ACGT,A,CGT,AAGAGT,AACAGT,AT	.	.	AC=88942,145,51,12,7,6,5,2,1;AF=0.997051734768,0.00162546942436,0.000571716832016,0.000134521607533,7.84709377277e-05,6.72608037666e-05,5.60506698055e-05,2.24202679222e-05,1.12101339611e-05	GT:AC:AF:NC	1/1:7632,14,4,3,1,1,0,0,0:0.996735013713,0.00182839232075,0.000522397805929,0.000391798354447,0.000130599451482,0.000130599451482,0,0,0:+A=1,+C=1,+AC=3,+G=2535,+T=11,-A=1,-AA=4,-d2=1,-T=3,-G=5097,	1/1:6053,5,2,1,0,0,1,0,0:0.998186015831,0.000824538258575,0.00032981530343,0.000164907651715,0,0,0.000164907651715,0,0:+AC=1,+T=5,+G=1674,-A=2,-AA=2,-G=4379,-AAGA=1,	1/1:7012,14,6,0,2,0,0,0,0:0.996730632552,0.00199004975124,0.000852878464819,0,0.000284292821606,0,0,0,0:+d2=1,+T=11,+G=2241,-A=1,-AA=6,-d2=1,-T=3,-G=4771,	1/1:4646,1,2,0,0,0,0,0,0:0.998710232158,0.000214961306965,0.000429922613929,0,0,0,0,0,0:+A=1,+T=1,+G=1337,-A=2,-AA=2,-G=3309,
+#chrM	11000	.	CCA	ACA,GCA,TCA,CA,C	.	.	AC=54,44,36,3,2;AF=0.000362343152385,0.00029524256861,0.00024156210159,2.01301751325e-05,1.3420116755e-05	GT:AC:AF:NC	0/0:7,3,1,0,0:0.000521920668058,0.000223680286311,7.45600954369e-05,0,0:+C=10229,+T=1,-A=7,-C=3172,-G=3,	0/0:3,4,2,0,0:0.000368233705659,0.000490978274211,0.000245489137106,0,0:+A=2,+C=6575,+T=2,+G=1,-A=1,-C=1563,-G=3,	0/0:9,3,2,0,0:0.000811615114077,0.000270538371359,0.000180358914239,0,0:+A=1,+C=7906,+T=2,-A=8,-C=3169,-G=3,	0/0:0,2,3,1,0:0,0.000220507166483,0.000330760749724,0.000110253583241,0:+d1=1,+C=7066,+T=2,-C=1998,-T=1,-G=2,
+#chrM	12000	.	AC	CC,TC,GC,A	.	.	AC=182586,84,23,1;AF=0.999146337459,0.000459664444955,0.000125860502785,5.47219577328e-06	GT:AC:AF:NC	1/1:13592,8,0,0:0.999264813998,0.000588148801647,0,0:+A=2,+C=7113,+T=5,-C=6479,-T=3,	1/1:9868,1,0,0:0.999797365755,0.000101317122594,0,0:+C=5182,-A=1,-C=4686,-T=1,	1/1:11289,6,3,0:0.998938147067,0.000530926466684,0.000265463233342,0:+A=2,+C=6123,+T=3,+G=1,-A=1,-C=5166,-T=3,-G=2,	1/1:9708,4,0,0:0.999382334775,0.000411776816965,0,0:+A=1,+C=4892,+T=2,-A=1,-C=4816,-T=2,
+#chrM	13000	.	A	G,C,T	.	.	AC=56125,9,6;AF=0.999358985773,0.000160253556739,0.000106835704492	GT:AC:AF:NC	1/1:3816,0,0:0.999214454046,0,0:+A=1,+G=1319,-A=2,-G=2497,	1/1:2814,1,0:0.999289772727,0.000355113636364,0:+G=986,-A=1,-C=1,-G=1828,	1/1:3479,0,0:0.999712643678,0,0:+A=1,+G=1285,-G=2194,	1/1:2679,0,0:1.0,0,0:+G=979,-G=1700,
+#chrM	14000	.	CTAA	TTAA,GTAA,CAA,ATAA,C	.	.	AC=147853,32,22,21,1;AF=0.997766290558,0.000215947740647,0.000148464071695,0.000141715704799,6.74836689521e-06	GT:AC:AF:NC	1/1:10992,3,2,4,0:0.996825972613,0.000272059490342,0.000181372993561,0.000362745987123,0:+A=1,+d1=1,+C=19,+T=7038,-A=3,-d1=1,-C=7,-T=3954,-G=3,	1/1:6853,2,2,0,0:0.997525473071,0.000291120815138,0.000291120815138,0,0:+d1=2,+C=9,+T=4609,-C=4,-T=2244,-G=2,	1/1:9277,2,0,5,0:0.998063474987,0.000215169445939,0,0.000537923614847,0:+A=2,+C=9,+T=5786,-A=3,-C=2,-T=3491,-G=2,	1/1:8125,1,3,2,0:0.997422047631,0.000122759636631,0.000368278909894,0.000245519273263,0:+d1=2,+C=9,+T=5241,-A=2,-d1=1,-C=6,-T=2884,-G=1,
+#chrM	15000	.	AA	GA,TA,CA,A	.	.	AC=132,29,29,4;AF=0.000611470633196,0.000134338245172,0.000134338245172,1.85294131272e-05	GT:AC:AF:NC	0/0:15,2,0,0:0.000956510649152,0.00012753475322,0,0:+A=8715,+T=1,+G=5,-A=6950,-T=1,-G=10,	0/0:13,1,0,0:0.00121985549404,9.38350380032e-05,0,0:+A=6102,+T=1,+G=6,-A=4541,-G=7,	0/0:1,2,5,1:7.86101721563e-05,0.000157220344313,0.000393050860781,7.86101721563e-05:+A=7378,+C=5,+T=2,-A=5334,-d1=1,-G=1,	0/0:6,0,0,0:0.000532765050613,0,0,0:+A=6215,+G=4,-A=5041,-G=2,
+#chrM	16000	.	AG	GG,TG,CG,A	.	.	AC=224750,90,3,1;AF=0.999244175707,0.000400142272808,1.33380757603e-05,4.44602525342e-06	GT:AC:AF:NC	1/1:15911,2,0,0:0.99949745587,0.000125636032414,0,0:+A=3,+T=1,+G=7156,-A=3,-T=1,-G=8755,	1/1:11311,2,0,0:0.999381516169,0.000176709666019,0,0:+A=3,+G=4973,-A=2,-T=2,-G=6338,	1/1:14157,2,0,0:0.999012066897,0.000141133300402,0,0:+A=5,+T=1,+G=6766,-A=7,-T=1,-G=7391,	1/1:11039,3,0,0:0.999185372918,0.000271542360608,0,0:+A=4,+T=1,+G=4959,-A=2,-T=2,-G=6080,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real-mit.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,12 @@
+#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
+S1	chrM	2000	8	14903	1	1	14913	1	C	A	0.00054
+S3	chrM	2000	10	13175	1	6	13192	1	C	A	0.00076
+S1	chrM	3000	27966	42	44	12	28064	0	A	G	0.00157
+S2	chrM	3000	20472	16	36	4	20528	1	A	G	0.00175
+S3	chrM	3000	30267	40	46	10	30363	0	A	G	0.00152
+S4	chrM	3000	15494	17	29	3	15543	0	A	G	0.00187
+S1	chrM	4000	12258	3	2	4	12267	1	A	T	0.00033
+S1	chrM	7000	4	26	3	13745	13778	0	T	C	0.00189
+S2	chrM	7000	0	16	6	11392	11414	1	T	C	0.0014
+S3	chrM	7000	4	13	10	13952	13979	1	T	C	0.00093
+S3	chrM	8000	4	1	10066	3	10074	1	G	A	0.0004
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real-mit.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,28 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-f 0.2 -c 5000 -H"
+##fileDate=20130521
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_2536.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	S1	S2	S3	S4
+chrM	1000	.	ACTC	TCTC,CCTC,GCTC,ATC,A	.	.	AC=188032,682,8,8,1;AF=0.995310134556,0.00361003186568,4.23464148467e-05,4.23464148467e-05,5.29330185583e-06	GT:AC:AF:NC	1/1:12546,51,0,0,0:0.994688020297,0.00404344723698,0,0,0:+A=2,+C=20,+T=8020,-A=14,-C=31,-T=4526,	1/1:9970,38,1,1,0:0.995208624476,0.00379317228988,9.98203234178e-05,9.98203234178e-05,0:+A=1,+d1=1,+C=21,+T=6546,+G=1,-A=7,-C=17,-T=3424,	1/1:11372,35,0,2,0:0.995448179272,0.0030637254902,0,0.000175070028011,0:+d1=2,+C=10,+T=7199,-A=15,-C=25,-T=4173,	1/1:9115,39,0,0,0:0.995304651671,0.00425857174055,0,0,0:+C=13,+T=5817,-A=4,-C=26,-T=3298,
+chrM	2000	.	TAC	CAC,AAC,GAC,T,TC,TGAC	.	.	AC=209371,114,8,2,2,1;AF=0.999083807733,0.000543989158392,3.81746777819e-05,9.54366944547e-06,9.54366944547e-06,4.77183472273e-06	GT:AC:AF:NC	1/1:14903,8,1,0,0,0:0.999329444109,0.000536444712667,6.70555890834e-05,0,0,0:+A=1,+C=9095,+G=1,-A=7,-C=5808,-T=1,	1/1:11838,3,0,0,0,0:0.999324666554,0.000253250042208,0,0,0,0:+C=7242,+T=4,-A=3,-C=4596,-T=1,	1/1:13175,10,1,0,0,0:0.998711340206,0.000758035172832,7.58035172832e-05,0,0,0:+C=7933,+T=4,-A=10,-C=5242,-T=2,-G=1,	1/1:10921,3,2,0,0,1:0.999176578225,0.000274473924977,0.000182982616651,0,0,9.14913083257e-05:+A=1,+TG=1,+C=6732,+T=1,-A=2,-C=4189,-T=2,-G=2,
+chrM	3000	.	TC	AC,GC,CC,T,TGC	.	.	AC=386813,649,441,35,33;AF=0.996740346013,0.00167234421946,0.00113636949273,9.01880549786e-05,8.5034451837e-05	GT:AC:AF:NC	1/1:27966,44,42,5,6:0.996117542297,0.00156723063224,0.00149599287622,0.000178094390027,0.000213713268032:+A=17399,+d1=1,+C=7,+T=8,+G=22,-A=10567,-d1=4,-C=35,-G=22,-T=4,-TG=6,	1/1:20472,36,16,0,0:0.997272018706,0.00175370226033,0.000779423226812,0,0:+A=12535,+C=3,+T=2,+G=24,-A=7937,-C=13,-T=2,-G=12,	1/1:30267,46,40,5,3:0.996575680748,0.00151460274604,0.00131704586612,0.000164630733265,9.87784399592e-05:+A=18981,+d1=2,+C=7,+T=6,+G=29,-A=11286,-d1=3,-C=33,-G=17,-T=4,-TG=3,	1/1:15494,29,17,2,3:0.996526884487,0.00186519166452,0.00109338821713,0.000128633907898,0.000192950861847:+A=9254,+d1=2,+C=1,+T=2,+G=15,-A=6240,-TG=3,-C=16,-T=1,-G=14,
+chrM	4000	.	TA	AA,GA,CA,T,TCA,TTA	.	.	AC=138825,77,37,9,8,3;AF=0.998511134127,0.000553829334254,0.000266125783992,6.47332988089e-05,5.75407100524e-05,2.15777662696e-05	GT:AC:AF:NC	1/1:12258,2,3,1,1,2:0.998940591639,0.00016298590172,0.000244478852579,8.14929508598e-05,8.14929508598e-05,0.00016298590172:+A=6134,+d1=1,+C=2,+G=1,+TT=1,+T=3,+TC=1,-A=6124,-C=1,-TT=1,-G=1,-T=1,	1/1:8934,6,3,0,0,0:0.998546998994,0.000670615848888,0.000335307924444,0,0,0:+A=4382,+T=3,+G=5,-A=4552,-C=3,-T=1,-G=1,	1/1:10135,4,1,2,1,0:0.999014292755,0.000394282897979,9.85707244948e-05,0.00019714144899,9.85707244948e-05,0:+A=4966,+d1=1,+TC=1,+G=3,-A=5169,-d1=1,-C=1,-T=2,-G=1,	1/1:7342,4,1,0,0,0:0.999183451279,0.000544365813827,0.000136091453457,0,0,0:+A=3711,+T=1,-A=3631,-C=1,-G=4,
+chrM	5000	.	A	G,T,C	.	.	AC=28,27,10;AF=0.000251195421066,0.000242224156028,8.97126503808e-05	GT:AC:AF:NC	0/0:2,1,0:0.000190912562047,9.54562810233e-05,0:+A=3664,+T=1,-A=6809,-G=2,	0/0:7,1,2:0.000987724001693,0.000141103428813,0.000282206857627:+A=2337,+C=1,+T=1,+G=2,-A=4740,-C=1,-G=5,	0/0:2,3,0:0.00022411474675,0.000336172120126,0:+A=3194,+T=1,-A=5725,-T=2,-G=2,	0/0:1,1,0:0.000169865806013,0.000169865806013,0:+A=2022,+T=1,-A=3863,-G=1,
+chrM	6000	.	T	C,A,TT,TCTTA,G	.	.	AC=118279,31,7,1,1;AF=0.999459199108,0.000261950432218,5.91500975977e-05,8.45001394252e-06,8.45001394252e-06	GT:AC:AF:NC	1/1:9612,2,0,0,1:0.999168399168,0.0002079002079,0,0,0.00010395010395:+C=4845,+T=3,-A=2,-C=4767,-T=2,-G=1,	1/1:7771,0,1,0,0:0.99987133299,0,0.000128667009779,0,0:+C=3682,-C=4089,-TT=1,	1/1:9152,1,0,0,0:0.999563128003,0.000109217999126,0,0,0:+C=4640,+T=1,-A=1,-C=4512,-T=2,	1/1:5500,0,1,0,0:0.99981821487,0,0.000181785129976,0,0:+C=2707,-C=2793,-TT=1,
+chrM	7000	.	GTA	TTA,CTA,ATA,GA,GACTACTA,G	.	.	AC=177915,270,72,20,3,1;AF=0.997370840434,0.00151358866266,0.000403623643376,0.000112117678716,1.68176518073e-05,5.60588393578e-06	GT:AC:AF:NC	1/1:13745,26,4,1,0,0:0.997532476958,0.0018869293853,0.000290296828507,7.25742071268e-05,0,0:+d1=1,+C=17,+T=6216,+G=1,-A=4,-C=9,-T=7529,-G=2,	1/1:11392,16,0,1,0,0:0.997985107315,0.00140166447657,0,8.76040297854e-05,0,0:+d1=1,+C=7,+T=5104,+G=2,-C=9,-T=6288,-G=4,	1/1:13952,13,4,0,0,0:0.998068531368,0.000929966378139,0.000286143500966,0,0,0:+C=9,+T=6446,-A=4,-C=4,-T=7506,-G=10,	1/1:7740,12,1,0,0,0:0.998194480268,0.0015475883415,0.000128965695125,0,0,0:+C=8,+T=3471,-A=1,-C=4,-T=4269,-G=1,
+chrM	8000	.	TGA	GGA,AGA,CGA,T	.	.	AC=132300,33,7,2;AF=0.999350384482,0.000249271070959,5.28756817186e-05,1.51073376339e-05	GT:AC:AF:NC	1/1:10337,2,0,2:0.999226679555,0.000193330111165,0,0.000193330111165:+A=1,+d2=1,+T=1,+G=4832,-A=1,-d2=1,-T=3,-G=5505,	1/1:8347,3,0,0:0.999640718563,0.000359281437126,0,0:+A=2,+G=4258,-A=1,-G=4089,	1/1:10066,4,1,0:0.999205876514,0.000397061743101,9.92654357753e-05,0:+A=3,+C=1,+T=1,+G=5023,-A=1,-T=2,-G=5043,	1/1:4891,2,3,0:0.99897875817,0.000408496732026,0.000612745098039,0:+A=1,+C=1,+G=1746,-A=1,-C=2,-G=3145,
+chrM	9000	.	TACGC	AACGC,GACGC,CACGC,T,TCGACGC	.	.	AC=125089,77,46,8,2;AF=0.998842167463,0.000614849002667,0.000367312391204,6.38804158615e-05,1.59701039654e-05	GT:AC:AF:NC	1/1:10775,9,6,1,0:0.998517282921,0.000834028356964,0.000556018904643,9.26698174405e-05,0:+A=6306,+G=1,-A=4469,-C=6,-d4=1,-G=8,	1/1:8081,7,2,0,0:0.998764058831,0.00086515881844,0.00024718823384,0,0:+A=4777,+T=1,+G=3,-A=3304,-C=2,-G=4,	1/1:9475,4,3,2,0:0.998735111205,0.000421629598398,0.000316222198798,0.000210814799199,0:+A=5435,+G=1,-A=4040,-C=3,-d4=2,-T=3,-G=3,	1/1:5709,3,1,0,0:0.999299842465,0.000525118151584,0.000175039383861,0,0:+A=3234,+G=1,-A=2475,-C=1,-G=2,
+#chrM	10000	.	AGT	GGT,TGT,AAGT,ACGT,A,CGT,AAGAGT,AACAGT,AT	.	.	AC=88942,145,51,12,7,6,5,2,1;AF=0.997051734768,0.00162546942436,0.000571716832016,0.000134521607533,7.84709377277e-05,6.72608037666e-05,5.60506698055e-05,2.24202679222e-05,1.12101339611e-05	GT:AC:AF:NC	1/1:7632,14,4,3,1,1,0,0,0:0.996735013713,0.00182839232075,0.000522397805929,0.000391798354447,0.000130599451482,0.000130599451482,0,0,0:+A=1,+C=1,+AC=3,+G=2535,+T=11,-A=1,-AA=4,-d2=1,-T=3,-G=5097,	1/1:6053,5,2,1,0,0,1,0,0:0.998186015831,0.000824538258575,0.00032981530343,0.000164907651715,0,0,0.000164907651715,0,0:+AC=1,+T=5,+G=1674,-A=2,-AA=2,-G=4379,-AAGA=1,	1/1:7012,14,6,0,2,0,0,0,0:0.996730632552,0.00199004975124,0.000852878464819,0,0.000284292821606,0,0,0,0:+d2=1,+T=11,+G=2241,-A=1,-AA=6,-d2=1,-T=3,-G=4771,	1/1:4646,1,2,0,0,0,0,0,0:0.998710232158,0.000214961306965,0.000429922613929,0,0,0,0,0,0:+A=1,+T=1,+G=1337,-A=2,-AA=2,-G=3309,
+#chrM	11000	.	CCA	ACA,GCA,TCA,CA,C	.	.	AC=54,44,36,3,2;AF=0.000362343152385,0.00029524256861,0.00024156210159,2.01301751325e-05,1.3420116755e-05	GT:AC:AF:NC	0/0:7,3,1,0,0:0.000521920668058,0.000223680286311,7.45600954369e-05,0,0:+C=10229,+T=1,-A=7,-C=3172,-G=3,	0/0:3,4,2,0,0:0.000368233705659,0.000490978274211,0.000245489137106,0,0:+A=2,+C=6575,+T=2,+G=1,-A=1,-C=1563,-G=3,	0/0:9,3,2,0,0:0.000811615114077,0.000270538371359,0.000180358914239,0,0:+A=1,+C=7906,+T=2,-A=8,-C=3169,-G=3,	0/0:0,2,3,1,0:0,0.000220507166483,0.000330760749724,0.000110253583241,0:+d1=1,+C=7066,+T=2,-C=1998,-T=1,-G=2,
+#chrM	12000	.	AC	CC,TC,GC,A	.	.	AC=182586,84,23,1;AF=0.999146337459,0.000459664444955,0.000125860502785,5.47219577328e-06	GT:AC:AF:NC	1/1:13592,8,0,0:0.999264813998,0.000588148801647,0,0:+A=2,+C=7113,+T=5,-C=6479,-T=3,	1/1:9868,1,0,0:0.999797365755,0.000101317122594,0,0:+C=5182,-A=1,-C=4686,-T=1,	1/1:11289,6,3,0:0.998938147067,0.000530926466684,0.000265463233342,0:+A=2,+C=6123,+T=3,+G=1,-A=1,-C=5166,-T=3,-G=2,	1/1:9708,4,0,0:0.999382334775,0.000411776816965,0,0:+A=1,+C=4892,+T=2,-A=1,-C=4816,-T=2,
+#chrM	13000	.	A	G,C,T	.	.	AC=56125,9,6;AF=0.999358985773,0.000160253556739,0.000106835704492	GT:AC:AF:NC	1/1:3816,0,0:0.999214454046,0,0:+A=1,+G=1319,-A=2,-G=2497,	1/1:2814,1,0:0.999289772727,0.000355113636364,0:+G=986,-A=1,-C=1,-G=1828,	1/1:3479,0,0:0.999712643678,0,0:+A=1,+G=1285,-G=2194,	1/1:2679,0,0:1.0,0,0:+G=979,-G=1700,
+#chrM	14000	.	CTAA	TTAA,GTAA,CAA,ATAA,C	.	.	AC=147853,32,22,21,1;AF=0.997766290558,0.000215947740647,0.000148464071695,0.000141715704799,6.74836689521e-06	GT:AC:AF:NC	1/1:10992,3,2,4,0:0.996825972613,0.000272059490342,0.000181372993561,0.000362745987123,0:+A=1,+d1=1,+C=19,+T=7038,-A=3,-d1=1,-C=7,-T=3954,-G=3,	1/1:6853,2,2,0,0:0.997525473071,0.000291120815138,0.000291120815138,0,0:+d1=2,+C=9,+T=4609,-C=4,-T=2244,-G=2,	1/1:9277,2,0,5,0:0.998063474987,0.000215169445939,0,0.000537923614847,0:+A=2,+C=9,+T=5786,-A=3,-C=2,-T=3491,-G=2,	1/1:8125,1,3,2,0:0.997422047631,0.000122759636631,0.000368278909894,0.000245519273263,0:+d1=2,+C=9,+T=5241,-A=2,-d1=1,-C=6,-T=2884,-G=1,
+#chrM	15000	.	AA	GA,TA,CA,A	.	.	AC=132,29,29,4;AF=0.000611470633196,0.000134338245172,0.000134338245172,1.85294131272e-05	GT:AC:AF:NC	0/0:15,2,0,0:0.000956510649152,0.00012753475322,0,0:+A=8715,+T=1,+G=5,-A=6950,-T=1,-G=10,	0/0:13,1,0,0:0.00121985549404,9.38350380032e-05,0,0:+A=6102,+T=1,+G=6,-A=4541,-G=7,	0/0:1,2,5,1:7.86101721563e-05,0.000157220344313,0.000393050860781,7.86101721563e-05:+A=7378,+C=5,+T=2,-A=5334,-d1=1,-G=1,	0/0:6,0,0,0:0.000532765050613,0,0,0:+A=6215,+G=4,-A=5041,-G=2,
+#chrM	16000	.	AG	GG,TG,CG,A	.	.	AC=224750,90,3,1;AF=0.999244175707,0.000400142272808,1.33380757603e-05,4.44602525342e-06	GT:AC:AF:NC	1/1:15911,2,0,0:0.99949745587,0.000125636032414,0,0:+A=3,+T=1,+G=7156,-A=3,-T=1,-G=8755,	1/1:11311,2,0,0:0.999381516169,0.000176709666019,0,0:+A=3,+G=4973,-A=2,-T=2,-G=6338,	1/1:14157,2,0,0:0.999012066897,0.000141133300402,0,0:+A=5,+T=1,+G=6766,-A=7,-T=1,-G=7391,	1/1:11039,3,0,0:0.999185372918,0.000271542360608,0,0:+A=4,+T=1,+G=4959,-A=2,-T=2,-G=6080,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real-nofilt.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,15 @@
+#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
+THYROID	chr1	246704250	29	0	0	0	29	1	A	.	0.0
+THYROID	chr1	246704257	0	0	0	71	71	1	T	.	0.0
+THYROID	chr1	246704268	104	0	0	0	104	1	A	.	0.0
+THYROID	chr1	246704269	0	0	0	105	105	1	T	.	0.0
+THYROID	chr1	246704363	0	72	3	0	75	0	C	G	0.04
+THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704
+THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0
+THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096
+THYROID	chr1	246729215	1	0	1	88	90	0	T	G	0.01111
+THYROID	chr1	246729216	1	0	1	90	92	0	T	G	0.01087
+THYROID	chr1	246729378	16	7	0	0	23	0	A	C	0.30435
+THYROID	chr1	246729392	29	0	10	0	39	0	A	G	0.25641
+THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0
+THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real-nofilt.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,27 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-H -r 1 -f 0 -c 0"
+##comment="This is a test set of sites copied from real data."
+##fileDate=19700101
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	THYROID
+chr1	246704250	.	A	N	.	.	AC=1;AF=0.0333333333333	GT:AC:AF:NC	0/0:1:0.0333333333333:+A=15,+N=1,-A=14,
+chr1	246704257	.	T	N	.	.	AC=2;AF=0.027397260274	GT:AC:AF:NC	0/0:2:0.027397260274:+T=52,+N=2,-T=19,
+chr1	246704268	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:::+A=82,-A=22,
+chr1	246704269	.	T	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:::+T=83,-T=22,
+chr1	246704363	.	C	G,N	.	.	AC=3,1;AF=0.0394736842105,0.0131578947368	GT:AC:AF:NC	0/0:3,1:0.0394736842105,0.0131578947368:+C=52,+G=3,+N=1,-C=20,
+chr1	246704437	.	C	A,N	.	.	AC=5,1;AF=0.0367647058824,0.00735294117647	GT:AC:AF:NC	0/0:5,1:0.0367647058824,0.00735294117647:+C=72,-A=5,-C=58,-N=1,
+chr1	246707878	.	G	GA	.	.	AC=6;AF=0.043795620438	GT:AC:AF:NC	0/0:6:0.043795620438:+G=90,-G=41,-GA=6,
+chr1	246714587	.	G	A	.	.	AC=30;AF=0.41095890411	GT:AC:AF:NC	0/1:30:0.41095890411:+A=10,+G=2,-A=20,-G=41,
+chr1	246729215	.	T	G,A	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=7,-G=1,
+chr1	246729216	.	T	G,A	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=9,-G=1,
+chr1	246729378	.	A	C	.	.	AC=7;AF=0.304347826087	GT:AC:AF:NC	0/1:7:0.304347826087:-A=16,-C=7,
+chr1	246729392	.	A	G,AG	.	.	AC=10,1;AF=0.25,0.025	GT:AC:AF:NC	0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,
+chr7	91502881	.	TC	T	.	.	AC=3;AF=0.103448275862	GT:AC:AF:NC	0/0:3:0.103448275862:+T=14,-d1=3,-T=12,
+chr7	91502897	.	C	A	.	.	AC=7;AF=0.162790697674	GT:AC:AF:NC	0/0:7:0.162790697674:+A=7,+C=17,-C=19,
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real.csv.out	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,11 @@
+THYROID	chr1	246704250	29	0	0	0	29	1	A	.	0.0
+THYROID	chr1	246704257	0	0	0	71	71	1	T	.	0.0
+THYROID	chr1	246704268	104	0	0	0	104	1	A	.	0.0
+THYROID	chr1	246704269	0	0	0	105	105	1	T	.	0.0
+THYROID	chr1	246704363	0	72	3	0	75	0	C	G	0.04
+THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704
+THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0
+THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096
+THYROID	chr1	246729216	1	0	1	90	92	0	T	G	0.01087
+THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0
+THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/real.vcf.in	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,27 @@
+##fileformat=VCFv4.1
+##comment="ARGS=-r 1 -f 1 -c 10"
+##comment="This is a test set of sites copied from real data. It's meant to be run with -f 1 -c 10"
+##fileDate=19700101
+##source=Dan
+##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	THYROID
+chr1	246704250	.	A	N	.	.	AC=1;AF=0.0333333333333	GT:AC:AF:NC	0/0:1:0.0333333333333:+A=15,+N=1,-A=14,
+chr1	246704257	.	T	N	.	.	AC=2;AF=0.027397260274	GT:AC:AF:NC	0/0:2:0.027397260274:+T=52,+N=2,-T=19,
+chr1	246704268	.	A	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:::+A=82,-A=22,
+chr1	246704269	.	T	.	.	.	AC=;AF=	GT:AC:AF:NC	0/0:::+T=83,-T=22,
+chr1	246704363	.	C	G,N	.	.	AC=3,1;AF=0.0394736842105,0.0131578947368	GT:AC:AF:NC	0/0:3,1:0.0394736842105,0.0131578947368:+C=52,+G=3,+N=1,-C=20,
+chr1	246704437	.	C	A,N	.	.	AC=5,1;AF=0.0367647058824,0.00735294117647	GT:AC:AF:NC	0/0:5,1:0.0367647058824,0.00735294117647:+C=72,-A=5,-C=58,-N=1,
+chr1	246707878	.	G	GA	.	.	AC=6;AF=0.043795620438	GT:AC:AF:NC	0/0:6:0.043795620438:+G=90,-G=41,-GA=6,
+chr1	246714587	.	G	A	.	.	AC=30;AF=0.41095890411	GT:AC:AF:NC	0/1:30:0.41095890411:+A=10,+G=2,-A=20,-G=41,
+chr1	246729215	.	T	G,A	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=7,-G=1,
+chr1	246729216	.	T	G,A	.	.	AC=1,1;AF=0.0111111111111,0.0111111111111	GT:AC:AF:NC	0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=9,-G=1,
+chr1	246729378	.	A	C	.	.	AC=7;AF=0.304347826087	GT:AC:AF:NC	0/1:7:0.304347826087:-A=16,-C=7,
+chr1	246729392	.	A	G,AG	.	.	AC=10,1;AF=0.25,0.025	GT:AC:AF:NC	0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,
+chr7	91502881	.	TC	T	.	.	AC=3;AF=0.103448275862	GT:AC:AF:NC	0/0:3:0.103448275862:+T=14,-d1=3,-T=12,
+chr7	91502897	.	C	A	.	.	AC=7;AF=0.162790697674	GT:AC:AF:NC	0/0:7:0.162790697674:+A=7,+C=17,-C=19,
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/run-tests.py	Fri Sep 06 20:37:07 2013 -0400
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+DATASETS = [
+  'artificial',
+  'artificial-samples',
+  'artificial-nofilt',
+  'real',
+  'real-mit',
+  'real-mit-s',
+  'real-nofilt',
+]
+IN_EXT  = '.vcf.in'
+OUT_EXT = '.csv.out'
+ARGS_KEY = '##comment="ARGS='
+
+def main():
+
+  test_dir = os.path.dirname(os.path.relpath(sys.argv[0]))
+  if test_dir:
+    test_dir += os.sep
+
+  for dataset in DATASETS:
+    infile  = test_dir+dataset+IN_EXT
+    outfile = test_dir+dataset+OUT_EXT
+
+    if not os.path.exists(infile):
+      sys.stderr.write("Error: file not found: "+infile+"\n")
+      continue
+    if not os.path.exists(outfile):
+      sys.stderr.write("Error: file not found: "+outfile+"\n")
+      continue
+
+    options = read_options(infile)
+    script_cmd = 'allele-counts.py '+options+' -i '+infile
+    bash_cmd = 'diff '+outfile+' <('+script_cmd+')'
+    # print infile+":"
+    print script_cmd
+    subprocess.call(['bash', '-c', bash_cmd])
+
+
+def read_options(infile):
+  with open(infile, 'r') as infilehandle:
+    for line in infilehandle:
+      line.strip()
+      if ARGS_KEY == line[:len(ARGS_KEY)]:
+        return line[len(ARGS_KEY):-2]
+  return ''
+
+
+if __name__ == '__main__':
+  main()
\ No newline at end of file