changeset 17:44c3abd1b767 draft

"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
author nick
date Tue, 31 Mar 2020 09:00:51 +0000
parents 25e8f4cf2a81
children a1e7b592c9a8
files 0todo.txt README.md allele-counts.py allele-counts.xml tests/artificial-nofilt.csv.out tests/artificial-samples.csv.out tests/artificial.csv.out tests/real-mit-s.csv.out tests/real-mit.csv.out tests/real-nofilt.csv.out tests/real.csv.out tests/run-tests.py
diffstat 12 files changed, 286 insertions(+), 179 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0todo.txt	Tue Mar 31 09:00:51 2020 +0000
@@ -0,0 +1,2 @@
+test handling of -c 0 (and -f 0?)
+should it technically handle data lines that start with a '#'?
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Tue Mar 31 09:00:51 2020 +0000
@@ -0,0 +1,4 @@
+variant-annotator
+=================
+
+A Galaxy tool for parsing variant counts from a VCF and computing statistics
--- a/allele-counts.py	Wed Dec 09 11:31:13 2015 -0500
+++ b/allele-counts.py	Tue Mar 31 09:00:51 2020 +0000
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 """
 Run with -h option or see DESCRIPTION for description.
 This script's functionality is being obsoleted by the new, and much more sanely
@@ -11,7 +11,6 @@
 Naive Variant Caller variant count parsing one-liner:
 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
 """
-from __future__ import division
 import os
 import sys
 import errno
@@ -49,6 +48,7 @@
 threshold (but not necessarily in the same order). If the site fails this test,
 the number of alleles is reported as 0."""
 
+
 def get_options(defaults, usage, description='', epilog=''):
   """Get options, print usage text."""
 
@@ -124,7 +124,6 @@
     if len(coords) > 2: print_sample = coords[2]
 
   # set infile_handle to either stdin or the input file
-  global infile_handle
   if infile == OPT_DEFAULTS.get('infile'):
     infile_handle = sys.stdin
     sys.stderr.write("Reading from standard input..\n")
@@ -135,7 +134,6 @@
       fail('Error: Input VCF file '+infile+' not found.')
 
   # set outfile_handle to either stdout or the output file
-  global outfile_handle
   if outfile == OPT_DEFAULTS.get('outfile'):
     outfile_handle = sys.stdout
   else:
@@ -186,23 +184,18 @@
           sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
           sys.exit(1)
 
-
     site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
       freq_thres, covg_thres, stranded, debug=debug)
 
     if debug and site_summary[0]['print']:
-        print line.split('\t')[9].split(':')[-1]
+      print(line.split('\t')[9].split(':')[-1])
 
     try:
       print_site(outfile_handle, site_summary, COLUMNS)
     except IOError as ioe:
       if ioe.errno == errno.EPIPE:
-        cleanup()
         sys.exit(0)
 
-  # close any open filehandles
-  cleanup()
-
   # keeps Galaxy from giving an error if there were messages on stderr
   sys.exit(0)
 
@@ -238,7 +231,7 @@
 
   if len(fields) < 9:
     fail("Error in input VCF: wrong number of fields in data line. "
-          +"Failed on line:\n"+line)
+         "Failed on line:\n"+line)
 
   site['chr'] = fields[0]
   site['pos'] = fields[1]
@@ -246,35 +239,38 @@
 
   if len(samples) < len(sample_names):
     fail("Error in input VCF: missing sample fields in data line. "
-          +"Failed on line:\n"+line)
+         "Failed on line:\n"+line)
   elif len(samples) > len(sample_names):
     fail("Error in input VCF: more sample fields in data line than in header. "
-          +"Failed on line:\n"+line)
+         "Failed on line:\n"+line)
 
   sample_counts = {}
   for i in range(len(samples)):
-    
+
     variant_counts = {}
     counts = samples[i].split(':')[-1]
     counts = counts.split(',')
 
     for count in counts:
-      if not count:
+      if not count or count == '.':
         continue
       fields = count.split('=')
       if len(fields) != 2:
         fail("Error in input VCF: Incorrect variant data format (must contain "
-          +"a single '='). Failed on line:\n"+line)
+             "a single '='). Failed on data \"{}\" in line:\n{}"
+             .format(count, line))
       (variant, reads) = fields
       if variant[1:] not in canonical:
         continue
-      if variant[0] != '-' and variant[0] != '+':
-        fail("Error in input VCF: variant data not strand-specific. "
-          +"Failed on line:\n"+line)
+      if not variant.startswith('-') and not variant.startswith('+'):
+        fail("Error in input VCF: variant data not strand-specific. Failed on "
+             "data \"{}\" on line:\n{}".format(variant, line))
       try:
         variant_counts[variant] = int(float(reads))
       except ValueError:
-        fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
+        fail("Error in input VCF: Variant count not a valid number. Failed on "
+             "variant count string \"{}\"\nIn the following line:\n{}"
+             .format(reads, line))
 
     sample_counts[sample_names[i]] = variant_counts
 
@@ -338,7 +334,7 @@
         sample[strand+base_count[0]] = base_count[1]
       # fill in any zeros
       for base in canonical:
-        if not sample.has_key(strand+base):
+        if strand+base not in sample:
           sample[strand+base] = 0
 
     sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
@@ -351,7 +347,7 @@
         ranked_bases[1] = ranked_bases[2]
         ranked_bases[2] = tmp_base
 
-    if debug: print "ranked +-: "+str(ranked_bases)
+    if debug: print("ranked +-: "+str(ranked_bases))
 
     sample['coverage'] = coverage
     try:
@@ -396,7 +392,7 @@
     if strand in strands:
       summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
 
-  return summed_counts.items()
+  return list(summed_counts.items())
 
 
 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
@@ -423,10 +419,10 @@
     variant_counts.sort(reverse=True, key=lambda variant: variant[1])
 
   if debug:
-    print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
+    print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres))
     for variant in variant_counts:
-      print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
-        str(variant[1]/coverage))
+      print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
+        str(variant[1]/coverage)))
 
   # remove bases below the frequency threshold
   if freq_thres > 0:
@@ -452,8 +448,8 @@
     sort=False, debug=debug)
 
   if debug:
-    print '+ '+str(alleles_plus)
-    print '- '+str(alleles_minus)
+    print('+ '+str(alleles_plus))
+    print('- '+str(alleles_minus))
 
   # Check if each strand reports the same set of alleles.
   # Sorting by base is to compare lists without regard to order (as sets).
@@ -492,17 +488,9 @@
 
 
 def fail(message):
-  cleanup()
   sys.stderr.write(message+'\n')
   sys.exit(1)
 
 
-def cleanup():
-  if isinstance(infile_handle, file):
-    infile_handle.close()
-  if isinstance(outfile_handle, file):
-    outfile_handle.close()
-
-
 if __name__ == "__main__":
   main()
\ No newline at end of file
--- a/allele-counts.xml	Wed Dec 09 11:31:13 2015 -0500
+++ b/allele-counts.xml	Tue Mar 31 09:00:51 2020 +0000
@@ -1,6 +1,10 @@
-<tool id="allele_counts_1" version="1.2" name="Variant Annotator">
+<tool id="allele_counts_1" version="1.3" name="Variant Annotator">
   <description> process variant counts</description>
-  <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+    <exit_code range=":-1" level="fatal" />
+  </stdio>
+  <command>allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt
   #if $seed:
     -r $seed
   #end if
@@ -15,12 +19,8 @@
     <param name="seed" type="text" value="" label="PRNG seed" />
   </inputs>
   <outputs>
-    <data name="output" format="tabular"/>
+    <data name="output" format="tabular" />
   </outputs>
-  <stdio>
-    <exit_code range="1:" err_level="fatal"/>
-    <exit_code range=":-1" err_level="fatal"/>
-  </stdio>
 
   <tests>
     <test>
@@ -114,4 +114,40 @@
 
   </help>
 
+  <citations>
+    <citation type="bibtex">
+      @article{Blankenberg2014,
+        author = {Blankenberg, Daniel and {Von Kuster}, Gregory and Bouvier, Emil and Baker, Dannon and Afgan, Enis and Stoler, Nicholas and Taylor, James and Nekrutenko, Anton},
+        doi = {10.1186/gb4161},
+        issn = {1465-6906},
+        journal = {Genome Biology},
+        keywords = {galaxy},
+        number = {2},
+        pages = {403},
+        title = {{Dissemination of scientific software with Galaxy ToolShed}},
+        url = {http://genomebiology.biomedcentral.com/articles/10.1186/gb4161},
+        volume = {15},
+        year = {2014}
+      }
+    </citation>
+    <citation type="bibtex">
+      @article{Dickins2014,
+        archivePrefix = {arXiv},
+        arxivId = {15334406},
+        author = {Dickins, Benjamin and Rebolledo-Jaramillo, Boris and Su, Marcia Shu Wei and Paul, Ian M and Blankenberg, Daniel and Stoler, Nicholas and Makova, Kateryna D and Nekrutenko, Anton},
+        doi = {10.2144/000114146},
+        eprint = {15334406},
+        isbn = {5049880467},
+        issn = {19409818},
+        journal = {BioTechniques},
+        number = {3},
+        pages = {134--141},
+        pmid = {24641477},
+        title = {{Controlling for contamination in re-sequencing studies with a reproducible web-based phylogenetic approach}},
+        volume = {56},
+        year = {2014}
+      }
+    </citation>
+  </citations>
+
 </tool>
--- a/tests/artificial-nofilt.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/artificial-nofilt.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,27 +1,27 @@
-#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
-THYROID	chr1	0	30	0	0	0	30	1	A	.	0.0
-THYROID	chr1	10	30	0	2	0	32	2	A	G	0.0625
-THYROID	chr1	20	31	0	1	0	32	0	A	G	0.03125
-THYROID	chr1	30	21	0	4	0	25	2	A	G	0.16
-THYROID	chr1	40	22	0	3	0	25	0	A	G	0.12
-THYROID	chr1	50	3	0	0	0	3	1	A	.	0.0
-THYROID	chr1	60	2	0	2	0	4	2	A	G	0.5
-THYROID	chr1	70	1	0	3	0	4	0	G	A	0.25
-THYROID	chr1	80	104	0	3	0	107	0	A	G	0.02804
-THYROID	chr1	90	100	2	11	0	113	3	A	G	0.09735
-THYROID	chr1	100	100	1	11	0	112	0	A	G	0.09821
-THYROID	chr1	120	0	0	0	0	0	0	.	.	0.0
-THYROID	chr1	130	0	0	2	0	2	1	G	.	0.0
-THYROID	chr1	140	0	0	1	0	1	0	G	.	0.0
-THYROID	chr1	150	0	0	4	0	4	1	G	.	0.0
-THYROID	chr1	160	0	0	3	0	3	0	G	.	0.0
-THYROID	chr1	260	106	0	14	0	120	2	A	G	0.11667
-THYROID	chr1	300	2	0	2	76	80	3	T	G	0.025
-THYROID	chr1	310	12	0	12	76	100	3	T	G	0.12
-THYROID	chr1	320	12	0	12	56	80	3	T	A	0.15
-THYROID	chr1	330	7	0	7	66	80	3	T	G	0.0875
-THYROID	chr1	340	1	0	1	98	100	0	T	G	0.01
-THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11
-THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2
-THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02
-THYROID	chr1	420	104	0	0	0	104	1	A	.	0.0
+#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MAF	BIAS
+THYROID	chr1	0	30	0	0	0	30	1	A	.	0.0	.
+THYROID	chr1	10	30	0	2	0	32	2	A	G	0.0625	0.0
+THYROID	chr1	20	31	0	1	0	32	0	A	G	0.03125	2.0
+THYROID	chr1	30	21	0	4	0	25	2	A	G	0.16	0.08013
+THYROID	chr1	40	22	0	3	0	25	0	A	G	0.12	1.78571
+THYROID	chr1	50	3	0	0	0	3	1	A	.	0.0	.
+THYROID	chr1	60	2	0	2	0	4	2	A	G	0.5	0.0
+THYROID	chr1	70	1	0	3	0	4	0	G	A	0.25	2.0
+THYROID	chr1	80	104	0	3	0	107	0	A	G	0.02804	1.01905
+THYROID	chr1	90	100	2	11	0	113	3	A	G	0.09735	0.16381
+THYROID	chr1	100	100	1	11	0	112	0	A	G	0.09821	0.16381
+THYROID	chr1	120	0	0	0	0	0	0	.	.	0.0	.
+THYROID	chr1	130	0	0	2	0	2	1	G	.	0.0	.
+THYROID	chr1	140	0	0	1	0	1	0	G	.	0.0	.
+THYROID	chr1	150	0	0	4	0	4	1	G	.	0.0	.
+THYROID	chr1	160	0	0	3	0	3	0	G	.	0.0	.
+THYROID	chr1	260	106	0	14	0	120	2	A	G	0.11667	2.4
+THYROID	chr1	300	2	0	2	76	80	3	T	A	0.025	0.0
+THYROID	chr1	310	12	0	12	76	100	3	T	A	0.12	0.0
+THYROID	chr1	320	12	0	12	56	80	3	T	G	0.15	0.64394
+THYROID	chr1	330	7	0	7	66	80	3	T	G	0.0875	1.06247
+THYROID	chr1	340	1	0	1	98	100	0	T	A	0.01	1.22222
+THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11	1.25352
+THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2	0.0
+THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02	5.5
+THYROID	chr1	420	104	0	0	0	104	1	A	.	0.0	.
--- a/tests/artificial-samples.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/artificial-samples.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,13 +1,13 @@
-BRAIN	chr1	0	30	0	0	0	30	1	A	.	0.0
-ARTERY	chr1	0	0	0	30	0	30	1	G	.	0.0
-THYROID	chr1	0	0	30	0	0	30	1	C	.	0.0
-BRAIN	chr1	10	30	0	0	0	30	1	A	.	0.0
-ARTERY	chr1	10	30	0	2	0	32	1	A	G	0.0625
-THYROID	chr1	10	31	0	1	0	32	1	A	G	0.03125
-BRAIN	chr1	20	30	0	2	0	32	1	A	G	0.0625
-ARTERY	chr1	20	34	0	6	0	40	2	A	G	0.15
-THYROID	chr1	20	30	0	2	0	32	0	A	G	0.0625
-BRAIN	chr1	30	30	0	0	0	30	1	A	.	0.0
-BRAIN	chr1	40	0	0	0	30	30	1	T	.	0.0
-ARTERY	chr1	40	1	0	2	97	100	0	T	G	0.02
-THYROID	chr1	40	0	69	0	31	100	0	C	T	0.31
+BRAIN	chr1	0	30	0	0	0	30	1	A	.	0.0	.
+ARTERY	chr1	0	0	0	30	0	30	1	G	.	0.0	.
+THYROID	chr1	0	0	30	0	0	30	1	C	.	0.0	.
+BRAIN	chr1	10	30	0	0	0	30	1	A	.	0.0	.
+ARTERY	chr1	10	30	0	2	0	32	1	A	G	0.0625	0.0
+THYROID	chr1	10	31	0	1	0	32	1	A	G	0.03125	2.0
+BRAIN	chr1	20	30	0	2	0	32	1	A	G	0.0625	0.0
+ARTERY	chr1	20	34	0	6	0	40	2	A	G	0.15	0.0
+THYROID	chr1	20	30	0	2	0	32	0	A	G	0.0625	1.88235
+BRAIN	chr1	30	30	0	0	0	30	1	A	.	0.0	.
+BRAIN	chr1	40	0	0	0	30	30	1	T	.	0.0	.
+ARTERY	chr1	40	1	0	2	97	100	0	T	G	0.02	5.5
+THYROID	chr1	40	0	69	0	31	100	0	C	T	0.31	1.00096
--- a/tests/artificial.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/artificial.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,35 +1,35 @@
-THYROID	chr1	0	30	0	0	0	30	1	A	.	0.0
-THYROID	chr1	10	30	0	2	0	32	1	A	G	0.0625
-THYROID	chr1	20	31	0	1	0	32	1	A	G	0.03125
-THYROID	chr1	30	21	0	4	0	25	2	A	G	0.16
-THYROID	chr1	40	22	0	3	0	25	0	A	G	0.12
-THYROID	chr1	50	30	0	0	0	30	1	A	.	0.0
-THYROID	chr1	60	31	0	0	0	31	1	A	.	0.0
-THYROID	chr1	70	21	0	0	0	21	1	A	.	0.0
-THYROID	chr1	80	22	0	0	0	22	1	A	.	0.0
-THYROID	chr1	82	30	0	2	0	32	1	A	G	0.0625
-THYROID	chr1	84	31	0	1	0	32	1	A	G	0.03125
-THYROID	chr1	86	21	0	4	0	25	2	A	G	0.16
-THYROID	chr1	88	22	0	3	0	25	0	A	G	0.12
-THYROID	chr1	90	30	0	0	0	30	1	A	.	0.0
-THYROID	chr1	100	31	0	0	0	31	1	A	.	0.0
-THYROID	chr1	110	21	0	0	0	21	1	A	.	0.0
-THYROID	chr1	120	22	0	0	0	22	1	A	.	0.0
-THYROID	chr1	210	20	0	0	0	20	1	A	.	0.0
-THYROID	chr1	220	22	0	0	0	22	1	A	.	0.0
-THYROID	chr1	230	182	0	18	0	200	1	A	G	0.09
-THYROID	chr1	240	180	0	20	0	200	2	A	G	0.1
-THYROID	chr1	250	178	0	22	0	200	2	A	G	0.11
-THYROID	chr1	260	106	0	14	0	120	0	A	G	0.11667
-THYROID	chr1	300	2	0	2	76	80	1	T	G	0.025
-THYROID	chr1	310	12	0	12	76	100	3	T	G	0.12
-THYROID	chr1	320	12	0	12	56	80	3	T	A	0.15
-THYROID	chr1	330	7	0	7	66	80	0	T	G	0.0875
-THYROID	chr1	340	1	0	1	98	100	1	T	G	0.01
-THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11
-THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2
-THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02
-THYROID	chr1	420	104	0	0	0	104	1	A	.	0.0
-THYROID	chr1	430	30	0	0	0	30	1	A	.	0.0
-THYROID	chr1	440	30	0	0	0	30	1	A	.	0.0
-THYROID	27	1234567890	29	0	0	0	29	1	A	.	0.0
+THYROID	chr1	0	30	0	0	0	30	1	A	.	0.0	.
+THYROID	chr1	10	30	0	2	0	32	1	A	G	0.0625	0.0
+THYROID	chr1	20	31	0	1	0	32	1	A	G	0.03125	2.0
+THYROID	chr1	30	21	0	4	0	25	2	A	G	0.16	0.08013
+THYROID	chr1	40	22	0	3	0	25	0	A	G	0.12	1.78571
+THYROID	chr1	50	30	0	0	0	30	1	A	.	0.0	.
+THYROID	chr1	60	31	0	0	0	31	1	A	.	0.0	.
+THYROID	chr1	70	21	0	0	0	21	1	A	.	0.0	.
+THYROID	chr1	80	22	0	0	0	22	1	A	.	0.0	.
+THYROID	chr1	82	30	0	2	0	32	1	A	G	0.0625	0.0
+THYROID	chr1	84	31	0	1	0	32	1	A	G	0.03125	2.0
+THYROID	chr1	86	21	0	4	0	25	2	A	G	0.16	0.08013
+THYROID	chr1	88	22	0	3	0	25	0	A	G	0.12	1.78571
+THYROID	chr1	90	30	0	0	0	30	1	A	.	0.0	.
+THYROID	chr1	100	31	0	0	0	31	1	A	.	0.0	.
+THYROID	chr1	110	21	0	0	0	21	1	A	.	0.0	.
+THYROID	chr1	120	22	0	0	0	22	1	A	.	0.0	.
+THYROID	chr1	210	20	0	0	0	20	1	A	.	0.0	.
+THYROID	chr1	220	22	0	0	0	22	1	A	.	0.0	.
+THYROID	chr1	230	182	0	18	0	200	1	A	G	0.09	0.0
+THYROID	chr1	240	180	0	20	0	200	2	A	G	0.1	0.0
+THYROID	chr1	250	178	0	22	0	200	2	A	G	0.11	0.0
+THYROID	chr1	260	106	0	14	0	120	0	A	G	0.11667	2.4
+THYROID	chr1	300	2	0	2	76	80	1	T	A	0.025	0.0
+THYROID	chr1	310	12	0	12	76	100	3	T	A	0.12	0.0
+THYROID	chr1	320	12	0	12	56	80	3	T	G	0.15	0.64394
+THYROID	chr1	330	7	0	7	66	80	0	T	G	0.0875	1.06247
+THYROID	chr1	340	1	0	1	98	100	1	T	A	0.01	1.22222
+THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11	1.25352
+THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2	0.0
+THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02	5.5
+THYROID	chr1	420	104	0	0	0	104	1	A	.	0.0	.
+THYROID	chr1	430	30	0	0	0	30	1	A	.	0.0	.
+THYROID	chr1	440	30	0	0	0	30	1	A	.	0.0	.
+THYROID	27	1234567890	29	0	0	0	29	1	A	.	0.0	.
--- a/tests/real-mit-s.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/real-mit-s.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,12 +1,12 @@
-#SAMPLE	CHR	POS	+A	+C	+G	+T	-A	-C	-G	-T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
-S1	chrM	2000	1	9095	1	0	7	5808	0	1	14913	1	C	A	0.00054
-S3	chrM	2000	0	7933	0	4	10	5242	1	2	13192	1	C	A	0.00076
-S1	chrM	3000	17399	7	22	8	10567	35	22	4	28064	0	A	G	0.00157
-S2	chrM	3000	12535	3	24	2	7937	13	12	2	20528	1	A	G	0.00175
-S3	chrM	3000	18981	7	29	6	11286	33	17	4	30363	0	A	G	0.00152
-S4	chrM	3000	9254	1	15	2	6240	16	14	1	15543	0	A	G	0.00187
-S1	chrM	4000	6134	2	1	3	6124	1	1	1	12267	1	A	T	0.00033
-S1	chrM	7000	0	17	1	6216	4	9	2	7529	13778	0	T	C	0.00189
-S2	chrM	7000	0	7	2	5104	0	9	4	6288	11414	1	T	C	0.0014
-S3	chrM	7000	0	9	0	6446	4	4	10	7506	13979	1	T	C	0.00093
-S3	chrM	8000	3	1	5023	1	1	0	5043	2	10074	1	G	A	0.0004
+#SAMPLE	CHR	POS	+A	+C	+G	+T	-A	-C	-G	-T	CVRG	ALLELES	MAJOR	MINOR	MAF	BIAS
+S1	chrM	2000	1	9095	1	0	7	5808	0	1	14913	1	C	A	0.00054	2.03879
+S3	chrM	2000	0	7933	0	4	10	5242	1	2	13192	1	C	A	0.00076	2.51047
+S1	chrM	3000	17399	7	22	8	10567	35	22	4	28064	0	A	G	0.00157	0.51868
+S2	chrM	3000	12535	3	24	2	7937	13	12	2	20528	1	A	G	0.00175	0.22864
+S3	chrM	3000	18981	7	29	6	11286	33	17	4	30363	0	A	G	0.00152	0.01416
+S4	chrM	3000	9254	1	15	2	6240	16	14	1	15543	0	A	G	0.00187	0.33202
+S1	chrM	4000	6134	2	1	3	6124	1	1	1	12267	1	A	T	0.00033	0.99804
+S1	chrM	7000	0	17	1	6216	4	9	2	7529	13778	0	T	C	0.00189	0.81221
+S2	chrM	7000	0	7	2	5104	0	9	4	6288	11414	1	T	C	0.0014	0.04254
+S3	chrM	7000	0	9	0	6446	4	4	10	7506	13979	1	T	C	0.00093	0.92561
+S3	chrM	8000	3	1	5023	1	1	0	5043	2	10074	1	G	A	0.0004	1.00358
--- a/tests/real-mit.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/real-mit.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,12 +1,12 @@
-#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
-S1	chrM	2000	8	14903	1	1	14913	1	C	A	0.00054
-S3	chrM	2000	10	13175	1	6	13192	1	C	A	0.00076
-S1	chrM	3000	27966	42	44	12	28064	0	A	G	0.00157
-S2	chrM	3000	20472	16	36	4	20528	1	A	G	0.00175
-S3	chrM	3000	30267	40	46	10	30363	0	A	G	0.00152
-S4	chrM	3000	15494	17	29	3	15543	0	A	G	0.00187
-S1	chrM	4000	12258	3	2	4	12267	1	A	T	0.00033
-S1	chrM	7000	4	26	3	13745	13778	0	T	C	0.00189
-S2	chrM	7000	0	16	6	11392	11414	1	T	C	0.0014
-S3	chrM	7000	4	13	10	13952	13979	1	T	C	0.00093
-S3	chrM	8000	4	1	10066	3	10074	1	G	A	0.0004
+#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MAF	BIAS
+S1	chrM	2000	8	14903	1	1	14913	1	C	A	0.00054	2.03879
+S3	chrM	2000	10	13175	1	6	13192	1	C	A	0.00076	2.51047
+S1	chrM	3000	27966	42	44	12	28064	0	A	G	0.00157	0.51868
+S2	chrM	3000	20472	16	36	4	20528	1	A	G	0.00175	0.22864
+S3	chrM	3000	30267	40	46	10	30363	0	A	G	0.00152	0.01416
+S4	chrM	3000	15494	17	29	3	15543	0	A	G	0.00187	0.33202
+S1	chrM	4000	12258	3	2	4	12267	1	A	T	0.00033	0.99804
+S1	chrM	7000	4	26	3	13745	13778	0	T	C	0.00189	0.81221
+S2	chrM	7000	0	16	6	11392	11414	1	T	C	0.0014	0.04254
+S3	chrM	7000	4	13	10	13952	13979	1	T	C	0.00093	0.92561
+S3	chrM	8000	4	1	10066	3	10074	1	G	A	0.0004	1.00358
--- a/tests/real-nofilt.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/real-nofilt.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,15 +1,15 @@
-#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MINOR.FREQ.PERC.
-THYROID	chr1	246704250	29	0	0	0	29	1	A	.	0.0
-THYROID	chr1	246704257	0	0	0	71	71	1	T	.	0.0
-THYROID	chr1	246704268	104	0	0	0	104	1	A	.	0.0
-THYROID	chr1	246704269	0	0	0	105	105	1	T	.	0.0
-THYROID	chr1	246704363	0	72	3	0	75	0	C	G	0.04
-THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704
-THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0
-THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096
-THYROID	chr1	246729215	1	0	1	88	90	0	T	G	0.01111
-THYROID	chr1	246729216	1	0	1	90	92	0	T	G	0.01087
-THYROID	chr1	246729378	16	7	0	0	23	0	A	C	0.30435
-THYROID	chr1	246729392	29	0	10	0	39	0	A	G	0.25641
-THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0
-THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279
+#SAMPLE	CHR	POS	A	C	G	T	CVRG	ALLELES	MAJOR	MINOR	MAF	BIAS
+THYROID	chr1	246704250	29	0	0	0	29	1	A	.	0.0	.
+THYROID	chr1	246704257	0	0	0	71	71	1	T	.	0.0	.
+THYROID	chr1	246704268	104	0	0	0	104	1	A	.	0.0	.
+THYROID	chr1	246704269	0	0	0	105	105	1	T	.	0.0	.
+THYROID	chr1	246704363	0	72	3	0	75	0	C	G	0.04	1.36364
+THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704	2.14286
+THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0	.
+THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096	1.22996
+THYROID	chr1	246729215	1	0	1	88	90	0	T	A	0.01111	1.08537
+THYROID	chr1	246729216	1	0	1	90	92	0	T	A	0.01087	1.10976
+THYROID	chr1	246729378	16	7	0	0	23	0	A	C	0.30435	.
+THYROID	chr1	246729392	29	0	10	0	39	0	A	G	0.25641	.
+THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0	.
+THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279	1.79167
--- a/tests/real.csv.out	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/real.csv.out	Tue Mar 31 09:00:51 2020 +0000
@@ -1,11 +1,11 @@
-THYROID	chr1	246704250	29	0	0	0	29	1	A	.	0.0
-THYROID	chr1	246704257	0	0	0	71	71	1	T	.	0.0
-THYROID	chr1	246704268	104	0	0	0	104	1	A	.	0.0
-THYROID	chr1	246704269	0	0	0	105	105	1	T	.	0.0
-THYROID	chr1	246704363	0	72	3	0	75	0	C	G	0.04
-THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704
-THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0
-THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096
-THYROID	chr1	246729216	1	0	1	90	92	0	T	G	0.01087
-THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0
-THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279
+THYROID	chr1	246704250	29	0	0	0	29	1	A	.	0.0	.
+THYROID	chr1	246704257	0	0	0	71	71	1	T	.	0.0	.
+THYROID	chr1	246704268	104	0	0	0	104	1	A	.	0.0	.
+THYROID	chr1	246704269	0	0	0	105	105	1	T	.	0.0	.
+THYROID	chr1	246704363	0	72	3	0	75	0	C	G	0.04	1.36364
+THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704	2.14286
+THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0	.
+THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096	1.22996
+THYROID	chr1	246729216	1	0	1	90	92	0	T	A	0.01087	1.10976
+THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0	.
+THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279	1.79167
--- a/tests/run-tests.py	Wed Dec 09 11:31:13 2015 -0500
+++ b/tests/run-tests.py	Tue Mar 31 09:00:51 2020 +0000
@@ -1,8 +1,9 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 import sys
 import subprocess
 
+SCRIPT_NAME = 'allele-counts.py'
 DATASETS = [
   'artificial',
   'artificial-samples',
@@ -16,15 +17,52 @@
 OUT_EXT = '.csv.out'
 ARGS_KEY = '##comment="ARGS='
 
+XML = {
+  'tests_start':'  <tests>',
+  'test_start': '    <test>',
+  'input':      '      <param name="input" value="tests/%s" />',
+  'param':      '      <param name="%s" value="%s" />',
+  'output':     '      <output name="output" file="tests/%s" />',
+  'test_end':   '    </test>',
+  'tests_end':  '  </tests>',
+}
+PARAMS = {
+  '-f':'freq',
+  '-c':'covg',
+  '-H':'header',
+  '-s':'stranded',
+  '-n':'nofilt',
+  '-r':'seed',
+}
+PARAM_ARG = {
+  '-f':True,
+  '-c':True,
+  '-H':False,
+  '-s':False,
+  '-n':False,
+  '-r':True,
+}
+
 def main():
 
-  test_dir = os.path.dirname(os.path.relpath(sys.argv[0]))
-  if test_dir:
-    test_dir += os.sep
+  do_print_xml = False
+  if len(sys.argv) > 1:
+    if sys.argv[1] == '-x':
+      do_print_xml = True
+    else:
+      sys.stderr.write("Error: unrecognized option '"+sys.argv[1]+"'\n")
+      sys.exit(1)
+
+  test_dir = os.path.dirname(os.path.realpath(__file__))
+  script_dir = os.path.relpath(os.path.dirname(test_dir))
+  test_dir = os.path.relpath(test_dir)
+
+  if do_print_xml:
+    print(XML.get('tests_start'))
 
   for dataset in DATASETS:
-    infile  = test_dir+dataset+IN_EXT
-    outfile = test_dir+dataset+OUT_EXT
+    infile  = os.path.join(test_dir, dataset+IN_EXT)
+    outfile = os.path.join(test_dir, dataset+OUT_EXT)
 
     if not os.path.exists(infile):
       sys.stderr.write("Error: file not found: "+infile+"\n")
@@ -34,11 +72,50 @@
       continue
 
     options = read_options(infile)
-    script_cmd = 'allele-counts.py '+options+' -i '+infile
-    bash_cmd = 'diff '+outfile+' <('+script_cmd+')'
-    # print infile+":"
-    print script_cmd
-    subprocess.call(['bash', '-c', bash_cmd])
+    if do_print_xml:
+      print_xml(infile, outfile, options, XML, PARAMS, PARAM_ARG)
+    else:
+      run_tests(infile, outfile, options, script_dir)
+
+  if do_print_xml:
+    print(XML.get('tests_end'))
+
+
+def run_tests(infile, outfile, options, script_dir):
+  script_cmd = os.path.join(script_dir, SCRIPT_NAME)+' '+options+' -i '+infile
+  bash_cmd = 'diff '+outfile+' <('+script_cmd+')'
+  print(script_cmd)
+  subprocess.call(['bash', '-c', bash_cmd])
+
+
+def print_xml(infile, outfile, options_str, xml, params, param_arg):
+  infile = os.path.basename(infile)
+  outfile = os.path.basename(outfile)
+
+  options = options_str.split()  # on whitespace
+
+  print(xml.get('test_start'))
+  print(xml.get('input') % infile)
+
+  # read in options one at a time, print <param> line
+  i = 0
+  while i < len(options):
+    opt = options[i]
+    if opt not in params or opt not in param_arg:
+      sys.stderr.write("Error: unknown option '"+opt+"' in ARGS list in file "+infile+"\n")
+      sys.exit(1)
+    # takes argument
+    if param_arg[opt]:
+      i+=1
+      arg = options[i]
+      print(xml.get('param') % (params[opt], arg))
+    # no argument (boolean)
+    else:
+      print(xml.get('param') % (params[opt], 'true'))
+    i+=1
+
+  print(xml.get('output') % outfile)
+  print(xml.get('test_end'))
 
 
 def read_options(infile):