changeset 0:ae709fd50581 draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 11:00:01 -0400
parents
children 5cabbe4cfaf4
files fasta_to_tabular.py fasta_to_tabular.xml test-data/1.tabular test-data/2.tabular test-data/4.fasta test-data/454.fasta test-data/a.tab test-data/fasta_to_tabular_out1.tabular test-data/fasta_to_tabular_out2.tabular test-data/fasta_to_tabular_out3.tabular test-data/fasta_to_tabular_out4.tabular test-data/fasta_to_tabular_out5.tabular test-data/fasta_to_tabular_out6.tabular
diffstat 13 files changed, 367 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_to_tabular.py	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools
+"""
+Input: fasta (input file), tabular (output file), int (truncation of id), int (columns from description)
+Output: tabular
+format convert: fasta to tabular
+"""
+
+import sys, os
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    if len(sys.argv) != 5:
+        stop_err("Wrong number of argument. Expect four (fasta, tabular, truncation, columns)")
+    infile = sys.argv[1]
+    outfile = sys.argv[2]
+    keep_first = int( sys.argv[3] )
+    descr_split = int( sys.argv[4] )
+    fasta_title = fasta_seq = ''
+    if keep_first == 0:
+        keep_first = None
+    elif descr_split == 1:
+        #Added one for the ">" character
+        #(which is removed if using descr_split > 1)
+        keep_first += 1
+    if descr_split < 1:
+        stop_err("Bad description split value (should be 1 or more)")
+    out = open( outfile, 'w' )
+    for i, line in enumerate( open( infile ) ):
+        line = line.rstrip( '\r\n' )
+        if not line or line.startswith( '#' ):
+            continue
+        if line.startswith( '>' ):
+            #Don't want any existing tabs to trigger extra columns:
+            line = line.replace('\t', ' ')
+            if i > 0:
+                out.write('\n')
+            if descr_split == 1:
+                out.write(line[1:keep_first])
+            else:
+                words = line[1:].split(None, descr_split-1)
+                #apply any truncation to first word (the id)
+                words[0] = words[0][0:keep_first]
+                #pad with empty columns if required
+                words += [""]*(descr_split-len(words))
+                out.write("\t".join(words))
+            out.write('\t')
+        else:
+            out.write(line)
+    if i > 0:
+        out.write('\n')
+    out.close()
+
+if __name__ == "__main__" : __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_to_tabular.xml	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,128 @@
+<tool id="fasta2tab" name="FASTA-to-Tabular" version="1.1.0">
+	<description>converter</description>
+	<command interpreter="python">fasta_to_tabular.py $input $output $keep_first $descr_columns</command>
+	<inputs>
+		<param name="input" type="data" format="fasta" label="Convert these sequences"/>
+		<param name="descr_columns" type="integer" size="2" value="1" label="How many columns to divide title string into?" help="Typically 2 to take the ID (first word) and description (rest) as two columns, or 1 to give a single column">
+			<validator type="in_range" min="1" />
+		</param>
+		<param name="keep_first" type="integer" size="5" value="0" label="How many title characters to keep?" help="Applies only to the first column taken from the title string ('0' = keep the whole thing), useful when your sequence identifiers are all the same length.">
+			<validator type="in_range" min="0" />
+		</param>
+	</inputs>
+	<outputs>
+		<data name="output" format="tabular"/>
+	</outputs>
+	<tests>
+		<test>
+			<param name="input" value="454.fasta" />
+			<param name="descr_columns" value="1"/>
+			<param name="keep_first" value="0"/>
+			<output name="output" file="fasta_to_tabular_out1.tabular" />
+		</test>
+		
+		<test>
+			<param name="input" value="4.fasta" />
+			<param name="descr_columns" value="1"/>
+			<param name="keep_first" value="0"/>
+			<output name="output" file="fasta_to_tabular_out2.tabular" />
+		</test>
+		
+		<test>
+			<param name="input" value="454.fasta" />
+			<param name="descr_columns" value="1"/>
+			<param name="keep_first" value="14"/>
+			<output name="output" file="fasta_to_tabular_out3.tabular" />
+		</test>
+
+		<test>
+			<param name="input" value="454.fasta" />
+			<param name="descr_columns" value="2"/>
+			<param name="keep_first" value="0"/>
+			<output name="output" file="fasta_to_tabular_out4.tabular" />
+		</test>
+
+		<test>
+			<param name="input" value="454.fasta" />
+			<param name="descr_columns" value="5"/>
+			<param name="keep_first" value="0"/>
+			<output name="output" file="fasta_to_tabular_out5.tabular" />
+		</test>
+
+		<test>
+			<param name="input" value="454.fasta" />
+			<param name="descr_columns" value="5"/>
+			<param name="keep_first" value="10"/>
+			<output name="output" file="fasta_to_tabular_out6.tabular" />
+		</test>
+
+	</tests>
+	<help>
+	
+**What it does**
+
+This tool converts FASTA formatted sequences to TAB-delimited format.
+
+Many tools consider the first word of the FASTA "&gt;" title line to be an identifier, and any remaining text to be a free form description.
+It is therefore useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**.
+In some cases the description can be usefully broken up into more columns -- see the examples below.
+
+The option **How many title characters to keep?** lets you keep only a specified number of characters from the beginning of each FASTA title line.
+With the introduction of the **How many columns to divide title string into?** option this setting is of limited use, but it does still allow you to truncate the identifier.
+
+-----	
+
+**Example**
+
+Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run::
+
+    &gt;EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
+    TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
+    TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
+    &gt;EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
+    AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA
+
+Running this tool with the default settings will produce this (2 column output):
+
+========================================================================== =======================================
+EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_  AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+========================================================================== =======================================
+
+Having the full title line (the FASTA "&gt;" line text) as a column is not always ideal.
+
+The **How many title characters to keep?** option is useful if your identifiers are all the same length.
+In this example the identifier is 14 characters, so setting **How many title characters to keep?** to **14** (and leaving **How many columns to divide title string into?** as the default, **1**) will produce this (2 column output):
+
+============== =======================================
+EYKX4VC02EQLO5 TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+============== =======================================
+
+If however your FASTA file has identifiers of variable length, it is better to split the text into at least two columns.
+Running this tool with **How many columns to divide title string into?** set to **2** will produce this (3 column output):
+
+============== =========================================================== =======================================
+EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_  AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+============== =========================================================== =======================================
+
+Running this tool with **How many columns to divide title string into?** set to **5** will produce this (6 column output):
+
+============== ========== ============ ======== ========================== =======================================
+EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 length=60  xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+============== ========== ============ ======== ========================== =======================================
+
+Running this tool with **How many columns to divide title string into?** set to **5** and **How many title characters to keep?** set to **10** will produce this (6 column output).
+Notice that only the first column is truncated to 10 characters -- and be careful not to trim your sequence names too much (generally they should be unique):
+
+========== ========== ============ ======== ========================== =======================================
+EYKX4VC02E length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D length=60  xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+========== ========== ============ ======== ========================== =======================================
+
+Note the sequences have been truncated for display purposes in the above tables.
+
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,6 @@
+chr22	1000	NM_17
+chr22	2000	NM_18
+chr10	2200	NM_10
+chr10	hap	test
+chr10	1200	NM_11
+chr22	1600	NM_19
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/2.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,10 @@
+1	68	4.1
+2	71	4.6
+3	62	3.8
+4	75	4.4
+5	58	3.2
+6	60	3.1
+7	67	3.8
+8	68	4.1
+9	71	4.3
+10	69	3.7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/4.fasta	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,7 @@
+>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_
+CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_
+CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG
+GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC
+CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC
+ATTGGTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/454.fasta	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,52 @@
+>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_
+CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_
+CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG
+GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC
+CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC
+ATTGGTC
+>EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_
+GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATC
+ATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+>EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_
+TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAA
+TACGTCAAAAACATAACTTCATGATATCTTGCAGT
+>EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_
+GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGC
+CCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+>EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_
+GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGAT
+CATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+>EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_
+CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGA
+ACTTAGCCTTCCTTTNTTAACG
+>EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_
+TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+>EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_
+CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA
+GGGCGTGCTGATGAAGTTCAAAT
+>EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_
+AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAA
+CGCGGTAAA
+>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_
+AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC
+GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT
+TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG
+GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+>EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_
+CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA
+GGGCGTGCTGATGAAGTTCAAAT
+>EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_
+AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+>EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_
+GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+>EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_
+TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGAC
+TTCC
+>EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_
+AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_
+CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+>EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_
+TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCG
+CCCAGTTGCCCTGACTTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/a.tab	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,15 @@
+CHR	SNP	BP	A1	TEST	NMISS	BETA	STAT	P
+1	rs1181876	3671541	T	DOMDEV	958	-1.415	-3.326	0.0009161
+1	rs10492923	5092886	C	ADD	1007	5.105	4.368	1.382e-05
+1	rs10492923	5092886	C	DOMDEV	1007	-5.612	-4.249	2.35e-05
+1	rs10492923	5092886	C	GENO_2DF	1007	NA	19.9	4.775e-05
+1	rs1801133	11778965	T	ADD	1022	1.23	3.97	7.682e-05
+1	rs1801133	11778965	T	GENO_2DF	1022	NA	16.07	0.0003233
+1	rs1361912	12663121	A	ADD	1021	12.69	4.093	4.596e-05
+1	rs1361912	12663121	A	DOMDEV	1021	-12.37	-3.945	8.533e-05
+1	rs1361912	12663121	A	GENO_2DF	1021	NA	17.05	0.0001982
+1	rs1009806	19373138	G	ADD	1021	-1.334	-3.756	0.0001826
+1	rs1009806	19373138	G	GENO_2DF	1021	NA	19.36	6.244e-05
+1	rs873654	29550948	A	DOMDEV	1012	1.526	3.6	0.0003339
+1	rs10489527	36800027	C	ADD	1016	12.67	4.114	4.211e-05
+1	rs10489527	36800027	C	DOMDEV	1016	-13.05	-4.02	6.249e-05
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_to_tabular_out1.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,18 @@
+EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_	CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_	CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
+EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_	GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_	TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT
+EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_	GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_	GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_	CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG
+EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_	TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_	AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA
+EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_	AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_	AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
+EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_to_tabular_out2.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,2 @@
+EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_	CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_	CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_to_tabular_out3.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,18 @@
+EYKX4VC01B65GS	CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+EYKX4VC01BNCSP	CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
+EYKX4VC01CD9FT	GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01B8FW0	TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT
+EYKX4VC01BCGYW	GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+EYKX4VC01AZXC6	GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01CATH5	CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG
+EYKX4VC01BCEIV	TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+EYKX4VC01BWERM	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01BT2O7	AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA
+EYKX4VC01BO0UO	AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+EYKX4VC01CBCPK	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01B474S	AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+EYKX4VC01BB4QL	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+EYKX4VC01BJ37M	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
+EYKX4VC01BV9R8	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01CEPP8	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+EYKX4VC01BTLME	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_to_tabular_out4.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,18 @@
+EYKX4VC01B65GS	length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_	CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+EYKX4VC01BNCSP	length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_	CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
+EYKX4VC01CD9FT	length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_	GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01B8FW0	length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_	TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT
+EYKX4VC01BCGYW	length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_	GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+EYKX4VC01AZXC6	length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_	GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01CATH5	length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_	CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG
+EYKX4VC01BCEIV	length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_	TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+EYKX4VC01BWERM	length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01BT2O7	length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_	AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA
+EYKX4VC01BO0UO	length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_	AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+EYKX4VC01CBCPK	length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01B474S	length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_	AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+EYKX4VC01BB4QL	length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+EYKX4VC01BJ37M	length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
+EYKX4VC01BV9R8	length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01CEPP8	length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+EYKX4VC01BTLME	length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_to_tabular_out5.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,18 @@
+EYKX4VC01B65GS	length=54	xy=0784_1754	region=1	run=R_2007_11_07_16_15_57_	CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+EYKX4VC01BNCSP	length=187	xy=0558_3831	region=1	run=R_2007_11_07_16_15_57_	CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
+EYKX4VC01CD9FT	length=115	xy=0865_1719	region=1	run=R_2007_11_07_16_15_57_	GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01B8FW0	length=95	xy=0799_0514	region=1	run=R_2007_11_07_16_15_57_	TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT
+EYKX4VC01BCGYW	length=115	xy=0434_3926	region=1	run=R_2007_11_07_16_15_57_	GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+EYKX4VC01AZXC6	length=116	xy=0292_0280	region=1	run=R_2007_11_07_16_15_57_	GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01CATH5	length=82	xy=0826_0843	region=1	run=R_2007_11_07_16_15_57_	CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG
+EYKX4VC01BCEIV	length=47	xy=0434_0757	region=1	run=R_2007_11_07_16_15_57_	TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+EYKX4VC01BWERM	length=83	xy=0662_0304	region=1	run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01BT2O7	length=69	xy=0635_1945	region=1	run=R_2007_11_07_16_15_57_	AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA
+EYKX4VC01BO0UO	length=222	xy=0577_3838	region=1	run=R_2007_11_07_16_15_57_	AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+EYKX4VC01CBCPK	length=83	xy=0832_1158	region=1	run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01B474S	length=54	xy=0762_2010	region=1	run=R_2007_11_07_16_15_57_	AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+EYKX4VC01BB4QL	length=57	xy=0431_0363	region=1	run=R_2007_11_07_16_15_57_	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+EYKX4VC01BJ37M	length=64	xy=0522_0192	region=1	run=R_2007_11_07_16_15_57_	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
+EYKX4VC01BV9R8	length=54	xy=0660_2038	region=1	run=R_2007_11_07_16_15_57_	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01CEPP8	length=60	xy=0870_2350	region=1	run=R_2007_11_07_16_15_57_	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+EYKX4VC01BTLME	length=78	xy=0630_0292	region=1	run=R_2007_11_07_16_15_57_	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_to_tabular_out6.tabular	Mon May 19 11:00:01 2014 -0400
@@ -0,0 +1,18 @@
+EYKX4VC01B	length=54	xy=0784_1754	region=1	run=R_2007_11_07_16_15_57_	CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+EYKX4VC01B	length=187	xy=0558_3831	region=1	run=R_2007_11_07_16_15_57_	CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
+EYKX4VC01C	length=115	xy=0865_1719	region=1	run=R_2007_11_07_16_15_57_	GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01B	length=95	xy=0799_0514	region=1	run=R_2007_11_07_16_15_57_	TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT
+EYKX4VC01B	length=115	xy=0434_3926	region=1	run=R_2007_11_07_16_15_57_	GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+EYKX4VC01A	length=116	xy=0292_0280	region=1	run=R_2007_11_07_16_15_57_	GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+EYKX4VC01C	length=82	xy=0826_0843	region=1	run=R_2007_11_07_16_15_57_	CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG
+EYKX4VC01B	length=47	xy=0434_0757	region=1	run=R_2007_11_07_16_15_57_	TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+EYKX4VC01B	length=83	xy=0662_0304	region=1	run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01B	length=69	xy=0635_1945	region=1	run=R_2007_11_07_16_15_57_	AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA
+EYKX4VC01B	length=222	xy=0577_3838	region=1	run=R_2007_11_07_16_15_57_	AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+EYKX4VC01C	length=83	xy=0832_1158	region=1	run=R_2007_11_07_16_15_57_	CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT
+EYKX4VC01B	length=54	xy=0762_2010	region=1	run=R_2007_11_07_16_15_57_	AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+EYKX4VC01B	length=57	xy=0431_0363	region=1	run=R_2007_11_07_16_15_57_	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+EYKX4VC01B	length=64	xy=0522_0192	region=1	run=R_2007_11_07_16_15_57_	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
+EYKX4VC01B	length=54	xy=0660_2038	region=1	run=R_2007_11_07_16_15_57_	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01C	length=60	xy=0870_2350	region=1	run=R_2007_11_07_16_15_57_	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+EYKX4VC01B	length=78	xy=0630_0292	region=1	run=R_2007_11_07_16_15_57_	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC