Mercurial > repos > devteam > fasta_to_tabular
changeset 0:ae709fd50581 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 11:00:01 -0400 |
parents | |
children | 5cabbe4cfaf4 |
files | fasta_to_tabular.py fasta_to_tabular.xml test-data/1.tabular test-data/2.tabular test-data/4.fasta test-data/454.fasta test-data/a.tab test-data/fasta_to_tabular_out1.tabular test-data/fasta_to_tabular_out2.tabular test-data/fasta_to_tabular_out3.tabular test-data/fasta_to_tabular_out4.tabular test-data/fasta_to_tabular_out5.tabular test-data/fasta_to_tabular_out6.tabular |
diffstat | 13 files changed, 367 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_to_tabular.py Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools +""" +Input: fasta (input file), tabular (output file), int (truncation of id), int (columns from description) +Output: tabular +format convert: fasta to tabular +""" + +import sys, os + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + +def __main__(): + if len(sys.argv) != 5: + stop_err("Wrong number of argument. Expect four (fasta, tabular, truncation, columns)") + infile = sys.argv[1] + outfile = sys.argv[2] + keep_first = int( sys.argv[3] ) + descr_split = int( sys.argv[4] ) + fasta_title = fasta_seq = '' + if keep_first == 0: + keep_first = None + elif descr_split == 1: + #Added one for the ">" character + #(which is removed if using descr_split > 1) + keep_first += 1 + if descr_split < 1: + stop_err("Bad description split value (should be 1 or more)") + out = open( outfile, 'w' ) + for i, line in enumerate( open( infile ) ): + line = line.rstrip( '\r\n' ) + if not line or line.startswith( '#' ): + continue + if line.startswith( '>' ): + #Don't want any existing tabs to trigger extra columns: + line = line.replace('\t', ' ') + if i > 0: + out.write('\n') + if descr_split == 1: + out.write(line[1:keep_first]) + else: + words = line[1:].split(None, descr_split-1) + #apply any truncation to first word (the id) + words[0] = words[0][0:keep_first] + #pad with empty columns if required + words += [""]*(descr_split-len(words)) + out.write("\t".join(words)) + out.write('\t') + else: + out.write(line) + if i > 0: + out.write('\n') + out.close() + +if __name__ == "__main__" : __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_to_tabular.xml Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,128 @@ +<tool id="fasta2tab" name="FASTA-to-Tabular" version="1.1.0"> + <description>converter</description> + <command interpreter="python">fasta_to_tabular.py $input $output $keep_first $descr_columns</command> + <inputs> + <param name="input" type="data" format="fasta" label="Convert these sequences"/> + <param name="descr_columns" type="integer" size="2" value="1" label="How many columns to divide title string into?" help="Typically 2 to take the ID (first word) and decription (rest) as two columns, or 1 to give a single column"> + <validator type="in_range" min="1" /> + </param> + <param name="keep_first" type="integer" size="5" value="0" label="How many title characters to keep?" help="Applies only to the first column taken from the title string ('0' = keep the whole thing), useful when your sequence identifiers are all the same length."> + <validator type="in_range" min="0" /> + </param> + </inputs> + <outputs> + <data name="output" format="tabular"/> + </outputs> + <tests> + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="1"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out1.tabular" /> + </test> + + <test> + <param name="input" value="4.fasta" /> + <param name="descr_columns" value="1"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out2.tabular" /> + </test> + + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="1"/> + <param name="keep_first" value="14"/> + <output name="output" file="fasta_to_tabular_out3.tabular" /> + </test> + + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="2"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out4.tabular" /> + </test> + + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="5"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out5.tabular" /> + </test> + + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="5"/> + <param name="keep_first" value="10"/> + <output name="output" file="fasta_to_tabular_out6.tabular" /> + </test> + + </tests> + <help> + +**What it does** + +This tool converts FASTA formatted sequences to TAB-delimited format. + +Many tools consider the first word of the FASTA ">" title line to be an identifier, and any remaining text to be a free form description. +It is therefore useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**. +In some cases the description can be usefully broken up into more columns -- see the examples . + +The option *How many characters to keep?* allows to select a specified number of letters from the beginning of each FASTA entry. +With the introduction of the **How many columns to divide title string into?** option this setting is of limited use, but does still allow you to truncate the identifier. + +----- + +**Example** + +Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: + + >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ + TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG + TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG + >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ + AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA + +Running this tool with the default settings will produce this (2 column output): + +========================================================================== ======================================= +EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG +EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA +========================================================================== ======================================= + +Having the full title line (the FASTA ">" line text) as a column is not always ideal. + +The **How many characters to keep?** option is useful if your identifiers are all the same length. +In this example the identifier is 14 characters, so setting **How many characters to keep?** to **14** (and leaving **How many columns to divide title string into?** as the default, **1**) will produce this (2 column output): + +============== ======================================= +EYKX4VC02EQLO5 TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG +EYKX4VC02D4GS2 AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA +============== ======================================= + +If however your FASTA file has identifiers of variable length, it is better to split the text into at least two columns. +Running this tool with **How many columns to divide title string into?** to **2** will produce this (3 column output): + +============== =========================================================== ======================================= +EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG +EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA +============== =========================================================== ======================================= + +Running this tool with **How many columns to divide title string into?** to **5** will produce this (5 column output): + +============== ========== ============ ======== ========================== ======================================= +EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG +EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA +============== ========== ============ ======== ========================== ======================================= + +Running this tool with **How many columns to divide title string into?** to **5** and **How many characters to keep?** to **10** will produce this (5 column output). +Notice that only the first column is truncated to 10 characters -- and be careful not to trim your sequence names too much (generally they should be unique): + +========== ========== ============ ======== ========================== ======================================= +EYKX4VC02E length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG +EYKX4VC02D length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA +========== ========== ============ ======== ========================== ======================================= + +Note the sequences have been truncated for display purposes in the above tables. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,6 @@ +chr22 1000 NM_17 +chr22 2000 NM_18 +chr10 2200 NM_10 +chr10 hap test +chr10 1200 NM_11 +chr22 1600 NM_19 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,10 @@ +1 68 4.1 +2 71 4.6 +3 62 3.8 +4 75 4.4 +5 58 3.2 +6 60 3.1 +7 67 3.8 +8 68 4.1 +9 71 4.3 +10 69 3.7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/4.fasta Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,7 @@ +>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ +CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ +CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG +GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC +CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC +ATTGGTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/454.fasta Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,52 @@ +>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ +CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ +CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG +GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC +CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC +ATTGGTC +>EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ +GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATC +ATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +>EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ +TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAA +TACGTCAAAAACATAACTTCATGATATCTTGCAGT +>EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ +GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGC +CCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +>EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ +GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGAT +CATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +>EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ +CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGA +ACTTAGCCTTCCTTTNTTAACG +>EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ +TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +>EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ +CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA +GGGCGTGCTGATGAAGTTCAAAT +>EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ +AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAA +CGCGGTAAA +>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ +AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC +GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT +TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG +GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +>EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ +CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA +GGGCGTGCTGATGAAGTTCAAAT +>EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ +AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +>EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ +GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +>EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ +TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGAC +TTCC +>EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ +AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ +CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +>EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ +TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCG +CCCAGTTGCCCTGACTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/a.tab Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,15 @@ +CHR SNP BP A1 TEST NMISS BETA STAT P +1 rs1181876 3671541 T DOMDEV 958 -1.415 -3.326 0.0009161 +1 rs10492923 5092886 C ADD 1007 5.105 4.368 1.382e-05 +1 rs10492923 5092886 C DOMDEV 1007 -5.612 -4.249 2.35e-05 +1 rs10492923 5092886 C GENO_2DF 1007 NA 19.9 4.775e-05 +1 rs1801133 11778965 T ADD 1022 1.23 3.97 7.682e-05 +1 rs1801133 11778965 T GENO_2DF 1022 NA 16.07 0.0003233 +1 rs1361912 12663121 A ADD 1021 12.69 4.093 4.596e-05 +1 rs1361912 12663121 A DOMDEV 1021 -12.37 -3.945 8.533e-05 +1 rs1361912 12663121 A GENO_2DF 1021 NA 17.05 0.0001982 +1 rs1009806 19373138 G ADD 1021 -1.334 -3.756 0.0001826 +1 rs1009806 19373138 G GENO_2DF 1021 NA 19.36 6.244e-05 +1 rs873654 29550948 A DOMDEV 1012 1.526 3.6 0.0003339 +1 rs10489527 36800027 C ADD 1016 12.67 4.114 4.211e-05 +1 rs10489527 36800027 C DOMDEV 1016 -13.05 -4.02 6.249e-05
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_to_tabular_out1.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,18 @@ +EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC +EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT +EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG +EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA +EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC +EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_to_tabular_out2.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,2 @@ +EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_to_tabular_out3.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,18 @@ +EYKX4VC01B65GS CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +EYKX4VC01BNCSP CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC +EYKX4VC01CD9FT GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01B8FW0 TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT +EYKX4VC01BCGYW GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +EYKX4VC01AZXC6 GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01CATH5 CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG +EYKX4VC01BCEIV TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +EYKX4VC01BWERM CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01BT2O7 AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA +EYKX4VC01BO0UO AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +EYKX4VC01CBCPK CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01B474S AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +EYKX4VC01BB4QL GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +EYKX4VC01BJ37M TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC +EYKX4VC01BV9R8 AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01CEPP8 CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +EYKX4VC01BTLME TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_to_tabular_out4.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,18 @@ +EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC +EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT +EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG +EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA +EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC +EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_to_tabular_out5.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,18 @@ +EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC +EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT +EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG +EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA +EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC +EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_to_tabular_out6.tabular Mon May 19 11:00:01 2014 -0400 @@ -0,0 +1,18 @@ +EYKX4VC01B length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +EYKX4VC01B length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGGGGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGCCACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCCATTGGTC +EYKX4VC01C length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01B length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAATACGTCAAAAACATAACTTCATGATATCTTGCAGT +EYKX4VC01B length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGCCCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +EYKX4VC01A length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATCATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +EYKX4VC01C length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGAACTTAGCCTTCCTTTNTTAACG +EYKX4VC01B length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +EYKX4VC01B length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01B length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAACGCGGTAAA +EYKX4VC01B length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACCGCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGTTCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGGGCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +EYKX4VC01C length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCAGGGCGTGCTGATGAAGTTCAAAT +EYKX4VC01B length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +EYKX4VC01B length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +EYKX4VC01B length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC +EYKX4VC01B length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01C length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +EYKX4VC01B length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC