Mercurial > repos > imgteam > iscc_sum_verify
changeset 0:9fee6d81910d draft default tip
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
| author | imgteam |
|---|---|
| date | Fri, 19 Dec 2025 15:02:49 +0000 |
| parents | |
| children | |
| files | creators.xml iscc_similarity_parse_output.py iscc_verify.xml macros.xml test-data/sequence1.txt test-data/sequence2.txt test-data/test1.png test-data/test1_copy.png test-data/test1_iscc.txt test-data/test2.tiff test-data/test2_similar.tiff test-data/test3.fasta |
| diffstat | 12 files changed, 461 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/creators.xml Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,21 @@ +<macros> + + <xml name="creators/iscc"> + <organization name="ISCC Foundation" url="https://iscc.foundation"/> + <yield /> + </xml> + <xml name="creators/lco"> + <organization name="Leiden Cell Observatory, Leiden University" url="https://www.universiteitleiden.nl/en/research/research-facilities/science/cell-observatory"/> + <yield /> + </xml> + <xml name="creators/maartenpaul"> + <person givenName="Maarten" familyName="Paul"/> + <yield/> + </xml> + <xml name="creators/etzm"> + <person givenName="Martin" familyName="Etzrodt"/> + <yield/> + </xml> + + +</macros> \ No newline at end of file
#!/usr/bin/env python
# File: iscc_similarity_parse_output.py (new file in this changeset)
"""
Parse ISCC similarity output into tabular format with unique identifiers.

Input format (from iscc-sum --similar):
    ISCC:K4AOMG... *file1.txt
      ~08 ISCC:K4AOMG... *file2.txt
      ~10 ISCC:K4AOMG... *file3.txt
    ISCC:K4AGSPO... *file4.txt

A line starting with ``ISCC:`` is a reference file; the indented ``~NN``
lines below it are files similar to that reference, NN being the distance.

Output format (tabular with 7 columns, bidirectional):
    file_id  filename   iscc_code   match_id  match_filename  match_iscc_code  distance
    23       file1.txt  K4AOMG...   24        file2.txt       K4AOMG...        8
    24       file2.txt  K4AOMG...   23        file1.txt       K4AOMG...        8
    25       file4.txt  K4AGSPO...                                             -1

``file_id`` is the part of the input filename before its first underscore;
``filename`` is the element identifier looked up in the ID mapping file.
"""
import argparse
import sys

# Header of the tabular output; the order matches the rows written below.
OUTPUT_COLUMNS = (
    'file_id',
    'filename',
    'iscc_code',
    'match_id',
    'match_filename',
    'match_iscc_code',
    'distance',
)


def clean_filename(filename):
    """Remove the 'input_files/' directory prefix from filename, if present."""
    if filename.startswith('input_files/'):
        filename = filename[len('input_files/'):]

    return filename


def load_id_mapping(mapping_file):
    """Load filename to element_identifier mapping.

    Each usable line of ``mapping_file`` holds exactly two tab-separated
    fields (filename, element identifier); any other line is ignored.

    Returns: dict mapping cleaned filename -> element_identifier
    """
    mapping = {}
    with open(mapping_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                filename, element_id = parts
                # Clean the filename the same way as parse_iscc_line does,
                # so that both sides use identical lookup keys.
                mapping[clean_filename(filename)] = element_id
    return mapping


def parse_iscc_line(line):
    """Parse ISCC line and extract code and filename.

    Format: "ISCC:CODE *filename" (an optional prefix before 'ISCC:' is
    ignored).

    Returns: (code, filename). (None, None) is returned when the ' *'
    separator is missing; code is '' when the line has no 'ISCC:' marker.
    """
    code_part, separator, name_part = line.partition(' *')
    if not separator:
        return None, None

    filename = clean_filename(name_part.strip())

    # Everything after the first 'ISCC:' is the code.
    _, marker, code = code_part.strip().partition('ISCC:')
    code = code.strip() if marker else ''

    return code, filename


def _parse_similarity_output(similarity_file):
    """Read raw ``iscc-sum --similar`` output.

    Returns: (file_codes, matches) where file_codes maps filename -> code
    and matches is a list of (ref_file, ref_code, file, code, distance).
    """
    file_codes = {}
    matches = []
    current_ref = None
    current_code = None

    with open(similarity_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if not line:
                continue

            if line.startswith('ISCC:'):
                # Reference file: "ISCC:CODE *filename".
                # Forget the previous reference first, so that similar-lines
                # following an unparsable reference are not attributed to
                # the wrong file.
                current_ref = None
                current_code = None
                code, filename = parse_iscc_line(line)
                if code and filename:
                    current_ref = filename
                    current_code = code
                    file_codes[filename] = code

            elif line[0].isspace() and current_ref:
                # Similar file: "  ~NN ISCC:CODE *filename"
                parts = line.split(None, 1)  # distance token, rest of line
                if len(parts) != 2:
                    continue
                try:
                    distance = int(parts[0].replace('~', ''))
                except ValueError:
                    # Not a "~NN" distance token: skip the malformed line
                    # like any other unparsable line instead of crashing.
                    continue

                code, filename = parse_iscc_line(parts[1])
                if code and filename:
                    matches.append(
                        (current_ref, current_code, filename, code, distance)
                    )
                    file_codes[filename] = code

    return file_codes, matches


def _file_id(filename):
    """Return the part of filename before its first underscore."""
    return filename.split('_', 1)[0]


def _element_name(id_map, filename, reported_missing):
    """Return the element identifier for filename.

    Falls back to the filename itself (with a single warning on stderr per
    file) when the mapping has no entry, rather than raising KeyError.
    """
    try:
        return id_map[filename]
    except KeyError:
        if filename not in reported_missing:
            reported_missing.add(filename)
            print(
                f"WARNING: no element identifier found for '{filename}'; "
                "using the filename instead",
                file=sys.stderr,
            )
        return filename


def _write_table(output_file, file_codes, matches, id_map):
    """Write the 7-column table: matches in both directions, then unmatched files."""
    reported_missing = set()
    files_with_matches = set()

    def columns(filename, code):
        return [
            _file_id(filename),
            _element_name(id_map, filename, reported_missing),
            code,
        ]

    with open(output_file, 'w', encoding='utf-8') as out:
        out.write('\t'.join(OUTPUT_COLUMNS) + '\n')

        for file1, code1, file2, code2, distance in matches:
            first = columns(file1, code1)
            second = columns(file2, code2)
            # Write A -> B and B -> A so every file shows up in column 1.
            out.write('\t'.join(first + second + [str(distance)]) + '\n')
            out.write('\t'.join(second + first + [str(distance)]) + '\n')
            files_with_matches.update((file1, file2))

        # Files with no matches: empty match columns and distance -1.
        for filename in sorted(file_codes):
            if filename in files_with_matches:
                continue
            row = columns(filename, file_codes[filename]) + ['', '', '', '-1']
            out.write('\t'.join(row) + '\n')


def main():
    """Command-line entry point: convert raw similarity output to a table."""
    parser = argparse.ArgumentParser(
        description='Parse ISCC similarity output into tabular format'
    )
    parser.add_argument(
        'similarity_raw',
        help='Raw similarity output from iscc-sum --similar'
    )
    parser.add_argument(
        'id_mapping',
        help='TSV file mapping filenames to element identifiers'
    )
    parser.add_argument(
        'output_file',
        help='Tabular output file'
    )
    args = parser.parse_args()

    id_map = load_id_mapping(args.id_mapping)
    file_codes, matches = _parse_similarity_output(args.similarity_raw)
    _write_table(args.output_file, file_codes, matches, id_map)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/iscc_verify.xml Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,197 @@ +<tool id="iscc_sum_verify" name="Verify ISCC-CODE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.1"> + <description>with ISCC-SUM</description> + <macros> + <import>macros.xml</import> + <import>creators.xml</import> + </macros> + <expand macro="requirements" /> + <expand macro="version_command" /> + <creator> + <expand macro="creators/iscc" /> + <expand macro="creators/lco" /> + <expand macro="creators/maartenpaul" /> + <expand macro="creators/etzm" /> + </creator> + + <command detect_errors="exit_code"><![CDATA[ + ## Generate ISCC-CODE for input dataset + GENERATED=\$(iscc-sum '${input_file}' | cut -d':' -f2 | cut -d' ' -f1) && + + ## Get expected ISCC-CODE + EXPECTED='${expected_code}' && + + ## Validate expected ISCC-CODE length + if [ \${#EXPECTED} -ne 55 ]; then + echo "ERROR: Expected ISCC-CODE must be exactly 55 characters" >&2; + echo "Found: \${#EXPECTED} characters" >&2; + exit 1; + fi && + + ## Output verification report + if [ "\$GENERATED" = "\$EXPECTED" ]; then + echo "OK - ISCC-CODEs match" > '${output_file}'; + else + echo "FAILED - ISCC-CODEs do not match" > '${output_file}'; + fi && + echo "Expected: \$EXPECTED" >> '${output_file}' && + echo "Generated: \$GENERATED" >> '${output_file}' && + echo "" >> '${output_file}' + + ]]></command> + + <inputs> + <param name="input_file" type="data" format="data" label="Dataset to verify" + help="Verify this dataset's ISCC-CODE. 
When a collection is provided, each dataset is verified separately against the same expected ISCC-CODE."/> + <param name="expected_code" type="text" label="Expected ISCC-CODE" + help="The 55-character ISCC-CODE to verify against"> + <validator type="length" min="55" max="55" message="ISCC-CODE must be exactly 55 characters"/> + </param> + </inputs> + + <outputs> + <data name="output_file" format="txt" label="${tool.name} on ${on_string}"/> + </outputs> + + <tests> + <!-- Test 1: Successful verification --> + <test expect_num_outputs="1"> + <param name="input_file" value="test1.png"/> + <param name="expected_code" value="K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/> + <output name="output_file"> + <assert_contents> + <has_text text="OK - ISCC-CODEs match"/> + <has_text text="Expected: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/> + <has_text text="Generated: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/> + <has_n_lines n="4"/> + </assert_contents> + </output> + </test> + + <!-- Test 2: Failed verification --> + <test expect_num_outputs="1"> + <param name="input_file" value="test1.png"/> + <param name="expected_code" value="K4AGSPOSB5SS2X427WZ27QASTSBVTS55DXLMFDF7WOJKEOSTDEI3OXQ"/> + <output name="output_file"> + <assert_contents> + <has_text text="FAILED - ISCC-CODEs do not match"/> + <has_text text="Expected: K4AGSPOSB5SS2X427WZ27QASTSBVTS55DXLMFDF7WOJKEOSTDEI3OXQ"/> + <has_text text="Generated: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/> + <has_n_lines n="4"/> + </assert_contents> + </output> + </test> + + <!-- Test 3: FASTA file verification --> + <test expect_num_outputs="1"> + <param name="input_file" value="test3.fasta"/> + <param name="expected_code" value="K4AKF7PTZ7JTAAYZ7YZHZPR5RETKYXXE7RTBTJA4JX5GQQMSLZRC6QQ"/> + <output name="output_file"> + <assert_contents> + <has_text text="OK - ISCC-CODEs match"/> + <has_text text="Expected: K4AKF7PTZ7JTAAYZ7YZHZPR5RETKYXXE7RTBTJA4JX5GQQMSLZRC6QQ"/> + <has_text 
text="Generated: K4AKF7PTZ7JTAAYZ7YZHZPR5RETKYXXE7RTBTJA4JX5GQQMSLZRC6QQ"/> + <has_n_lines n="4"/> + </assert_contents> + </output> + </test> + </tests> + + <help><![CDATA[ +What it does +============ + +Verifies that a file (dataset) matches an expected ISCC-CODE (International Standard Content Code) for exact content verification. This tool uses ISCC-SUM, which generates an ISCC-CODE containing Data-Code and Instance-Code units for bit-level file comparison. + +Exit Codes +========== + +The verification result is written to the report; it is not signalled through the exit code: + +- **0**: Verification was carried out. The report states whether the ISCC-CODEs match (OK) or do not match (FAILED) +- **1**: The expected ISCC-CODE is not exactly 55 characters long, so no verification was performed + +Dataset Mapping +=============== + +When you provide a collection, Galaxy automatically runs verification once per dataset. All datasets are verified against the same expected ISCC-CODE. + +Output +====== + +A verification report containing: + +- Status: OK or FAILED +- Expected ISCC-CODE +- Generated ISCC-CODE + +The report itself does not contain the filename (or element identifier); the verified dataset is referenced in the name of the output dataset. + +Example output:: + + OK - ISCC-CODEs match + Expected: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY + Generated: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY + +Use Cases +========= + +- Verify file integrity after transfer or storage +- Confirm downloaded datasets match reference ISCC-CODEs +- Validate that backups are identical to originals +- Quality control for data archiving + +Workflow Examples +================= + +Verify a single dataset +----------------------- + +:: + + Input: document.pdf + Expected ISCC-CODE: K4AOMG... 
+ ↓ + [Verify ISCC-CODE] + ↓ + Output: "OK" or "FAILED" + +Verify against reference table +------------------------------- + +Generate reference ISCC-CODEs first:: + + Original files → [Generate ISCC-CODE] → Reference ISCC-CODEs + +Later verify:: + + New datasets → [Generate ISCC-CODE] → New ISCC-CODEs + ↓ + [Join two Datasets] on filename + ↓ + Compare ISCC-CODE columns + ↓ + Result: Which files match/differ + +Working with ISCC-CODE Files in Workflows +========================================== + +If you have the expected ISCC-CODE in a dataset (e.g., from Generate ISCC tool): + +1. In the workflow editor, connect the ISCC-CODE file to this tool +2. The ISCC-CODE file content will be used automatically +3. Or manually copy the ISCC-CODE from the file into the text field + +Important Notes +=============== + +- **Exact match only**: Any change to the file will cause verification to fail +- **Bit-level comparison**: Even metadata changes are detected +- **For similarity detection**: Use "Find similar ISCC-CODEs" tool instead + +More Information +================ + +For details about ISCC: https://sum.iscc.codes/ and https://iscc.codes/ +For ISCC structure and subtypes: https://ieps.iscc.codes/iep-0001/ + ]]></help> + <expand macro="citations" /> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,24 @@ +<macros> + <token name="@TOOL_VERSION@">0.1.0</token> + <token name="@VERSION_SUFFIX@">1</token> + <xml name="citations"> + <citations> + <citation type="bibtex"> + @misc{iscc_sum, + title={ISCC - Similarity Hash for Digital Media}, + url={https://iscc.codes/}, + note={Accessed: 2025} + } + </citation> + </citations> + </xml> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">iscc-sum</requirement> + </requirements> + </xml> + <xml name="version_command"> + <version_command>iscc-sum --version</version_command> + </xml> +</macros> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence1.txt Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,1 @@ +CTCACTTGTGGATACCCCGACCTATTTTGACGGGACCACTCGCGGTAGTCGTTGGGCTTATGCACCGTAAAGTCCTCCGCCGGCCTCCCCCCTACAAAAGATGATAAGCTCCGGCAAGCAATATTGAACAACGCAAGGATCGGCGATATAAACAGAGAAACGGCTGATTACTCTTGTTGGTGTGGTATCGCTAAACTGCGTCGCGGAGCCTTATGGCATAGTCGTCCGCGGAGCACTCTGGTAACGCTTATGGTCCATAGCACATTCATCGCATCCGGGCATGCGCTCTATTTGACGATCCCTTGGCGCAGAGATGCTGGCCACGAGCTAAATTAAAGCGACTGCACTACTGTAAGGTCCGTCACGCAGACGACGGCCCGGGGAGAGCACTAACCCATCAACCTGTACGGGAACTTTCTATATCGTTCTCGGACGGAGAGATAACTACAGTGCCGCTTACAGCCCCTCTGTCGTCGCCGACGTCTGTAATATAGCCTTGTTGTGATTCCACCCTATTGAGGCATTGACTGATGCGGAAGGAGATCTGGAATGAACTGGTCTATGTGACAGAAACTGTGCAAGTACCTAATCTCGTTAGTGTAGGTTCTGACCGATACGTGCTTCGTTGAGAACTCACAATTTTACAACTGGGGACATAAGCCCTACGCCCATCATCTACTGACGTCCCTGAGGCTGCAGTTCATGTAATGGGACAGTATCCGCCGCAAGTTCTAGTGCAATGGCGGTATAGTACGCTCGTACTGTAGTAGAGGCGACACGGGTGGGATCATCACTAATAAGGATACTGGGAAGACTCACAGGCCTCCGCCTATAGGCGGTGCTTACTCTTACATAAAGCGGCTGTTAGTATTACCCCGCGAGGATTCGAAAAGGTGAACCAACCCGGTCGATCCGGAGGGACGGGCCTCAAAGCCGCGTGACGACGGCTGTCGGCCCGTAACAGAATCCCCGCAATAAGCTCCCGTGAGCCTCGATTGAACAGCCCTGGTGGGCCCCATCAGTAGCCCGAATATGTCGCTTTTCGGGTCCTGGGCCGAGGAGCGATACCTTCCAGTAATCGAGGCCGTTCGTTAATTCTTGTTGCGTTCCTAGCGCCTATATTTGTCTCTTTGCCGGCTTATGTGGACAAGCATAGCATAGCCATTTATCGGAGCGCCTCGGTACACGGTATGACCAGACGCCTCGTGAGACCATTACGTATACCAGGTGTCCTGTGAGCAGCGAAGGCCCATACGCGAGATACACTGCCAGAAATCCGCGTGATTACGAGTCGTGGTAAATTTAATCTGGCTGTGGTCTAGACATTCCAGGCGGTGCGTCTGCTGTCGGGTGCCTCTGGTGACTGGCTAGATGGACTTGCCGCTGGTAAACACACCATGACCCCGCCTCTCCATTGATGCCACGGCGAATGTCGGGGAGACAGCAGCGACTGCAGACATCAGATCAGAGTAATACTAACATGCGATAAGTCCCTAACTGACTATGGCCTTCTGTAGAGTCTACTTCACCAGATATGCTGTCTCTGGCACGTGGATGGTTTAGAGGAATCACATTCAAGTCTGGTTAACCATGAAACAAGTCTTGAGTGTAAAATTGTCGTCTCCTGTGTACGAGATGGAGGTACTAGATGACTGCAGGGACTCCGACGTTATGTACGTTGCTCCGTCAAAGGCGCCATTCAGGATCACGTTACCGCCAAAAAATGGGAGCAGGAGCTCTTCTCCCCTGCGGTCACGTCTATAGAAATTACACCTTTAACCCTCCTGAGAACCGGGAGGCGGGAATCCGTCACGTATGAGAAGGTATTTGCCCGACAATCAATACCGGACGCTCCTAAGTTTTTCCACTCGCTTGAG
CCGGCTAGGCCTCTCTGCCCGAAGTTTCGACGGACTGCTGCCAACGCCCAGGCATAGTTTTAGGAGGATTATTCGGGGGCACTGGCAACCAACTTCTCGGGTCCTGCCCGACTGGTCTTCGGGCTAATATAGCGAATTGCCGAGAACCCGGCCCCACGCAATGGAACGTCCTTAGCTCCGGCAGGCAATTAAGGGGAACGTAAGTATAGCGCAAAAAAACAGAGAAATAGGCGAATGAATCTATTCTTACTGTATCGAAGAATGGCCTCGCGGAGGCATGTGTCATGCTAGCGTGCGGGGTACTCTAGTTATCCATATGGTCCACAGGACACTCGTTGCTTTCGGATTTGCCCTTTATGCGCCGGTTTTCAGCCACGCTTATGCTCAGCATCGTTATAACCAGACCGATACTAGATCTATAAAGTCCGCCATGCAGACGAGACCAGACGGAGATTACCGAGCAATCTATCAGATCGGCGACCATTAGTGAGCTACTGGAGCCGAGGGGTAACTACGATGCCGCTAAGAACCTCTCGGTCGTCGCTAGCGATTACACTCCAGTCTCATTATAATCGTTCGCTATTCAGGGATTGACCAACACCGGAAAACATTTCACTTGAAGTATTGTATACGACAGAGTCCGTGCACCTACCAAACCTGTTTAATCTAAGTTCAGACTAGTTGGAAGTATGTCTAGATCTCAGATTTCGTCACTAGAGGGCCCACGCTCTATTTTTATGATCCATTGATCTCCCTGACGCTGCAAGATTTGCAACCAGGCAGACTTGGCGGTAGGTCCTAGTGCAGCGGGGCTTTTTTTCTATAGTCCTTGAGAGGAGGAGACGTCAGTCCAGATATCTTTGATGTCCTGATTGGAAGGACCGTTGGCCCCCCACCCTTAGGCAGTGTATACTCTTCCATAAACGAGCTATTAGTTATGAGGTCCGTAGATTGAAAAGGGTGACGGAATTCGGCCGAACGGGAAAGACGGACATCTAGGTATCCTGAGCACGGTTGCGCGTCCGTATCAAGCTCCTCTTTATAGGCCCCGGTTACTGTTGGTCGTAGAGCCCAGAACGGGTTGGGCAGATGTACGACAATATCGCTTAGTCACCCTTGGGCCACGGTCCGCTACCTTACAGGAATTGAGACCGTCCATTAATTTCCCTTGCATATATATTGCGTTTCTTCGACCTTTTAACCGCTCTCTTAGAAGAGAGACAGATAGCTTCTTACCCGTGCCCCACCGTTGGCAGTACGATCGCACGCCCCACGTGAACGATTGGTAAACCCTGTGGCCTGTGAGCGACAAAAGCTTTAATGGGAAATACGCGCCCATAACTTGGTGCGAATACGGGTCGTAGCAATGTTCGTCTGAGTATGATCTATATAATACGGGCGGTACGTCTGCTTTGGTCAGCCTCTAATGGCTCGTATGATAGTGCAGCCGCTGGTGATCACTCAATGATCTCGGCTCCCCGTTGCAACTACGGGGATTCTTGGAGAGCCAGCTGCGTTCGGTATTGTGAGGACAGTGTAGTATTAGCAAACGATAAGTCCCGAACTAGTTGTGACCTAACGAAAAGAGAATTTCATAATACGTGCTGTCCCACGCGCATGGTACATTTGGACAATATTGAATGGAGTCTGATCAACCTTCACACCGATCTAGAATCGAATGCAAAGATCACCCAGGTGCAAATCAAAAATTCTAGGTAACTAGAAGATTTGCGACGTTCTAAGTGTTGGACGATATGAATCGCGACCCAGGATGACGTCGCCCTGAAAAAAAGATTTCTGCAACTCTCCTCGTCAGCAGTCTGGTGTATCGAAAGTACAGGACTAGCCTTCCTAGCAACCGCGGGCTGGGAATCTGAGACATGAGTCAAGATATTTGCTCGGTAACGTATGCTCTAGGCATCTAACTATTCCCTGTGTCTTATAGGGGCCTGCGTTATCTGCCTGTCGAACCATAGGATTCGTGTCAGCGCGCAGGCTTG
GATCGAGATGAAATCTCCGGGGCCTAAGACTACGAGCATCTGGCGTCTTGGCTAACCCCCCTACATGTTGTTATAAACAATCAGTGGAAACCCAGTGCTAGAGGATGGAATGACCTTAAATCAGGGACGATATTAAACGGAACGTATATTCAACGCAATGAAGCCGGAGGATTGGCGTGGGAATCGTGCTTCTGTCTAAGCAAGTAAGGGTATGAGGTCGCAACCGTCCCCCAAGCGTACAGGGTGCACTTTGTAACGATTTGGGAGTCCAGAGACTCGCTGTTTTCGAAATTTGCCCTCAAGCGCGAGTATTGAACCAGGCTTACGCCCAAGAACGTAGCAAGCTGACTCAAACAAAATACATTTTGCCCGCGTTACATATGAATCAAGTTGGAAGTTATGGAGCATAGTAACATGTGGACGGCCAGTGGTGGGTTGCTACACCCCTGCGGCAACGTTGAAGCTCCTGGATTACACTGGCTGGATCTAAGCCGTGACACCCGTCATACTCCATAACCGTCTGTAACTCACGGCTTGTTCTGGACTGGATTGCCATTCTCTCAGAGTATTATGCAGGCCGGCGTACGGGTCCCATATAAACCTGTCATAGCTTACCTGACTCTACTTGGAAATGTGGCTAGGTCTTTGCCCACGCACCTAATCGGTCCTCGTTTGCTTTTTAGGACCCGATGAACTACAGAACACTGCAAGAATCTCTACCTGCTTTACAAAGTGCTGGATCCTATTCCAGCGGGATCTTTTATCTAAACACGATGAGAGGAGTATTCGTCAGGCCACATAGCTTTCTTGTTCTGATCGGAACGATCGTTGGCGCCCGACCCCCCGATTCCATAGTGAGTTCTTCGTCCGAGCCATTGTATGCGAGATCGATAGACTGATAGGGGATGCAGTATATCCCTGGATACAATAGACGCACAGGTTGGAATCCTAAGTGAAGTCGCGCGTCCGAACCCAGCTCTATTTTAGAGGTCATGGGTTCTGGTGCCCGCGAGCCGCGGAACCGATTAGGGGCATGTACAACAATATTTATTAGTCATCTTTCAGACACAATCTCCCAGCTCACTGGTATATAGTTCCTGCTATAATTAGCCTCCCTCATAAGTTGCACTACTTCAGCGTCCCAAATGCACCCTTACCACGAAGACAGGATTGTCCGATCCCATATTACGACCTTGGCAGGGGGTTCGCAAGTCCCACCCCAAACGATGCTGAAGGCTCAGGTTTCACAGGGACAAAAGCTTTAAACGCGAGTTCCCGCTCATAACCTGGACCGAATGCAGAATCATGCATCGTTCCACTGTGTTCGTGTCATCTAGGACGGGCGCAAAGGATATATAATTCAATTTTGAATACCTTATATTATTGTACACCTACCGGTCACCAGCCAACAATGTGCGGATGGCGTTACAACTTTCTGGGCCTAATCTGACCGTTCTAGATACCGCACTCTGGGCAATACGAGGTAAAGCCAGTCACCCAGTGTCGATCAACACCTAACCTAACGGTAAGAGGCTCACATAATGGCACTGTCGGCGTCCCCAGGGTATTTTACGTTAGCATCAGGTGGACTAACATGAATCTTTACTCCCAAGCGAAAACGGGTGCGTGGACTAGCGAGGAGCAAACGAAAATTCTTGGCCTGCTTGGTGTCTCGTATTCCTCTTAGAGATCGACGAAATGTTTCACGACCAAGGGAAAGGTCGCCCTACAAAATAGATTTGCGTTACTCTCTCCATAAGGAGTCCGGTGTAGCGAAGGATCAAGGCGACCCTAGGTAGCAACCGCCGGCTTCGGCGGTAAGGTATCACTCAAGAAGCAGACACAGTAAGACACGGTCTAGCTGACTGTCTATCGGCTAGGTCAAATAGAGAGCTTTGATATCTGCATGTCTAGCTTTAGAATTCAGTTTAGCGCGCTGATCTGAGTCGAGATAAAATCACCAGTACCCAAGACCAGGGGGGCTCGCCACGTTGGCTAAT
CCTGGTACATCTTGTAATCAATATTCAGTAGAAAATTTGTGTTAGAAGGACGAGTCACCATGTACCAATAGCGATAACGATCGGTCGGACTATTCATTGTGGTGATGACGCTGGGTTTACGTGGGAAAGGTGCTTGTGTCCCGACAGGCTAGGATATAATGCTGAGGCCCTTCCCCAAGCGTTCAGCGTGGGATTTGCTACAACTTCCGAGTCCTACATGTGCGTGTTCATGTTATGTATGCACAAGGCCGAGAATAGGACGTAGCCTTCGAGTTAGTACGTAGCGTGGTCGCACAAGCACAGTAGATCCTCCCCGCGCATCCTATTTATTAAGTTAATTCTAAAGCAATACGATCACATGTGGATGGGCAGTGGCCGGTTGTTACACGCCTACCGCGGTGCTGAATGACCGGGACTAAAGAGGCGAAGATTATGGCGTGTGACCCGTTATGCTCGAGTTCGGTCAGTGCGTCATTGCAAGTAGTCGATTGCTTTCTCAATCTCCGAGCGATTTAGCGTGACAGCCCCAGGGAACCCATAAAATGTGATCGCAGTCCATCCGATCGTACATAGAAAGGAAGGTCCCCATACGCCCACGCACCTGTTTACTCGTCGTATGCATAAAAGAGCCGCACGAACCACAGAGCATAAAGAGGACCTCTAGTTCCTTTACAAAGTACAGGTTCGCTTTTCGGCGAGATGCCTTACCTAGATGCAATGACGGACGTATTCCTCTGGCCACATCGGTTCCTGCTTTCGCTGGGATCCAAGATTGGCAGCTGAAGCCGCCTTTCCATAGTGAGTCCTTCGTCTGTGACTAACTGTGCCAAATCGTCTAGCAAACTGCTGATCCAGTTTAACTCACCAAATTATAGCCGTACAGACCGAAATCTTAAGTCATATCACGCGACTAGCCTCTGCTTAATTTTTGTGCTCAAGGGTTTTGGTCCGCCCGAGCGGTGCAGCCGATTAGGACCATGTAATACATTTGTTACAAGACTTCTTTTAAACACTTTCTTCCTGCCCAGTAGCGGATGATAATCGTTGTTGCCAGCCGGCGTGGAAGGTAACAGCACCGGTGCGAGCCTAATGTGCCGTCTCCACGAACACAAGGCTGTCCGATCGTATAATAGGATTCCGCAATGGGGTTAGCAAGTGGCAGCCTAAACGATATCGGGGACTTGCGATGTACATGCTTTGGTACAATACATACGTGATCCAGTTGTTATCCTGCATCGGAACATCAATTGTGCATCGGACCAGCATATTCATGTCATCTAGGAGGCGCGCGTAGGATAAATAATTCAATTAAGATGTCGTTTTGCTAGTATACGTCTAGGCGTCACCCGCCATCTGTGTGCAGGTGGGCCGACGAGACACTGTCCCTGATTTCTCCGCTTCTAATAGCACACACGGGGCAATACCAGCACAAGCCAGTCTCGCAGCAACGCTCGTCAGCAAACGAAAGAGCTTAAGGCTCGCCAATTCGCACTGTCAGGGTCGCTTGGGTGTTTTGCACTAGCGTCAGGTACGCTAGTATGCGTTCTTCCTTCCAGGGGTATGTGGCTGCGTGGTCAAATGTGCGGCATACGTATTTGCTCGACGTGTTTGCTCTCACGAACTTGACCTGGAGATCAAGGAGATGTTTCTTGTCGAACTGGACAGCGCTTCAACGGAACGGATCTACGTTACAGCCTGCATAATGAAAACGGAGTTGCCGACGACGAAAGCGACTTTGGGTTCTGTCTGTTGTCATTGGCGGAAAACTTCCGTTCAGGAGGCGGACACTGATTGACACGGTTTAGCAGAAGGTTTGAGGAATAGGTTAAATTGAGTGGTTTAATAACGGTATGTCTGGGATTAAAGTGTAGTATAGTGTGATTATCGGAGACGGTTTTAAGACACGAGTTCCCAAAATCAAGCGGGGTCATTACAACGGTTATTCCTGGTAGTTTAGGTGTACAATGTCCTGAAGAATATTTAAGAAAAAAGCACCCCTC
ATCGCCTAGAATTACCTACTACGGTCGACCATACCTTCGATTATCGCGGCCACTCTCGCATTAGTCGGCAGAGGTGGTTGTGTTGCGATAGCCCAGTATAATATTCTAAGGCGTTACCCTGATGAATATCCAACGGAATTGCTATAGGCCTTGAACGCTACACGGACGATACGAAATTATGTATGGACCGGGTCATCAAAAGGTTATACCCTTGTAGTTAACATGTAGCCCGGCCCTATTAGTACAGTAGTGCCTTGAATGGCATTCTCTTTATTAAGTTTTCTCTACAGCTAAACGATCAAGTGCACTTCCACAGAGCGCGGTGGAGATTCATTCACTCGGCAGCTCTGTAATAGGGACTAAAAAAGTGATGATAATCATGAGTGCCGCGTTATGGTGGTGTCGGAACAGAGCGGTCTTACGGCCAGTCGTATGCCTTCTCGAGTTCCGTCCAGTTAAGCGTGACAGTCCCAGTGTACCCACAAACCGTGATGGCTGTGCTTGGAGTCAATCGCAAGTAGGATGGTCTCCAGACACCGGGGCACCAGTTTTCACGCCGAAAGCATAAACGACGAGCAGATATGAAAGTGTTAGAACTGGACGTGCCGTTTCTCTGCGAAGAACACCTCGAGCTGTAGCGTTGTTGCGCTGCCTAGATGCAGTGTTGCTCATATCACATTTGCTTCAACGACTGCCGCCTTCGCTGTATCCCTAGACACTCAACAGTAAGCGCTTTTTGTAGGCAGGGGCACCCCCTATCAGTGACTGCGCCAAAACATCTTCGGATCCCCTTGTCCAATCAAACTCATCGAATTCTTACATTTAAGACCCTAATATCACATCATTAGTGATTAATTGCCACTGCCAAAATTCTGTCCAGAAGCGTTTTAGTTCGCTCCACTAAAGTTGTTTAAAACGACTACCAAATCCGCATGTTAGGGGATTTCTTATTAATTCTTTTATCGTGAGGAACAGCGGATCTTAATGGATGGCCGCAGGTGGTATGGAAGCTAATAGCGCGGGTGAGAGGGTAATCAGCCGTGTTCACCTACACAACGCTAACGGGCGATTCTATAAGATTCCGCATTGCGTCTACTTATAAGATGTCTCAACGGTATCCGCAACTTGTGAAGTGCCTACTATCCTTAAACGCATATCTCGCCCAGTAGCTTCCCAATATGTGAGCATCAATTGTTGTCCGGGCCGAGATAGTCATGTGCTCACGGAACTTACTGTATGAGTAGTGATTTGAAAGAGTTGTCAGTTTGCTGGTTCAGGTAAAGGTTCCTCACGCTACCTCAAAGTAAGAGAGCGGTCGTGACATTATCCGTGATTTTCTCACTACTATTAGTACTCACGACTCGATTCTGCCGCAGCCATGTTTCGCCAGAATGCCAGTCAGCATTAAGGAGAGCTCAGGGCAGGTCAACTCGCATAGTGAGGGTTACATGTTCGTTGGGCTCTTCCGACACGAACCTCAGTTAGCCTACATCCTACCAGAGGTCTGTGCCCCGGTGGTGAGAAGTGCGGATTTCGTATTTGCAGCTCGTCAGTACTTTCAGAATCATGGCCTGCACGGCAAAATGACGCTTATAATGGACTTCGACATGGCAATAACGCCTCGTTTCTACGTCAGGAGGAGAATAGTATAAACATAACTGCTGTCGGCAGAAGCGCCAAAGGAGTCTCTGAATTCTTATTCCCGAATAACATCCGTCTCCGTGCGGGAAAATCACCGACGGCGTTTTATAGAAGCCTAGGGGAACAGATTGGTCTAATTAGCTTAAGAGAGTAAATTCTGGGATCATTCAGTAGTAATCACAAATTTACGGTGGGGCTTTTTTGGCGGATCTTTACAGATACTAACCAGGTGATTTCAACTAATTTAGTTGACGATTTAGGCGCGCTATCCCGTAATCTTCAAATTAAAACATAGCGTTCCATGAGGGCTAGAATTACTTACCGGCCTTCACCATGCCTGCGTTATTCGCGCCCACTCT
CCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACGTGTAACGTAGCTAAGTATTCTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence2.txt Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,1 @@ +CTCACTTGTGGATACCCCGACCTATTTTGACGGGACCACTCGCGGTAGTCGTTGGGCTTATGCACCGTAAAGTCCTCCGCCGGCCTCCCCCCTACAAAAGATGATAAGCTCCGGCAAGCAATATTGAACAACGCAAGGATCGGCGATATAAACAGAGAAACGGCTGATTACTCTTGTTGGTGTGGTATCGCTAAACTGCGTCGCGGAGCCTTATGGCATAGTCGTCCGCGGAGCACTCTGGTAACGCTTATGGTCCATAGCACATTCATCGCATCCGGGCATGCGCTCTATTTGACGATCCCTTGGCGCAGAGATGCTGGCCACGAGCTAAATTAAAGCGACTGCACTACTGTAAGGTCCGTCACGCAGACGACGGCCCGGGGAGAGCACTAACCCATCAACCTGTACGGGAACTTTCTATATCGTTCTCGGACGGAGAGATAACTACAGTGCCGCTTACAGCCCCTCTGTCGTCGCCGACGTCTGTAATATAGCCTTGTTGTGATTCCACCCTATTGAGGCATTGACTGATGCGGAAGGAGATCTGGAATGAACTGGTCTATGTGACAGAAACTGTGCAAGTACCTAATCTCGTTAGTGTAGGTTCTGACCGATACGTGCTTCGTTGAGAACTCACAATTTTACAACTGGGGACATAAGCCCTACGCCCATCATCTACTGACGTCCCTGAGGCTGCAGTTCATGTAATGGGACAGTATCCGCCGCAAGTTCTAGTGCAATGGCGGTATAGTACGCTCGTACTGTAGTAGAGGCGACACGGGTGGGATCATCACTAATAAGGATACTGGGAAGACTCACAGGCCTCCGCCTATAGGCGGTGCTTACTCTTACATAAAGCGGCTGTTAGTATTACCCCGCGAGGATTCGAAAAGGTGAACCAACCCGGTCGATCCGGAGGGACGGGCCTCAAAGCCGCGTGACGACGGCTGTCGGCCCGTAACAGAATCCCCGCAATAAGCTCCCGTGAGCCTCGATTGAACAGCCCTGGTGGGCCCCATCAGTAGCCCGAATATGTCGCTTTTCGGGTCCTGGGCCGAGGAGCGATACCTTCCAGTAATCGAGGCCGTTCGTTAATTCTTGTTGCGTTCCTAGCGCCTATATTTGTCTCTTTGCCGGCTTATGTGGACAAGCATAGCATAGCCATTTATCGGAGCGCCTCGGTACACGGTATGACCAGACGCCTCGTGAGACCATTACGTATACCAGGTGTCCTGTGAGCAGCGAAGGCCCATACGCGAGATACACTGCCAGAAATCCGCGTGATTACGAGTCGTGGTAAATTTAATCTGGCTGTGGTCTAGACATTCCAGGCGGTGCGTCTGCTGTCGGGTGCCTCTGGTGACTGGCTAGATGGACTTGCCGCTGGTAAACACACCATGACCCCGCCTCTCCATTGATGCCACGGCGAATGTCGGGGAGACAGCAGCGACTGCAGACATCAGATCAGAGTAATACTAACATGCGATAAGTCCCTAACTGACTATGGCCTTCTGTAGAGTCTACTTCACCAGATATGCTGTCTCTGGCACGTGGATGGTTTAGAGGAATCACATTCAAGTCTGGTTAACCATGAAACAAGTCTTGAGTGTAAAATTGTCGTCTCCTGTGTACGAGATGGAGGTACTAGATGACTGCAGGGACTCCGACGTTATGTACGTTGCTCCGTCAAAGGCGCCATTCAGGATCACGTTACCGCCAAAAAATGGGAGCAGGAGCTCTTCTCCCCTGCGGTCACGTCTATAGAAATTACACCTTTAACCCTCCTGAGAACCGGGAGGCGGGAATCCGTCACGTATGAGAAGGTATTTGCCCGACAATCAATACCGGACGCTCCTAAGTTTTTCCACTCGCTTGAG
CCGGCTAGGCCTCTCTGCCCGAAGTTTCGACGGACTGCTGCCAACGCCCAGGCATAGTTTTAGGAGGATTATTCGGGGGCACTGGCAACCAACTTCTCGGGTCCTGCCCGACTGGTCTTCGGGCTAATATAGCGAATTGCCGAGAACCCGGCCCCACGCAATGGAACGTCCTTAGCTCCGGCAGGCAATTAAGGGGAACGTAAGTATAGCGCAAAAAAACAGAGAAATAGGCGAATGAATCTATTCTTACTGTATCGAAGAATGGCCTCGCGGAGGCATGTGTCATGCTAGCGTGCGGGGTACTCTAGTTATCCATATGGTCCACAGGACACTCGTTGCTTTCGGATTTGCCCTTTATGCGCCGGTTTTCAGCCACGCTTATGCTCAGCATCGTTATAACCAGACCGATACTAGATCTATAAAGTCCGCCATGCAGACGAGACCAGACGGAGATTACCGAGCAATCTATCAGATCGGCGACCATTAGTGAGCTACTGGAGCCGAGGGGTAACTACGATGCCGCTAAGAACCTCTCGGTCGTCGCTAGCGATTACACTCCAGTCTCATTATAATCGTTCGCTATTCAGGGATTGACCAACACCGGAAAACATTTCACTTGAAGTATTGTATACGACAGAGTCCGTGCACCTACCAAACCTGTTTAATCTAAGTTCAGACTAGTTGGAAGTATGTCTAGATCTCAGATTTCGTCACTAGAGGGCCCACGCTCTATTTTTATGATCCATTGATCTCCCTGACGCTGCAAGATTTGCAACCAGGCAGACTTGGCGGTAGGTCCTAGTGCAGCGGGGCTTTTTTTCTATAGTCCTTGAGAGGAGGAGACGTCAGTCCAGATATCTTTGATGTCCTGATTGGAAGGACCGTTGGCCCCCCACCCTTAGGCAGTGTATACTCTTCCATAAACGAGCTATTAGTTATGAGGTCCGTAGATTGAAAAGGGTGACGGAATTCGGCCGAACGGGAAAGACGGACATCTAGGTATCCTGAGCACGGTTGCGCGTCCGTATCAAGCTCCTCTTTATAGGCCCCGGTTACTGTTGGTCGTAGAGCCCAGAACGGGTTGGGCAGATGTACGACAATATCGCTTAGTCACCCTTGGGCCACGGTCCGCTACCTTACAGGAATTGAGACCGTCCATTAATTTCCCTTGCATATATATTGCGTTTCTTCGACCTTTTAACCGCTCTCTTAGAAGAGAGACAGATAGCTTCTTACCCGTGCCCCACCGTTGGCAGTACGATCGCACGCCCCACGTGAACGATTGGTAAACCCTGTGGCCTGTGAGCGACAAAAGCTTTAATGGGAAATACGCGCCCATAACTTGGTGCGAATACGGGTCGTAGCAATGTTCGTCTGAGTATGATCTATATAATACGGGCGGTACGTCTGCTTTGGTCAGCCTCTAATGGCTCGTATGATAGTGCAGCCGCTGGTGATCACTCAATGATCTCGGCTCCCCGTTGCAACTACGGGGATTCTTGGAGAGCCAGCTGCGTTCGGTATTGTGAGGACAGTGTAGTATTAGCAAACGATAAGTCCCGAACTAGTTGTGACCTAACGAAAAGAGAATTTCATAATACGTGCTGTCCCACGCGCATGGTACATTTGGACAATATTGAATGGAGTCTGATCAACCTTCACACCGATCTAGAATCGAATGCAAAGATCACCCAGGTGCAAATCAAAAATTCTAGGTAACTAGAAGATTTGCGACGTTCTAAGTGTTGGACGATATGAATCGCGACCCAGGATGACGTCGCCCTGAAAAAAAGATTTCTGCAACTCTCCTCGTCAGCAGTCTGGTGTATCGAAAGTACAGGACTAGCCTTCCTAGCAACCGCGGGCTGGGAATCTGAGACATGAGTCAAGATATTTGCTCGGTAACGTATGCTCTAGGCATCTAACTATTCCCTGTGTCTTATAGGGGCCTGCGTTATCTGCCTGTCGAACCATAGGATTCGTGTCAGCGCGCAGGCTTG
GATCGAGATGAAATCTCCGGGGCCTAAGACTACGAGCATCTGGCGTCTTGGCTAACCCCCCTACATGTTGTTATAAACAATCAGTGGAAACCCAGTGCTAGAGGATGGAATGACCTTAAATCAGGGACGATATTAAACGGAACGTATATTCAACGCAATGAAGCCGGAGGATTGGCGTGGGAATCGTGCTTCTGTCTAAGCAAGTAAGGGTATGAGGTCGCAACCGTCCCCCAAGCGTACAGGGTGCACTTTGTAACGATTTGGGAGTCCAGAGACTCGCTGTTTTCGAAATTTGCCCTCAAGCGCGAGTATTGAACCAGGCTTACGCCCAAGAACGTAGCAAGCTGACTCAAACAAAATACATTTTGCCCGCGTTACATATGAATCAAGTTGGAAGTTATGGAGCATAGTAACATGTGGACGGCCAGTGGTGGGTTGCTACACCCCTGCGGCAACGTTGAAGCTCCTGGATTACACTGGCTGGATCTAAGCCGTGACACCCGTCATACTCCATAACCGTCTGTAACTCACGGCTTGTTCTGGACTGGATTGCCATTCTCTCAGAGTATTATGCAGGCCGGCGTACGGGTCCCATATAAACCTGTCATAGCTTACCTGACTCTACTTGGAAATGTGGCTAGGTCTTTGCCCACGCACCTAATCGGTCCTCGTTTGCTTTTTAGGACCCGATGAACTACAGAACACTGCAAGAATCTCTACCTGCTTTACAAAGTGCTGGATCCTATTCCAGCGGGATCTTTTATCTAAACACGATGAGAGGAGTATTCGTCAGGCCACATAGCTTTCTTGTTCTGATCGGAACGATCGTTGGCGCCCGACCCCCCGATTCCATAGTGAGTTCTTCGTCCGAGCCATTGTATGCGAGATCGATAGACTGATAGGGGATGCAGTATATCCCTGGATACAATAGACGCACAGGTTGGAATCCTAAGTGAAGTCGCGCGTCCGAACCCAGCTCTATTTTAGAGGTCATGGGTTCTGGTGCCCGCGAGCCGCGGAACCGATTAGGGGCATGTACAACAATATTTATTAGTCATCTTTCAGACACAATCTCCCAGCTCACTGGTATATAGTTCCTGCTATAATTAGCCTCCCTCATAAGTTGCACTACTTCAGCGTCCCAAATGCACCCTTACCACGAAGACAGGATTGTCCGATCCCATATTACGACCTTGGCAGGGGGTTCGCAAGTCCCACCCCAAACGATGCTGAAGGCTCAGGTTTCACAGGGACAAAAGCTTTAAACGCGAGTTCCCGCTCATAACCTGGACCGAATGCAGAATCATGCATCGTTCCACTGTGTTCGTGTCATCTAGGACGGGCGCAAAGGATATATAATTCAATTTTGAATACCTTATATTATTGTACACCTACCGGTCACCAGCCAACAATGTGCGGATGGCGTTACAACTTTCTGGGCCTAATCTGACCGTTCTAGATACCGCACTCTGGGCAATACGAGGTAAAGCCAGTCACCCAGTGTCGATCAACACCTAACCTAACGGTAAGAGGCTCACATAATGGCACTGTCGGCGTCCCCAGGGTATTTTACGTTAGCATCAGGTGGACTAACATGAATCTTTACTCCCAAGCGAAAACGGGTGCGTGGACTAGCGAGGAGCAAACGAAAATTCTTGGCCTGCTTGGTGTCTCGTATTCCTCTTAGAGATCGACGAAATGTTTCACGACCAAGGGAAAGGTCGCCCTACAAAATAGATTTGCGTTACTCTCTCCATAAGGAGTCCGGTGTAGCGAAGGATCAAGGCGACCCTAGGTAGCAACCGCCGGCTTCGGCGGTAAGGTATCACTCAAGAAGCAGACACAGTAAGACACGGTCTAGCTGACTGTCTATCGGCTAGGTCAAATAGAGAGCTTTGATATCTGCATGTCTAGCTTTAGAATTCAGTTTAGCGCGCTGATCTGAGTCGAGATAAAATCACCAGTACCCAAGACCAGGGGGGCTCGCCACGTTGGCTAAT
CCTGGTACATCTTGTAATCAATATTCAGTAGAAAATTTGTGTTAGAAGGACGAGTCACCATGTACCAATAGCGATAACGATCGGTCGGACTATTCATTGTGGTGATGACGCTGGGTTTACGTGGGAAAGGTGCTTGTGTCCCGACAGGCTAGGATATAATGCTGAGGCCCTTCCCCAAGCGTTCAGCGTGGGATTTGCTACAACTTCCGAGTCCTACATGTGCGTGTTCATGTTATGTATGCACAAGGCCGAGAATAGGACGTAGCCTTCGAGTTAGTACGTAGCGTGGTCGCACAAGCACAGTAGATCCTCCCCGCGCATCCTATTTATTAAGTTAATTCTAAAGCAATACGATCACATGTGGATGGGCAGTGGCCGGTTGTTACACGCCTACCGCGGTGCTGAATGACCGGGACTAAAGAGGCGAAGATTATGGCGTGTGACCCGTTATGCTCGAGTTCGGTCAGTGCGTCATTGCAAGTAGTCGATTGCTTTCTCAATCTCCGAGCGATTTAGCGTGACAGCCCCAGGGAACCCATAAAATGTGATCGCAGTCCATCCGATCGTACATAGAAAGGAAGGTCCCCATACGCCCACGCACCTGTTTACTCGTCGTATGCATAAAAGAGCCGCACGAACCACAGAGCATAAAGAGGACCTCTAGTTCCTTTACAAAGTACAGGTTCGCTTTTCGGCGAGATGCCTTACCTAGATGCAATGACGGACGTATTCCTCTGGCCACATCGGTTCCTGCTTTCGCTGGGATCCAAGATTGGCAGCTGAAGCCGCCTTTCCATAGTGAGTCCTTCGTCTGTGACTAACTGTGCCAAATCGTCTAGCAAACTGCTGATCCAGTTTAACTCACCAAATTATAGCCGTACAGACCGAAATCTTAAGTCATATCACGCGACTAGCCTCTGCTTAATTTTTGTGCTCAAGGGTTTTGGTCCGCCCGAGCGGTGCAGCCGATTAGGACCATGTAATACATTTGTTACAAGACTTCTTTTAAACACTTTCTTCCTGCCCAGTAGCGGATGATAATCGTTGTTGCCAGCCGGCGTGGAAGGTAACAGCACCGGTGCGAGCCTAATGTGCCGTCTCCACGAACACAAGGCTGTCCGATCGTATAATAGGATTCCGCAATGGGGTTAGCAAGTGGCAGCCTAAACGATATCGGGGACTTGCGATGTACATGCTTTGGTACAATACATACGTGATCCAGTTGTTATCCTGCATCGGAACATCAATTGTGCATCGGACCAGCATATTCATGTCATCTAGGAGGCGCGCGTAGGATAAATAATTCAATTAAGATGTCGTTTTGCTAGTATACGTCTAGGCGTCACCCGCCATCTGTGTGCAGGTGGGCCGACGAGACACTGTCCCTGATTTCTCCGCTTCTAATAGCACACACGGGGCAATACCAGCACAAGCCAGTCTCGCAGCAACGCTCGTCAGCAAACGAAAGAGCTTAAGGCTCGCCAATTCGCACTGTCAGGGTCGCTTGGGTGTTTTGCACTAGCGTCAGGTACGCTAGTATGCGTTCTTCCTTCCAGGGGTATGTGGCTGCGTGGTCAAATGTGCGGCATACGTATTTGCTCGACGTGTTTGCTCTCACGAACTTGACCTGGAGATCAAGGAGATGTTTCTTGTCGAACTGGACAGCGCTTCAACGGAACGGATCTACGTTACAGCCTGCATAATGAAAACGGAGTTGCCGACGACGAAAGCGACTTTGGGTTCTGTCTGTTGTCATTGGCGGAAAACTTCCGTTCAGGAGGCGGACACTGATTGACACGGTTTAGCAGAAGGTTTGAGGAATAGGTTAAATTGAGTGGTTTAATAACGGTATGTCTGGGATTAAAGTGTAGTATAGTGTGATTATCGGAGACGGTTTTAAGACACGAGTTCCCAAAATCAAGCGGGGTCATTACAACGGTTATTCCTGGTAGTTTAGGTGTACAATGTCCTGAAGAATATTTAAGAAAAAAGCACCCCTC
ATCGCCTAGAATTACCTACTACGGTCGACCATACCTTCGATTATCGCGGCCACTCTCGCATTAGTCGGCAGAGGTGGTTGTGTTGCGATAGCCCAGTATAATATTCTAAGGCGTTACCCTGATGAATATCCAACGGAATTGCTATAGGCCTTGAACGCTACACGGACGATACGAAATTATGTATGGACCGGGTCATCAAAAGGTTATACCCTTGTAGTTAACATGTAGCCCGGCCCTATTAGTACAGTAGTGCCTTGAATGGCATTCTCTTTATTAAGTTTTCTCTACAGCTAAACGATCAAGTGCACTTCCACAGAGCGCGGTGGAGATTCATTCACTCGGCAGCTCTGTAATAGGGACTAAAAAAGTGATGATAATCATGAGTGCCGCGTTATGGTGGTGTCGGAACAGAGCGGTCTTACGGCCAGTCGTATGCCTTCTCGAGTTCCGTCCAGTTAAGCGTGACAGTCCCAGTGTACCCACAAACCGTGATGGCTGTGCTTGGAGTCAATCGCAAGTAGGATGGTCTCCAGACACCGGGGCACCAGTTTTCACGCCGAAAGCATAAACGACGAGCAGATATGAAAGTGTTAGAACTGGACGTGCCGTTTCTCTGCGAAGAACACCTCGAGCTGTAGCGTTGTTGCGCTGCCTAGATGCAGTGTTGCTCATATCACATTTGCTTCAACGACTGCCGCCTTCGCTGTATCCCTAGACACTCAACAGTAAGCGCTTTTTGTAGGCAGGGGCACCCCCTATCAGTGACTGCGCCAAAACATCTTCGGATCCCCTTGTCCAATCAAACTCATCGAATTCTTACATTTAAGACCCTAATATCACATCATTAGTGATTAATTGCCACTGCCAAAATTCTGTCCAGAAGCGTTTTAGTTCGCTCCACTAAAGTTGTTTAAAACGACTACCAAATCCGCATGTTAGGGGATTTCTTATTAATTCTTTTATCGTGAGGAACAGCGGATCTTAATGGATGGCCGCAGGTGGTATGGAAGCTAATAGCGCGGGTGAGAGGGTAATCAGCCGTGTTCACCTACACAACGCTAACGGGCGATTCTATAAGATTCCGCATTGCGTCTACTTATAAGATGTCTCAACGGTATCCGCAACTTGTGAAGTGCCTACTATCCTTAAACGCATATCTCGCCCAGTAGCTTCCCAATATGTGAGCATCAATTGTTGTCCGGGCCGAGATAGTCATGTGCTCACGGAACTTACTGTATGAGTAGTGATTTGAAAGAGTTGTCAGTTTGCTGGTTCAGGTAAAGGTTCCTCACGCTACCTCAAAGTAAGAGAGCGGTCGTGACATTATCCGTGATTTTCTCACTACTATTAGTACTCACGACTCGATTCTGCCGCAGCCATGTTTCGCCAGAATGCCAGTCAGCATTAAGGAGAGCTCAGGGCAGGTCAACTCGCATAGTGAGGGTTACATGTTCGTTGGGCTCTTCCGACACGAACCTCAGTTAGCCTACATCCTACCAGAGGTCTGTGCCCCGGTGGTGAGAAGTGCGGATTTCGTATTTGCAGCTCGTCAGTACTTTCAGAATCATGGCCTGCACGGCAAAATGACGCTTATAATGGACTTCGACATGGCAATAACGCCTCGTTTCTACGTCAGGAGGAGAATAGTATAAACATAACTGCTGTCGGCAGAAGCGCCAAAGGAGTCTCTGAATTCTTATTCCCGAATAACATCCGTCTCCGTGCGGGAAAATCACCGACGGCGTTTTATAGAAGCCTAGGGGAACAGATTGGTCTAATTAGCTTAAGAGAGTAAATTCTGGGATCATTCAGTAGTAATCACAAATACGGTGGGGCTTTTTTGGCGGATCTTTACAGATACTAACCAGGTGATTTCAACTAATTTAGTTGACGATTTAGGCGCGCTATCCCGTAATCTTCAAATTAAAACATAGCGTTCCATGAGGGCTAGAATTACTTACCGGCCTTCACCATGCCTGCGTTATTCGCGCCCACTCTCC
CATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACGTGTAACGTAGCTAAGTATTCTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_iscc.txt Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,1 @@ +K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test3.fasta Fri Dec 19 15:02:49 2025 +0000 @@ -0,0 +1,58 @@ +>sp|P51587|BRCA2_HUMAN Breast cancer type 2 susceptibility protein OS=Homo sapiens OX=9606 GN=BRCA2 PE=1 SV=4 +MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEPAEESEHKNNNYEPN +LFKTPQRKPSYNQLASTPIIFKEQGLTLPLYQSPVKELDKFKLDLGRNVPNSRHKSLRTV +KTKMDQADDVSCPLLNSCLSESPVVLQCTHVTPQRDKSVVCGSLFHTPKFVKGRQTPKHI +SESLGAEVDPDMSWSSSLATPPTLSSTVLIVRNEEASETVFPHDTTANVKSYFSNHDESL +KKNDRFIASVTDSENTNQREAASHGFGKTSGNSFKVNSCKDHIGKSMPNVLEDEVYETVV +DTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEP +NDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCD +QNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDC +ILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGL +EIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSY +KGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLT +SSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCEND +PKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLS +NLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYM +RVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNN +LALGNTKELHETDLTCVNEPIFKNSTMVLYGDTGDKQATQVSIKKDLVYVLAEENKNSVK +QHIKMTLGQDLKSDISLNIDKIPEKNNDYMNKWAGLLGPISNHSFGGSFRTASNKEIKLS +EHNIKKSKMFFKDIEEQYPTSLACVEIVNTLALDNQKKLSKPQSINTVSAHLQSSVVVSD +CKNSHITPQMLFSKQDFNSNHNLTPSQKAEITELSTILEESGSQFEFTQFRKPSYILQKS +TFEVPENQMTILKTTSEECRDADLHVIMNAPSIGQVDSSKQFEGTVEIKRKFAGLLKNDC +NKSASGYLTDENEVGFRGFYSAHGTKLNVSTEALQKAVKLFSDIENISEETSAEVHPISL +SSSKCHDSVVSMFKIENHNDKTVSEKNNKCQLILQNNIEMTTGTFVEEITENYKRNTENE +DNKYTAASRNSHNLEFDGSDSSKNDTVCIHKDETDLLFTDQHNICLKLSGQFMKEGNTQI +KEDLSDLTFLEVAKAQEACHGNTSNKEQLTATKTEQNIKDFETSDTFFQTASGKNISVAK +ESFNKIVNFFDQKPEELHNFSLNSELHSDIRKNKMDILSYEETDIVKHKILKESVPVGTG +NQLVTFQGQPERDEKIKEPTLLGFHTASGKKVKIAKESLDKVKNLFDEKEQGTSEITSFS +HQWAKTLKYREACKDLELACETIEITAAPKCKEMQNSLNNDKNLVSIETVVPPKLLSDNL +CRQTENLKTSKSIFLKVKVHENVEKETAKSPATCYTNQSPYSVIENSALAFYTSCSRKTS 
+VSQTSLLEAKKWLREGIFDGQPERINTADYVGNYLYENNSNSTIAENDKNHLSEKQDTYL +SNSSMSNSYSYHSDEVYNDSGYLSKNKLDSGIEPVLKNVEDQKNTSFSKVISNVKDANAY +PQTVNEDICVEELVTSSSPCKNKNAAIKLSISNSNNFEVGPPAFRIASGKIVCVSHETIK +KVKDIFTDSFSKVIKENNENKSKICQTKIMAGCYEALDDSEDILHNSLDNDECSTHSHKV +FADIQSEEILQHNQNMSGLEKVSKISPCDVSLETSDICKCSIGKLHKSVSSANTCGIFST +ASGKSVQVSDASLQNARQVFSEIEDSTKQVFSKVLFKSNEHSDQLTREENTAIRTPEHLI +SQKGFSYNVVNSSAFSGFSTASGKQVSILESSLHKVKGVLEEFDLIRTEHSLHYSPTSRQ +NVSKILPRVDKRNPEHCVNSEMEKTCSKEFKLSNNLNVEGGSSENNHSIKVSPYLSQFQQ +DKQQLVLGTKVSLVENIHVLGKEQASPKNVKMEIGKTETFSDVPVKTNIEVCSTYSKDSE +NYFETEAVEIAKAFMEDDELTDSKLPSHATHSLFTCPENEEMVLSNSRIGKRRGEPLILV +GEPSIKRNLLNEFDRIIENQEKSLKASKSTPDGTIKDRRLFMHHVSLEPITCVPFRTTKE +RQEIQNPNFTAPGQEFLSKSHLYEHLTLEKSSSNLAVSGHPFYQVSATRNEKMRHLITTG +RPTKVFVPPFKTKSHFHRVEQCVRNINLEENRQKQNIDGHGSDDSKNKINDNEIHQFNKN +NSNQAVAVTFTKCEEEPLDLITSLQNARDIQDMRIKKKQRQRVFPQPGSLYLAKTSTLPR +ISLKAAVGGQVPSACSHKQLYTYGVSKHCIKINSKNAESFQFHTEDYFGKESLWTGKGIQ +LADGGWLIPSNDGKAGKEEFYRALCDTPGVDPKLISRIWVYNHYRWIIWKLAAMECAFPK +EFANRCLSPERVLLQLKYRYDTEIDRSRRSAIKKIMERDDTAAKTLVLCVSDIISLSANI +SETSSNKTSSADTQKVAIIELTDGWYAVKAQLDPPLLAVLKNGRLTVGQKIILHGAELVG +SPDACTPLEAPESLMLKISANSTRPARWYTKLGFFPDPRPFPLPLSSLFSDGGNVGCVDV +IIQRAYPIQWMEKTSSGLYIFRNEREEEKEAAKYVEAQQKRLEALFTKIQEEFEEHEENT +TKPYLPSRALTRQQVRALQDGAELYEAVKNAADPAYLEGYFSEEQLRALNNHRQMLNDKK +QAQIQLEIRKAMESAEQKEQGLSRDVTTVWKLRIVSYSKKEKDSVILSIWRPSSDLYSLL +TEGKRYRIYHLATSKSKSKSERANIQLAATKKTQYQQLPVSDEILFQIYQPREPLHFSKF +LDPDFQPSCSEVDLIGFVVSVVKKTGLAPFVYLSDECYNLLAIKFWIDLNEDIIKPHMLI +AASNLQWRPESKSGLLTLFAGDFSVFSASPKEGHFQETFNKMKNTVENIDILCNEAENKL +MHILHANDPKWSTPTKDCTSGPYTAQIIPGTGNKLLMSSPNCEIYYQSPLSLCMAKRKSV +STPVSAQMTSKSCKGEKEIDDQKNCKKRRALDFLSRLPLPPPVSPICTFVSPAAQKAFQP +PRSCGTKYETPIKKKELNSPQMTPFKKFNEISLLESNSIADEELALINTQALLSGSTGEK +QFISVSESTRTAPTSSEDYLRLKRRCTTSLIKEQESSQASTEECEKNKQDTITTKKYI
