changeset 0:9fee6d81910d draft default tip

planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
author imgteam
date Fri, 19 Dec 2025 15:02:49 +0000
parents
children
files creators.xml iscc_similarity_parse_output.py iscc_verify.xml macros.xml test-data/sequence1.txt test-data/sequence2.txt test-data/test1.png test-data/test1_copy.png test-data/test1_iscc.txt test-data/test2.tiff test-data/test2_similar.tiff test-data/test3.fasta
diffstat 12 files changed, 461 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/creators.xml	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,21 @@
+<macros>
+
+    <xml name="creators/iscc">
+        <organization name="ISCC Foundation" url="https://iscc.foundation"/>
+        <yield />
+    </xml>
+    <xml name="creators/lco">
+        <organization name="Leiden Cell Observatory, Leiden University" url="https://www.universiteitleiden.nl/en/research/research-facilities/science/cell-observatory"/>
+        <yield />
+    </xml>
+    <xml name="creators/maartenpaul">
+        <person givenName="Maarten" familyName="Paul"/>
+        <yield/>
+    </xml>
+    <xml name="creators/etzm">
+        <person givenName="Martin" familyName="Etzrodt"/>
+        <yield/>
+    </xml>
+ 
+
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iscc_similarity_parse_output.py	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+"""
+Parse ISCC similarity output into tabular format with unique identifiers.
+
+Input format (from iscc-sum --similar):
+    ISCC:K4AOMG... *file1.txt
+      ~08 ISCC:K4AOMG... *file2.txt
+      ~10 ISCC:K4AOMG... *file3.txt
+    ISCC:K4AGSPO... *file4.txt
+
+Output format (tabular with 7 columns, bidirectional):
+    file_id      filename    iscc_code    match_id     match_filename    match_iscc_hash    distance
+    23  file1.txt   K4AOMG...    24  file2.txt         K4AOMG...          8
+    24  file2.txt   K4AOMG...    23  file1.txt         K4AOMG...          8
+    25  file4.txt   K4AGSPO...                                                      -1
+"""
+import argparse
+
+
+def clean_filename(filename):
+    """Remove directory prefix from filename."""
+    # Remove 'input_files/' prefix if present
+    if filename.startswith('input_files/'):
+        filename = filename[len('input_files/'):]
+
+    return filename
+
+
+def load_id_mapping(mapping_file):
+    """Load filename to element_identifier mapping.
+
+    Returns: dict mapping cleaned filename -> element_identifier
+    """
+    mapping = {}
+    with open(mapping_file, 'r') as f:
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 2:
+                filename, element_id = parts
+                # Clean the filename the same way as in parse
+                cleaned = clean_filename(filename)
+                mapping[cleaned] = element_id
+    return mapping
+
+
+def parse_iscc_line(line):
+    """Parse ISCC line and extract code and filename.
+
+    Format: "ISCC:CODE *filename" or "  ~NN ISCC:CODE *filename"
+    Returns: (code, filename) or (None, None) if parse fails
+    """
+    # Find the * separator
+    if ' *' not in line:
+        return None, None
+
+    # Split on ' *' to get code part and filename
+    parts = line.split(' *', 1)
+    code_part = parts[0].strip()
+    filename = clean_filename(parts[1].strip())
+
+    # Extract CODE (after 'ISCC:')
+    if 'ISCC:' in code_part:
+        code = code_part.split('ISCC:', 1)[1].strip()
+    else:
+        code = ''
+
+    return code, filename
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Parse ISCC similarity output into tabular format'
+    )
+    parser.add_argument(
+        'similarity_raw',
+        help='Raw similarity output from iscc-sum --similar'
+    )
+    parser.add_argument(
+        'id_mapping',
+        help='TSV file mapping filenames to element identifiers'
+    )
+    parser.add_argument(
+        'output_file',
+        help='Tabular output file'
+    )
+    args = parser.parse_args()
+
+    # Load ID mapping
+    id_map = load_id_mapping(args.id_mapping)
+
+    # Parse similarity output
+    file_codes = {}  # filename -> code mapping
+    matches = []  # List of (file1, code1, file2, code2, distance)
+    current_ref = None
+    current_code = None
+
+    with open(args.similarity_raw, 'r') as f:
+        for line in f:
+            line = line.rstrip()
+            if not line:
+                continue
+
+            if line.startswith('ISCC:'):
+                # Reference file: "ISCC:CODE *filename"
+                code, filename = parse_iscc_line(line)
+                if code and filename:
+                    current_ref = filename
+                    current_code = code
+                    file_codes[filename] = code
+
+            elif line.startswith(' ') and current_ref:
+                # Similar file: "  ~NN ISCC:CODE *filename"
+                parts = line.strip().split(None, 1)  # Split on first whitespace
+                if len(parts) == 2:
+                    dist_str = parts[0].replace('~', '')
+                    distance = int(dist_str)
+
+                    # Parse the rest of the line for ISCC and filename
+                    code, filename = parse_iscc_line(parts[1])
+
+                    if code and filename:
+                        matches.append((current_ref, current_code, filename, code, distance))
+                        file_codes[filename] = code
+    # Write output with identifiers
+    with open(args.output_file, 'w') as out:
+        # Write header (7 columns)
+        out.write("file_id\tfilename\tiscc_code\tmatch_id\tmatch_filename\tmatch_iscc_code\tdistance\n")
+
+        # Track which files have matches
+        files_with_matches = set()
+
+        # Write similarity matches in both directions
+        for file1, code1, file2, code2, distance in matches:
+            # Get element identifiers
+            file1_name = id_map[file1]
+            file2_name = id_map[file2]
+            file1_id = str.split(file1, '_', 1)[0]  # Extract ID from filename
+            file2_id = str.split(file2, '_', 1)[0]  # Extract ID from filename
+
+            # Write A -> B (file_id is the numeric ID, filename is the element_identifier)
+            out.write(f"{file1_id}\t{file1_name}\t{code1}\t{file2_id}\t{file2_name}\t{code2}\t{distance}\n")
+            # Write B -> A (bidirectional)
+            out.write(f"{file2_id}\t{file2_name}\t{code2}\t{file1_id}\t{file1_name}\t{code1}\t{distance}\n")
+
+            files_with_matches.add(file1)
+            files_with_matches.add(file2)
+
+        # Write files with no matches (distance = -1, empty match columns)
+        for filename in sorted(file_codes.keys()):
+            if filename not in files_with_matches:
+                file_id = str.split(filename, '_', 1)[0]  # Extract ID from filename
+                element_name = id_map[filename]
+                code_val = file_codes[filename]
+                out.write(f"{file_id}\t{element_name}\t{code_val}\t\t\t\t-1\n")
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iscc_verify.xml	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,197 @@
+<tool id="iscc_sum_verify" name="Verify ISCC-CODE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.1">
+    <description>with ISCC-SUM</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>creators.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version_command" />
+    <creator>
+        <expand macro="creators/iscc" />
+        <expand macro="creators/lco" />
+        <expand macro="creators/maartenpaul" />
+        <expand macro="creators/etzm" />
+    </creator>
+
+    <command detect_errors="exit_code"><![CDATA[
+        ## Generate ISCC-CODE for input dataset
+        GENERATED=\$(iscc-sum '${input_file}' | cut -d':' -f2 | cut -d' ' -f1) &&
+        
+        ## Get expected ISCC-CODE
+        EXPECTED='${expected_code}' &&
+        
+        ## Validate expected ISCC-CODE length
+        if [ \${#EXPECTED} -ne 55 ]; then
+            echo "ERROR: Expected ISCC-CODE must be exactly 55 characters" >&2;
+            echo "Found: \${#EXPECTED} characters" >&2;
+            exit 1;
+        fi &&
+        
+        ## Output verification report
+        if [ "\$GENERATED" = "\$EXPECTED" ]; then
+            echo "OK - ISCC-CODEs match" > '${output_file}'; 
+        else
+            echo "FAILED - ISCC-CODEs do not match" > '${output_file}'; 
+        fi &&
+        echo "Expected:  \$EXPECTED" >> '${output_file}' &&
+        echo "Generated: \$GENERATED" >> '${output_file}' &&
+        echo "" >> '${output_file}' 
+
+    ]]></command>
+        
+    <inputs>
+        <param name="input_file" type="data" format="data" label="Dataset to verify"
+            help="Verify this dataset's ISCC-CODE. When a collection is provided, each dataset is verified separately against the same expected ISCC-CODE."/>
+        <param name="expected_code" type="text" label="Expected ISCC-CODE"
+            help="The 55-character ISCC-CODE to verify against">
+            <validator type="length" min="55" max="55" message="ISCC-CODE must be exactly 55 characters"/>
+        </param>
+    </inputs>
+    
+    <outputs>
+        <data name="output_file" format="txt" label="${tool.name} on ${on_string}"/>
+    </outputs>
+    
+    <tests>
+        <!-- Test 1: Successful verification -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="test1.png"/>
+            <param name="expected_code" value="K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="OK - ISCC-CODEs match"/>
+                    <has_text text="Expected:  K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/>
+                    <has_text text="Generated: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+        </test>
+        
+        <!-- Test 2: Failed verification -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="test1.png"/>
+            <param name="expected_code" value="K4AGSPOSB5SS2X427WZ27QASTSBVTS55DXLMFDF7WOJKEOSTDEI3OXQ"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="FAILED - ISCC-CODEs do not match"/>
+                    <has_text text="Expected:  K4AGSPOSB5SS2X427WZ27QASTSBVTS55DXLMFDF7WOJKEOSTDEI3OXQ"/>
+                    <has_text text="Generated: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+        </test>
+        
+        <!-- Test 3: FASTA file verification -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="test3.fasta"/>
+            <param name="expected_code" value="K4AKF7PTZ7JTAAYZ7YZHZPR5RETKYXXE7RTBTJA4JX5GQQMSLZRC6QQ"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="OK - ISCC-CODEs match"/>
+                    <has_text text="Expected:  K4AKF7PTZ7JTAAYZ7YZHZPR5RETKYXXE7RTBTJA4JX5GQQMSLZRC6QQ"/>
+                    <has_text text="Generated: K4AKF7PTZ7JTAAYZ7YZHZPR5RETKYXXE7RTBTJA4JX5GQQMSLZRC6QQ"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    
+    <help><![CDATA[
+What it does
+============
+
+Verifies that a file (dataset) matches an expected ISCC-CODE (International Standard Content Code) for exact content verification. This tool uses ISCC-SUM, which generates an ISCC-CODE containing Data-Code and Instance-Code units for bit-level file comparison.
+
+Exit Codes
+==========
+
+The tool uses exit codes for workflow logic:
+
+- **0**: Verification successful (OK - ISCC-CODEs match)
+- **1**: Verification failed (FAILED - ISCC-CODEs do not match)
+
+Dataset Mapping
+===============
+
+When you provide a collection, Galaxy automatically runs verification once per dataset. All datasets are verified against the same expected ISCC-CODE.
+
+Output
+======
+
+A verification report containing:
+
+- Filename (or element identifier)
+- Expected ISCC-CODE
+- Generated ISCC-CODE
+- Status: OK or FAILED
+
+Example output::
+    Status: OK - ISCC-CODEs match
+
+    Expected:  K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY
+    Generated: K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY
+    
+Use Cases
+=========
+
+- Verify file integrity after transfer or storage
+- Confirm downloaded datasets match reference ISCC-CODEs
+- Validate that backups are identical to originals
+- Quality control for data archiving
+
+Workflow Examples
+=================
+
+Verify a single dataset
+-----------------------
+
+::
+
+    Input: document.pdf
+    Expected ISCC-CODE: K4AOMG...
+    ↓
+    [Verify ISCC-CODE]
+    ↓
+    Output: "OK" or "FAILED"
+
+Verify against reference table
+-------------------------------
+
+Generate reference ISCC-CODEs first::
+
+    Original files → [Generate ISCC-CODE] → Reference ISCC-CODEs
+    
+Later verify::
+
+    New datasets → [Generate ISCC-CODE] → New ISCC-CODEs
+    ↓
+    [Join two Datasets] on filename
+    ↓
+    Compare ISCC-CODE columns
+    ↓
+    Result: Which files match/differ
+
+Working with ISCC-CODE Files in Workflows
+==========================================
+
+If you have the expected ISCC-CODE in a dataset (e.g., from Generate ISCC tool):
+
+1. In the workflow editor, connect the ISCC-CODE file to this tool
+2. The ISCC-CODE file content will be used automatically
+3. Or manually copy the ISCC-CODE from the file into the text field
+
+Important Notes
+===============
+
+- **Exact match only**: Any change to the file will cause verification to fail
+- **Bit-level comparison**: Even metadata changes are detected
+- **For similarity detection**: Use "Find similar ISCC-CODEs" tool instead
+
+More Information
+================
+
+For details about ISCC: https://sum.iscc.codes/ and https://iscc.codes/
+For ISCC structure and subtypes: https://ieps.iscc.codes/iep-0001/
+    ]]></help>
+    <expand macro="citations" />
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,24 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.1.0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+    @misc{iscc_sum,
+        title={ISCC - Similarity Hash for Digital Media},
+        url={https://iscc.codes/},
+        note={Accessed: 2025}
+    }
+            </citation>
+        </citations>
+    </xml>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">iscc-sum</requirement>
+        </requirements>
+    </xml>
+    <xml name="version_command">    
+        <version_command>iscc-sum --version</version_command>
+    </xml>
+</macros>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sequence1.txt	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,1 @@
+CTCACTTGTGGATACCCCGACCTATTTTGACGGGACCACTCGCGGTAGTCGTTGGGCTTATGCACCGTAAAGTCCTCCGCCGGCCTCCCCCCTACAAAAGATGATAAGCTCCGGCAAGCAATATTGAACAACGCAAGGATCGGCGATATAAACAGAGAAACGGCTGATTACTCTTGTTGGTGTGGTATCGCTAAACTGCGTCGCGGAGCCTTATGGCATAGTCGTCCGCGGAGCACTCTGGTAACGCTTATGGTCCATAGCACATTCATCGCATCCGGGCATGCGCTCTATTTGACGATCCCTTGGCGCAGAGATGCTGGCCACGAGCTAAATTAAAGCGACTGCACTACTGTAAGGTCCGTCACGCAGACGACGGCCCGGGGAGAGCACTAACCCATCAACCTGTACGGGAACTTTCTATATCGTTCTCGGACGGAGAGATAACTACAGTGCCGCTTACAGCCCCTCTGTCGTCGCCGACGTCTGTAATATAGCCTTGTTGTGATTCCACCCTATTGAGGCATTGACTGATGCGGAAGGAGATCTGGAATGAACTGGTCTATGTGACAGAAACTGTGCAAGTACCTAATCTCGTTAGTGTAGGTTCTGACCGATACGTGCTTCGTTGAGAACTCACAATTTTACAACTGGGGACATAAGCCCTACGCCCATCATCTACTGACGTCCCTGAGGCTGCAGTTCATGTAATGGGACAGTATCCGCCGCAAGTTCTAGTGCAATGGCGGTATAGTACGCTCGTACTGTAGTAGAGGCGACACGGGTGGGATCATCACTAATAAGGATACTGGGAAGACTCACAGGCCTCCGCCTATAGGCGGTGCTTACTCTTACATAAAGCGGCTGTTAGTATTACCCCGCGAGGATTCGAAAAGGTGAACCAACCCGGTCGATCCGGAGGGACGGGCCTCAAAGCCGCGTGACGACGGCTGTCGGCCCGTAACAGAATCCCCGCAATAAGCTCCCGTGAGCCTCGATTGAACAGCCCTGGTGGGCCCCATCAGTAGCCCGAATATGTCGCTTTTCGGGTCCTGGGCCGAGGAGCGATACCTTCCAGTAATCGAGGCCGTTCGTTAATTCTTGTTGCGTTCCTAGCGCCTATATTTGTCTCTTTGCCGGCTTATGTGGACAAGCATAGCATAGCCATTTATCGGAGCGCCTCGGTACACGGTATGACCAGACGCCTCGTGAGACCATTACGTATACCAGGTGTCCTGTGAGCAGCGAAGGCCCATACGCGAGATACACTGCCAGAAATCCGCGTGATTACGAGTCGTGGTAAATTTAATCTGGCTGTGGTCTAGACATTCCAGGCGGTGCGTCTGCTGTCGGGTGCCTCTGGTGACTGGCTAGATGGACTTGCCGCTGGTAAACACACCATGACCCCGCCTCTCCATTGATGCCACGGCGAATGTCGGGGAGACAGCAGCGACTGCAGACATCAGATCAGAGTAATACTAACATGCGATAAGTCCCTAACTGACTATGGCCTTCTGTAGAGTCTACTTCACCAGATATGCTGTCTCTGGCACGTGGATGGTTTAGAGGAATCACATTCAAGTCTGGTTAACCATGAAACAAGTCTTGAGTGTAAAATTGTCGTCTCCTGTGTACGAGATGGAGGTACTAGATGACTGCAGGGACTCCGACGTTATGTACGTTGCTCCGTCAAAGGCGCCATTCAGGATCACGTTACCGCCAAAAAATGGGAGCAGGAGCTCTTCTCCCCTGCGGTCACGTCTATAGAAATTACACCTTTAACCCTCCTGAGAACCGGGAGGCGGGAATCCGTCACGTATGAGAAGGTATTTGCCCGACAATCAATACCGGACGCTCCTAAGTTTTTCCACTCGCTTGAGCCGGCTAGGCCTCTCTGCCCGAAGTTTCGACGGACTGCTGCCAACGCCCAGGCATAGTTTTAGGAGGATTATTCGGGGGCACTGGCAACCAACTTCTCGGGTCCTGCCCGACTGGTCTTCGGGCTAATATAGCGAATTGCCGAGAACCCGGCCCCACGCAATGGAACGTCCTTAGCTCCGGCAGGCAATTAAGGGGAACGTAAGTATAGCGCAAAAAAACAGAGAAATAGGCGAATGAATCTATTCTTACTGTATCGAAGAATGGCCTCGCGGAGGCATGTGTCATGCTAGCGTGCGGGGTACTCTAGTTATCCATATGGTCCACAGGACACTCGTTGCTTTCGGATTTGCCCTTTATGCGCCGGTTTTCAGCCACGCTTATGCTCAGCATCGTTATAACCAGACCGATACTAGATCTATAAAGTCCGCCATGCAGACGAGACCAGACGGAGATTACCGAGCAATCTATCAGATCGGCGACCATTAGTGAGCTACTGGAGCCGAGGGGTAACTACGATGCCGCTAAGAACCTCTCGGTCGTCGCTAGCGATTACACTCCAGTCTCATTATAATCGTTCGCTATTCAGGGATTGACCAACACCGGAAAACATTTCACTTGAAGTATTGTATACGACAGAGTCCGTGCACCTACCAAACCTGTTTAATCTAAGTTCAGACTAGTTGGAAGTATGTCTAGATCTCAGATTTCGTCACTAGAGGGCCCACGCTCTATTTTTATGATCCATTGATCTCCCTGACGCTGCAAGATTTGCAACCAGGCAGACTTGGCGGTAGGTCCTAGTGCAGCGGGGCTTTTTTTCTATAGTCCTTGAGAGGAGGAGACGTCAGTCCAGATATCTTTGATGTCCTGATTGGAAGGACCGTTGGCCCCCCACCCTTAGGCAGTGTATACTCTTCCATAAACGAGCTATTAGTTATGAGGTCCGTAGATTGAAAAGGGTGACGGAATTCGGCCGAACGGGAAAGACGGACATCTAGGTATCCTGAGCACGGTTGCGCGTCCGTATCAAGCTCCTCTTTATAGGCCCCGGTTACTGTTGGTCGTAGAGCCCAGAACGGGTTGGGCAGATGTACGACAATATCGCTTAGTCACCCTTGGGCCACGGTCCGCTACCTTACAGGAATTGAGACCGTCCATTAATTTCCCTTGCATATATATTGCGTTTCTTCGACCTTTTAACCGCTCTCTTAGAAGAGAGACAGATAGCTTCTTACCCGTGCCCCACCGTTGGCAGTACGATCGCACGCCCCACGTGAACGATTGGTAAACCCTGTGGCCTGTGAGCGACAAAAGCTTTAATGGGAAATACGCGCCCATAACTTGGTGCGAATACGGGTCGTAGCAATGTTCGTCTGAGTATGATCTATATAATACGGGCGGTACGTCTGCTTTGGTCAGCCTCTAATGGCTCGTATGATAGTGCAGCCGCTGGTGATCACTCAATGATCTCGGCTCCCCGTTGCAACTACGGGGATTCTTGGAGAGCCAGCTGCGTTCGGTATTGTGAGGACAGTGTAGTATTAGCAAACGATAAGTCCCGAACTAGTTGTGACCTAACGAAAAGAGAATTTCATAATACGTGCTGTCCCACGCGCATGGTACATTTGGACAATATTGAATGGAGTCTGATCAACCTTCACACCGATCTAGAATCGAATGCAAAGATCACCCAGGTGCAAATCAAAAATTCTAGGTAACTAGAAGATTTGCGACGTTCTAAGTGTTGGACGATATGAATCGCGACCCAGGATGACGTCGCCCTGAAAAAAAGATTTCTGCAACTCTCCTCGTCAGCAGTCTGGTGTATCGAAAGTACAGGACTAGCCTTCCTAGCAACCGCGGGCTGGGAATCTGAGACATGAGTCAAGATATTTGCTCGGTAACGTATGCTCTAGGCATCTAACTATTCCCTGTGTCTTATAGGGGCCTGCGTTATCTGCCTGTCGAACCATAGGATTCGTGTCAGCGCGCAGGCTTGGATCGAGATGAAATCTCCGGGGCCTAAGACTACGAGCATCTGGCGTCTTGGCTAACCCCCCTACATGTTGTTATAAACAATCAGTGGAAACCCAGTGCTAGAGGATGGAATGACCTTAAATCAGGGACGATATTAAACGGAACGTATATTCAACGCAATGAAGCCGGAGGATTGGCGTGGGAATCGTGCTTCTGTCTAAGCAAGTAAGGGTATGAGGTCGCAACCGTCCCCCAAGCGTACAGGGTGCACTTTGTAACGATTTGGGAGTCCAGAGACTCGCTGTTTTCGAAATTTGCCCTCAAGCGCGAGTATTGAACCAGGCTTACGCCCAAGAACGTAGCAAGCTGACTCAAACAAAATACATTTTGCCCGCGTTACATATGAATCAAGTTGGAAGTTATGGAGCATAGTAACATGTGGACGGCCAGTGGTGGGTTGCTACACCCCTGCGGCAACGTTGAAGCTCCTGGATTACACTGGCTGGATCTAAGCCGTGACACCCGTCATACTCCATAACCGTCTGTAACTCACGGCTTGTTCTGGACTGGATTGCCATTCTCTCAGAGTATTATGCAGGCCGGCGTACGGGTCCCATATAAACCTGTCATAGCTTACCTGACTCTACTTGGAAATGTGGCTAGGTCTTTGCCCACGCACCTAATCGGTCCTCGTTTGCTTTTTAGGACCCGATGAACTACAGAACACTGCAAGAATCTCTACCTGCTTTACAAAGTGCTGGATCCTATTCCAGCGGGATCTTTTATCTAAACACGATGAGAGGAGTATTCGTCAGGCCACATAGCTTTCTTGTTCTGATCGGAACGATCGTTGGCGCCCGACCCCCCGATTCCATAGTGAGTTCTTCGTCCGAGCCATTGTATGCGAGATCGATAGACTGATAGGGGATGCAGTATATCCCTGGATACAATAGACGCACAGGTTGGAATCCTAAGTGAAGTCGCGCGTCCGAACCCAGCTCTATTTTAGAGGTCATGGGTTCTGGTGCCCGCGAGCCGCGGAACCGATTAGGGGCATGTACAACAATATTTATTAGTCATCTTTCAGACACAATCTCCCAGCTCACTGGTATATAGTTCCTGCTATAATTAGCCTCCCTCATAAGTTGCACTACTTCAGCGTCCCAAATGCACCCTTACCACGAAGACAGGATTGTCCGATCCCATATTACGACCTTGGCAGGGGGTTCGCAAGTCCCACCCCAAACGATGCTGAAGGCTCAGGTTTCACAGGGACAAAAGCTTTAAACGCGAGTTCCCGCTCATAACCTGGACCGAATGCAGAATCATGCATCGTTCCACTGTGTTCGTGTCATCTAGGACGGGCGCAAAGGATATATAATTCAATTTTGAATACCTTATATTATTGTACACCTACCGGTCACCAGCCAACAATGTGCGGATGGCGTTACAACTTTCTGGGCCTAATCTGACCGTTCTAGATACCGCACTCTGGGCAATACGAGGTAAAGCCAGTCACCCAGTGTCGATCAACACCTAACCTAACGGTAAGAGGCTCACATAATGGCACTGTCGGCGTCCCCAGGGTATTTTACGTTAGCATCAGGTGGACTAACATGAATCTTTACTCCCAAGCGAAAACGGGTGCGTGGACTAGCGAGGAGCAAACGAAAATTCTTGGCCTGCTTGGTGTCTCGTATTCCTCTTAGAGATCGACGAAATGTTTCACGACCAAGGGAAAGGTCGCCCTACAAAATAGATTTGCGTTACTCTCTCCATAAGGAGTCCGGTGTAGCGAAGGATCAAGGCGACCCTAGGTAGCAACCGCCGGCTTCGGCGGTAAGGTATCACTCAAGAAGCAGACACAGTAAGACACGGTCTAGCTGACTGTCTATCGGCTAGGTCAAATAGAGAGCTTTGATATCTGCATGTCTAGCTTTAGAATTCAGTTTAGCGCGCTGATCTGAGTCGAGATAAAATCACCAGTACCCAAGACCAGGGGGGCTCGCCACGTTGGCTAATCCTGGTACATCTTGTAATCAATATTCAGTAGAAAATTTGTGTTAGAAGGACGAGTCACCATGTACCAATAGCGATAACGATCGGTCGGACTATTCATTGTGGTGATGACGCTGGGTTTACGTGGGAAAGGTGCTTGTGTCCCGACAGGCTAGGATATAATGCTGAGGCCCTTCCCCAAGCGTTCAGCGTGGGATTTGCTACAACTTCCGAGTCCTACATGTGCGTGTTCATGTTATGTATGCACAAGGCCGAGAATAGGACGTAGCCTTCGAGTTAGTACGTAGCGTGGTCGCACAAGCACAGTAGATCCTCCCCGCGCATCCTATTTATTAAGTTAATTCTAAAGCAATACGATCACATGTGGATGGGCAGTGGCCGGTTGTTACACGCCTACCGCGGTGCTGAATGACCGGGACTAAAGAGGCGAAGATTATGGCGTGTGACCCGTTATGCTCGAGTTCGGTCAGTGCGTCATTGCAAGTAGTCGATTGCTTTCTCAATCTCCGAGCGATTTAGCGTGACAGCCCCAGGGAACCCATAAAATGTGATCGCAGTCCATCCGATCGTACATAGAAAGGAAGGTCCCCATACGCCCACGCACCTGTTTACTCGTCGTATGCATAAAAGAGCCGCACGAACCACAGAGCATAAAGAGGACCTCTAGTTCCTTTACAAAGTACAGGTTCGCTTTTCGGCGAGATGCCTTACCTAGATGCAATGACGGACGTATTCCTCTGGCCACATCGGTTCCTGCTTTCGCTGGGATCCAAGATTGGCAGCTGAAGCCGCCTTTCCATAGTGAGTCCTTCGTCTGTGACTAACTGTGCCAAATCGTCTAGCAAACTGCTGATCCAGTTTAACTCACCAAATTATAGCCGTACAGACCGAAATCTTAAGTCATATCACGCGACTAGCCTCTGCTTAATTTTTGTGCTCAAGGGTTTTGGTCCGCCCGAGCGGTGCAGCCGATTAGGACCATGTAATACATTTGTTACAAGACTTCTTTTAAACACTTTCTTCCTGCCCAGTAGCGGATGATAATCGTTGTTGCCAGCCGGCGTGGAAGGTAACAGCACCGGTGCGAGCCTAATGTGCCGTCTCCACGAACACAAGGCTGTCCGATCGTATAATAGGATTCCGCAATGGGGTTAGCAAGTGGCAGCCTAAACGATATCGGGGACTTGCGATGTACATGCTTTGGTACAATACATACGTGATCCAGTTGTTATCCTGCATCGGAACATCAATTGTGCATCGGACCAGCATATTCATGTCATCTAGGAGGCGCGCGTAGGATAAATAATTCAATTAAGATGTCGTTTTGCTAGTATACGTCTAGGCGTCACCCGCCATCTGTGTGCAGGTGGGCCGACGAGACACTGTCCCTGATTTCTCCGCTTCTAATAGCACACACGGGGCAATACCAGCACAAGCCAGTCTCGCAGCAACGCTCGTCAGCAAACGAAAGAGCTTAAGGCTCGCCAATTCGCACTGTCAGGGTCGCTTGGGTGTTTTGCACTAGCGTCAGGTACGCTAGTATGCGTTCTTCCTTCCAGGGGTATGTGGCTGCGTGGTCAAATGTGCGGCATACGTATTTGCTCGACGTGTTTGCTCTCACGAACTTGACCTGGAGATCAAGGAGATGTTTCTTGTCGAACTGGACAGCGCTTCAACGGAACGGATCTACGTTACAGCCTGCATAATGAAAACGGAGTTGCCGACGACGAAAGCGACTTTGGGTTCTGTCTGTTGTCATTGGCGGAAAACTTCCGTTCAGGAGGCGGACACTGATTGACACGGTTTAGCAGAAGGTTTGAGGAATAGGTTAAATTGAGTGGTTTAATAACGGTATGTCTGGGATTAAAGTGTAGTATAGTGTGATTATCGGAGACGGTTTTAAGACACGAGTTCCCAAAATCAAGCGGGGTCATTACAACGGTTATTCCTGGTAGTTTAGGTGTACAATGTCCTGAAGAATATTTAAGAAAAAAGCACCCCTCATCGCCTAGAATTACCTACTACGGTCGACCATACCTTCGATTATCGCGGCCACTCTCGCATTAGTCGGCAGAGGTGGTTGTGTTGCGATAGCCCAGTATAATATTCTAAGGCGTTACCCTGATGAATATCCAACGGAATTGCTATAGGCCTTGAACGCTACACGGACGATACGAAATTATGTATGGACCGGGTCATCAAAAGGTTATACCCTTGTAGTTAACATGTAGCCCGGCCCTATTAGTACAGTAGTGCCTTGAATGGCATTCTCTTTATTAAGTTTTCTCTACAGCTAAACGATCAAGTGCACTTCCACAGAGCGCGGTGGAGATTCATTCACTCGGCAGCTCTGTAATAGGGACTAAAAAAGTGATGATAATCATGAGTGCCGCGTTATGGTGGTGTCGGAACAGAGCGGTCTTACGGCCAGTCGTATGCCTTCTCGAGTTCCGTCCAGTTAAGCGTGACAGTCCCAGTGTACCCACAAACCGTGATGGCTGTGCTTGGAGTCAATCGCAAGTAGGATGGTCTCCAGACACCGGGGCACCAGTTTTCACGCCGAAAGCATAAACGACGAGCAGATATGAAAGTGTTAGAACTGGACGTGCCGTTTCTCTGCGAAGAACACCTCGAGCTGTAGCGTTGTTGCGCTGCCTAGATGCAGTGTTGCTCATATCACATTTGCTTCAACGACTGCCGCCTTCGCTGTATCCCTAGACACTCAACAGTAAGCGCTTTTTGTAGGCAGGGGCACCCCCTATCAGTGACTGCGCCAAAACATCTTCGGATCCCCTTGTCCAATCAAACTCATCGAATTCTTACATTTAAGACCCTAATATCACATCATTAGTGATTAATTGCCACTGCCAAAATTCTGTCCAGAAGCGTTTTAGTTCGCTCCACTAAAGTTGTTTAAAACGACTACCAAATCCGCATGTTAGGGGATTTCTTATTAATTCTTTTATCGTGAGGAACAGCGGATCTTAATGGATGGCCGCAGGTGGTATGGAAGCTAATAGCGCGGGTGAGAGGGTAATCAGCCGTGTTCACCTACACAACGCTAACGGGCGATTCTATAAGATTCCGCATTGCGTCTACTTATAAGATGTCTCAACGGTATCCGCAACTTGTGAAGTGCCTACTATCCTTAAACGCATATCTCGCCCAGTAGCTTCCCAATATGTGAGCATCAATTGTTGTCCGGGCCGAGATAGTCATGTGCTCACGGAACTTACTGTATGAGTAGTGATTTGAAAGAGTTGTCAGTTTGCTGGTTCAGGTAAAGGTTCCTCACGCTACCTCAAAGTAAGAGAGCGGTCGTGACATTATCCGTGATTTTCTCACTACTATTAGTACTCACGACTCGATTCTGCCGCAGCCATGTTTCGCCAGAATGCCAGTCAGCATTAAGGAGAGCTCAGGGCAGGTCAACTCGCATAGTGAGGGTTACATGTTCGTTGGGCTCTTCCGACACGAACCTCAGTTAGCCTACATCCTACCAGAGGTCTGTGCCCCGGTGGTGAGAAGTGCGGATTTCGTATTTGCAGCTCGTCAGTACTTTCAGAATCATGGCCTGCACGGCAAAATGACGCTTATAATGGACTTCGACATGGCAATAACGCCTCGTTTCTACGTCAGGAGGAGAATAGTATAAACATAACTGCTGTCGGCAGAAGCGCCAAAGGAGTCTCTGAATTCTTATTCCCGAATAACATCCGTCTCCGTGCGGGAAAATCACCGACGGCGTTTTATAGAAGCCTAGGGGAACAGATTGGTCTAATTAGCTTAAGAGAGTAAATTCTGGGATCATTCAGTAGTAATCACAAATTTACGGTGGGGCTTTTTTGGCGGATCTTTACAGATACTAACCAGGTGATTTCAACTAATTTAGTTGACGATTTAGGCGCGCTATCCCGTAATCTTCAAATTAAAACATAGCGTTCCATGAGGGCTAGAATTACTTACCGGCCTTCACCATGCCTGCGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACGTGTAACGTAGCTAAGTATTCTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sequence2.txt	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,1 @@
+CTCACTTGTGGATACCCCGACCTATTTTGACGGGACCACTCGCGGTAGTCGTTGGGCTTATGCACCGTAAAGTCCTCCGCCGGCCTCCCCCCTACAAAAGATGATAAGCTCCGGCAAGCAATATTGAACAACGCAAGGATCGGCGATATAAACAGAGAAACGGCTGATTACTCTTGTTGGTGTGGTATCGCTAAACTGCGTCGCGGAGCCTTATGGCATAGTCGTCCGCGGAGCACTCTGGTAACGCTTATGGTCCATAGCACATTCATCGCATCCGGGCATGCGCTCTATTTGACGATCCCTTGGCGCAGAGATGCTGGCCACGAGCTAAATTAAAGCGACTGCACTACTGTAAGGTCCGTCACGCAGACGACGGCCCGGGGAGAGCACTAACCCATCAACCTGTACGGGAACTTTCTATATCGTTCTCGGACGGAGAGATAACTACAGTGCCGCTTACAGCCCCTCTGTCGTCGCCGACGTCTGTAATATAGCCTTGTTGTGATTCCACCCTATTGAGGCATTGACTGATGCGGAAGGAGATCTGGAATGAACTGGTCTATGTGACAGAAACTGTGCAAGTACCTAATCTCGTTAGTGTAGGTTCTGACCGATACGTGCTTCGTTGAGAACTCACAATTTTACAACTGGGGACATAAGCCCTACGCCCATCATCTACTGACGTCCCTGAGGCTGCAGTTCATGTAATGGGACAGTATCCGCCGCAAGTTCTAGTGCAATGGCGGTATAGTACGCTCGTACTGTAGTAGAGGCGACACGGGTGGGATCATCACTAATAAGGATACTGGGAAGACTCACAGGCCTCCGCCTATAGGCGGTGCTTACTCTTACATAAAGCGGCTGTTAGTATTACCCCGCGAGGATTCGAAAAGGTGAACCAACCCGGTCGATCCGGAGGGACGGGCCTCAAAGCCGCGTGACGACGGCTGTCGGCCCGTAACAGAATCCCCGCAATAAGCTCCCGTGAGCCTCGATTGAACAGCCCTGGTGGGCCCCATCAGTAGCCCGAATATGTCGCTTTTCGGGTCCTGGGCCGAGGAGCGATACCTTCCAGTAATCGAGGCCGTTCGTTAATTCTTGTTGCGTTCCTAGCGCCTATATTTGTCTCTTTGCCGGCTTATGTGGACAAGCATAGCATAGCCATTTATCGGAGCGCCTCGGTACACGGTATGACCAGACGCCTCGTGAGACCATTACGTATACCAGGTGTCCTGTGAGCAGCGAAGGCCCATACGCGAGATACACTGCCAGAAATCCGCGTGATTACGAGTCGTGGTAAATTTAATCTGGCTGTGGTCTAGACATTCCAGGCGGTGCGTCTGCTGTCGGGTGCCTCTGGTGACTGGCTAGATGGACTTGCCGCTGGTAAACACACCATGACCCCGCCTCTCCATTGATGCCACGGCGAATGTCGGGGAGACAGCAGCGACTGCAGACATCAGATCAGAGTAATACTAACATGCGATAAGTCCCTAACTGACTATGGCCTTCTGTAGAGTCTACTTCACCAGATATGCTGTCTCTGGCACGTGGATGGTTTAGAGGAATCACATTCAAGTCTGGTTAACCATGAAACAAGTCTTGAGTGTAAAATTGTCGTCTCCTGTGTACGAGATGGAGGTACTAGATGACTGCAGGGACTCCGACGTTATGTACGTTGCTCCGTCAAAGGCGCCATTCAGGATCACGTTACCGCCAAAAAATGGGAGCAGGAGCTCTTCTCCCCTGCGGTCACGTCTATAGAAATTACACCTTTAACCCTCCTGAGAACCGGGAGGCGGGAATCCGTCACGTATGAGAAGGTATTTGCCCGACAATCAATACCGGACGCTCCTAAGTTTTTCCACTCGCTTGAGCCGGCTAGGCCTCTCTGCCCGAAGTTTCGACGGACTGCTGCCAACGCCCAGGCATAGTTTTAGGAGGATTATTCGGGGGCACTGGCAACCAACTTCTCGGGTCCTGCCCGACTGGTCTTCGGGCTAATATAGCGAATTGCCGAGAACCCGGCCCCACGCAATGGAACGTCCTTAGCTCCGGCAGGCAATTAAGGGGAACGTAAGTATAGCGCAAAAAAACAGAGAAATAGGCGAATGAATCTATTCTTACTGTATCGAAGAATGGCCTCGCGGAGGCATGTGTCATGCTAGCGTGCGGGGTACTCTAGTTATCCATATGGTCCACAGGACACTCGTTGCTTTCGGATTTGCCCTTTATGCGCCGGTTTTCAGCCACGCTTATGCTCAGCATCGTTATAACCAGACCGATACTAGATCTATAAAGTCCGCCATGCAGACGAGACCAGACGGAGATTACCGAGCAATCTATCAGATCGGCGACCATTAGTGAGCTACTGGAGCCGAGGGGTAACTACGATGCCGCTAAGAACCTCTCGGTCGTCGCTAGCGATTACACTCCAGTCTCATTATAATCGTTCGCTATTCAGGGATTGACCAACACCGGAAAACATTTCACTTGAAGTATTGTATACGACAGAGTCCGTGCACCTACCAAACCTGTTTAATCTAAGTTCAGACTAGTTGGAAGTATGTCTAGATCTCAGATTTCGTCACTAGAGGGCCCACGCTCTATTTTTATGATCCATTGATCTCCCTGACGCTGCAAGATTTGCAACCAGGCAGACTTGGCGGTAGGTCCTAGTGCAGCGGGGCTTTTTTTCTATAGTCCTTGAGAGGAGGAGACGTCAGTCCAGATATCTTTGATGTCCTGATTGGAAGGACCGTTGGCCCCCCACCCTTAGGCAGTGTATACTCTTCCATAAACGAGCTATTAGTTATGAGGTCCGTAGATTGAAAAGGGTGACGGAATTCGGCCGAACGGGAAAGACGGACATCTAGGTATCCTGAGCACGGTTGCGCGTCCGTATCAAGCTCCTCTTTATAGGCCCCGGTTACTGTTGGTCGTAGAGCCCAGAACGGGTTGGGCAGATGTACGACAATATCGCTTAGTCACCCTTGGGCCACGGTCCGCTACCTTACAGGAATTGAGACCGTCCATTAATTTCCCTTGCATATATATTGCGTTTCTTCGACCTTTTAACCGCTCTCTTAGAAGAGAGACAGATAGCTTCTTACCCGTGCCCCACCGTTGGCAGTACGATCGCACGCCCCACGTGAACGATTGGTAAACCCTGTGGCCTGTGAGCGACAAAAGCTTTAATGGGAAATACGCGCCCATAACTTGGTGCGAATACGGGTCGTAGCAATGTTCGTCTGAGTATGATCTATATAATACGGGCGGTACGTCTGCTTTGGTCAGCCTCTAATGGCTCGTATGATAGTGCAGCCGCTGGTGATCACTCAATGATCTCGGCTCCCCGTTGCAACTACGGGGATTCTTGGAGAGCCAGCTGCGTTCGGTATTGTGAGGACAGTGTAGTATTAGCAAACGATAAGTCCCGAACTAGTTGTGACCTAACGAAAAGAGAATTTCATAATACGTGCTGTCCCACGCGCATGGTACATTTGGACAATATTGAATGGAGTCTGATCAACCTTCACACCGATCTAGAATCGAATGCAAAGATCACCCAGGTGCAAATCAAAAATTCTAGGTAACTAGAAGATTTGCGACGTTCTAAGTGTTGGACGATATGAATCGCGACCCAGGATGACGTCGCCCTGAAAAAAAGATTTCTGCAACTCTCCTCGTCAGCAGTCTGGTGTATCGAAAGTACAGGACTAGCCTTCCTAGCAACCGCGGGCTGGGAATCTGAGACATGAGTCAAGATATTTGCTCGGTAACGTATGCTCTAGGCATCTAACTATTCCCTGTGTCTTATAGGGGCCTGCGTTATCTGCCTGTCGAACCATAGGATTCGTGTCAGCGCGCAGGCTTGGATCGAGATGAAATCTCCGGGGCCTAAGACTACGAGCATCTGGCGTCTTGGCTAACCCCCCTACATGTTGTTATAAACAATCAGTGGAAACCCAGTGCTAGAGGATGGAATGACCTTAAATCAGGGACGATATTAAACGGAACGTATATTCAACGCAATGAAGCCGGAGGATTGGCGTGGGAATCGTGCTTCTGTCTAAGCAAGTAAGGGTATGAGGTCGCAACCGTCCCCCAAGCGTACAGGGTGCACTTTGTAACGATTTGGGAGTCCAGAGACTCGCTGTTTTCGAAATTTGCCCTCAAGCGCGAGTATTGAACCAGGCTTACGCCCAAGAACGTAGCAAGCTGACTCAAACAAAATACATTTTGCCCGCGTTACATATGAATCAAGTTGGAAGTTATGGAGCATAGTAACATGTGGACGGCCAGTGGTGGGTTGCTACACCCCTGCGGCAACGTTGAAGCTCCTGGATTACACTGGCTGGATCTAAGCCGTGACACCCGTCATACTCCATAACCGTCTGTAACTCACGGCTTGTTCTGGACTGGATTGCCATTCTCTCAGAGTATTATGCAGGCCGGCGTACGGGTCCCATATAAACCTGTCATAGCTTACCTGACTCTACTTGGAAATGTGGCTAGGTCTTTGCCCACGCACCTAATCGGTCCTCGTTTGCTTTTTAGGACCCGATGAACTACAGAACACTGCAAGAATCTCTACCTGCTTTACAAAGTGCTGGATCCTATTCCAGCGGGATCTTTTATCTAAACACGATGAGAGGAGTATTCGTCAGGCCACATAGCTTTCTTGTTCTGATCGGAACGATCGTTGGCGCCCGACCCCCCGATTCCATAGTGAGTTCTTCGTCCGAGCCATTGTATGCGAGATCGATAGACTGATAGGGGATGCAGTATATCCCTGGATACAATAGACGCACAGGTTGGAATCCTAAGTGAAGTCGCGCGTCCGAACCCAGCTCTATTTTAGAGGTCATGGGTTCTGGTGCCCGCGAGCCGCGGAACCGATTAGGGGCATGTACAACAATATTTATTAGTCATCTTTCAGACACAATCTCCCAGCTCACTGGTATATAGTTCCTGCTATAATTAGCCTCCCTCATAAGTTGCACTACTTCAGCGTCCCAAATGCACCCTTACCACGAAGACAGGATTGTCCGATCCCATATTACGACCTTGGCAGGGGGTTCGCAAGTCCCACCCCAAACGATGCTGAAGGCTCAGGTTTCACAGGGACAAAAGCTTTAAACGCGAGTTCCCGCTCATAACCTGGACCGAATGCAGAATCATGCATCGTTCCACTGTGTTCGTGTCATCTAGGACGGGCGCAAAGGATATATAATTCAATTTTGAATACCTTATATTATTGTACACCTACCGGTCACCAGCCAACAATGTGCGGATGGCGTTACAACTTTCTGGGCCTAATCTGACCGTTCTAGATACCGCACTCTGGGCAATACGAGGTAAAGCCAGTCACCCAGTGTCGATCAACACCTAACCTAACGGTAAGAGGCTCACATAATGGCACTGTCGGCGTCCCCAGGGTATTTTACGTTAGCATCAGGTGGACTAACATGAATCTTTACTCCCAAGCGAAAACGGGTGCGTGGACTAGCGAGGAGCAAACGAAAATTCTTGGCCTGCTTGGTGTCTCGTATTCCTCTTAGAGATCGACGAAATGTTTCACGACCAAGGGAAAGGTCGCCCTACAAAATAGATTTGCGTTACTCTCTCCATAAGGAGTCCGGTGTAGCGAAGGATCAAGGCGACCCTAGGTAGCAACCGCCGGCTTCGGCGGTAAGGTATCACTCAAGAAGCAGACACAGTAAGACACGGTCTAGCTGACTGTCTATCGGCTAGGTCAAATAGAGAGCTTTGATATCTGCATGTCTAGCTTTAGAATTCAGTTTAGCGCGCTGATCTGAGTCGAGATAAAATCACCAGTACCCAAGACCAGGGGGGCTCGCCACGTTGGCTAATCCTGGTACATCTTGTAATCAATATTCAGTAGAAAATTTGTGTTAGAAGGACGAGTCACCATGTACCAATAGCGATAACGATCGGTCGGACTATTCATTGTGGTGATGACGCTGGGTTTACGTGGGAAAGGTGCTTGTGTCCCGACAGGCTAGGATATAATGCTGAGGCCCTTCCCCAAGCGTTCAGCGTGGGATTTGCTACAACTTCCGAGTCCTACATGTGCGTGTTCATGTTATGTATGCACAAGGCCGAGAATAGGACGTAGCCTTCGAGTTAGTACGTAGCGTGGTCGCACAAGCACAGTAGATCCTCCCCGCGCATCCTATTTATTAAGTTAATTCTAAAGCAATACGATCACATGTGGATGGGCAGTGGCCGGTTGTTACACGCCTACCGCGGTGCTGAATGACCGGGACTAAAGAGGCGAAGATTATGGCGTGTGACCCGTTATGCTCGAGTTCGGTCAGTGCGTCATTGCAAGTAGTCGATTGCTTTCTCAATCTCCGAGCGATTTAGCGTGACAGCCCCAGGGAACCCATAAAATGTGATCGCAGTCCATCCGATCGTACATAGAAAGGAAGGTCCCCATACGCCCACGCACCTGTTTACTCGTCGTATGCATAAAAGAGCCGCACGAACCACAGAGCATAAAGAGGACCTCTAGTTCCTTTACAAAGTACAGGTTCGCTTTTCGGCGAGATGCCTTACCTAGATGCAATGACGGACGTATTCCTCTGGCCACATCGGTTCCTGCTTTCGCTGGGATCCAAGATTGGCAGCTGAAGCCGCCTTTCCATAGTGAGTCCTTCGTCTGTGACTAACTGTGCCAAATCGTCTAGCAAACTGCTGATCCAGTTTAACTCACCAAATTATAGCCGTACAGACCGAAATCTTAAGTCATATCACGCGACTAGCCTCTGCTTAATTTTTGTGCTCAAGGGTTTTGGTCCGCCCGAGCGGTGCAGCCGATTAGGACCATGTAATACATTTGTTACAAGACTTCTTTTAAACACTTTCTTCCTGCCCAGTAGCGGATGATAATCGTTGTTGCCAGCCGGCGTGGAAGGTAACAGCACCGGTGCGAGCCTAATGTGCCGTCTCCACGAACACAAGGCTGTCCGATCGTATAATAGGATTCCGCAATGGGGTTAGCAAGTGGCAGCCTAAACGATATCGGGGACTTGCGATGTACATGCTTTGGTACAATACATACGTGATCCAGTTGTTATCCTGCATCGGAACATCAATTGTGCATCGGACCAGCATATTCATGTCATCTAGGAGGCGCGCGTAGGATAAATAATTCAATTAAGATGTCGTTTTGCTAGTATACGTCTAGGCGTCACCCGCCATCTGTGTGCAGGTGGGCCGACGAGACACTGTCCCTGATTTCTCCGCTTCTAATAGCACACACGGGGCAATACCAGCACAAGCCAGTCTCGCAGCAACGCTCGTCAGCAAACGAAAGAGCTTAAGGCTCGCCAATTCGCACTGTCAGGGTCGCTTGGGTGTTTTGCACTAGCGTCAGGTACGCTAGTATGCGTTCTTCCTTCCAGGGGTATGTGGCTGCGTGGTCAAATGTGCGGCATACGTATTTGCTCGACGTGTTTGCTCTCACGAACTTGACCTGGAGATCAAGGAGATGTTTCTTGTCGAACTGGACAGCGCTTCAACGGAACGGATCTACGTTACAGCCTGCATAATGAAAACGGAGTTGCCGACGACGAAAGCGACTTTGGGTTCTGTCTGTTGTCATTGGCGGAAAACTTCCGTTCAGGAGGCGGACACTGATTGACACGGTTTAGCAGAAGGTTTGAGGAATAGGTTAAATTGAGTGGTTTAATAACGGTATGTCTGGGATTAAAGTGTAGTATAGTGTGATTATCGGAGACGGTTTTAAGACACGAGTTCCCAAAATCAAGCGGGGTCATTACAACGGTTATTCCTGGTAGTTTAGGTGTACAATGTCCTGAAGAATATTTAAGAAAAAAGCACCCCTCATCGCCTAGAATTACCTACTACGGTCGACCATACCTTCGATTATCGCGGCCACTCTCGCATTAGTCGGCAGAGGTGGTTGTGTTGCGATAGCCCAGTATAATATTCTAAGGCGTTACCCTGATGAATATCCAACGGAATTGCTATAGGCCTTGAACGCTACACGGACGATACGAAATTATGTATGGACCGGGTCATCAAAAGGTTATACCCTTGTAGTTAACATGTAGCCCGGCCCTATTAGTACAGTAGTGCCTTGAATGGCATTCTCTTTATTAAGTTTTCTCTACAGCTAAACGATCAAGTGCACTTCCACAGAGCGCGGTGGAGATTCATTCACTCGGCAGCTCTGTAATAGGGACTAAAAAAGTGATGATAATCATGAGTGCCGCGTTATGGTGGTGTCGGAACAGAGCGGTCTTACGGCCAGTCGTATGCCTTCTCGAGTTCCGTCCAGTTAAGCGTGACAGTCCCAGTGTACCCACAAACCGTGATGGCTGTGCTTGGAGTCAATCGCAAGTAGGATGGTCTCCAGACACCGGGGCACCAGTTTTCACGCCGAAAGCATAAACGACGAGCAGATATGAAAGTGTTAGAACTGGACGTGCCGTTTCTCTGCGAAGAACACCTCGAGCTGTAGCGTTGTTGCGCTGCCTAGATGCAGTGTTGCTCATATCACATTTGCTTCAACGACTGCCGCCTTCGCTGTATCCCTAGACACTCAACAGTAAGCGCTTTTTGTAGGCAGGGGCACCCCCTATCAGTGACTGCGCCAAAACATCTTCGGATCCCCTTGTCCAATCAAACTCATCGAATTCTTACATTTAAGACCCTAATATCACATCATTAGTGATTAATTGCCACTGCCAAAATTCTGTCCAGAAGCGTTTTAGTTCGCTCCACTAAAGTTGTTTAAAACGACTACCAAATCCGCATGTTAGGGGATTTCTTATTAATTCTTTTATCGTGAGGAACAGCGGATCTTAATGGATGGCCGCAGGTGGTATGGAAGCTAATAGCGCGGGTGAGAGGGTAATCAGCCGTGTTCACCTACACAACGCTAACGGGCGATTCTATAAGATTCCGCATTGCGTCTACTTATAAGATGTCTCAACGGTATCCGCAACTTGTGAAGTGCCTACTATCCTTAAACGCATATCTCGCCCAGTAGCTTCCCAATATGTGAGCATCAATTGTTGTCCGGGCCGAGATAGTCATGTGCTCACGGAACTTACTGTATGAGTAGTGATTTGAAAGAGTTGTCAGTTTGCTGGTTCAGGTAAAGGTTCCTCACGCTACCTCAAAGTAAGAGAGCGGTCGTGACATTATCCGTGATTTTCTCACTACTATTAGTACTCACGACTCGATTCTGCCGCAGCCATGTTTCGCCAGAATGCCAGTCAGCATTAAGGAGAGCTCAGGGCAGGTCAACTCGCATAGTGAGGGTTACATGTTCGTTGGGCTCTTCCGACACGAACCTCAGTTAGCCTACATCCTACCAGAGGTCTGTGCCCCGGTGGTGAGAAGTGCGGATTTCGTATTTGCAGCTCGTCAGTACTTTCAGAATCATGGCCTGCACGGCAAAATGACGCTTATAATGGACTTCGACATGGCAATAACGCCTCGTTTCTACGTCAGGAGGAGAATAGTATAAACATAACTGCTGTCGGCAGAAGCGCCAAAGGAGTCTCTGAATTCTTATTCCCGAATAACATCCGTCTCCGTGCGGGAAAATCACCGACGGCGTTTTATAGAAGCCTAGGGGAACAGATTGGTCTAATTAGCTTAAGAGAGTAAATTCTGGGATCATTCAGTAGTAATCACAAATACGGTGGGGCTTTTTTGGCGGATCTTTACAGATACTAACCAGGTGATTTCAACTAATTTAGTTGACGATTTAGGCGCGCTATCCCGTAATCTTCAAATTAAAACATAGCGTTCCATGAGGGCTAGAATTACTTACCGGCCTTCACCATGCCTGCGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACGTGTAACGTAGCTAAGTATTCTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAG
Binary file test-data/test1.png has changed
Binary file test-data/test1_copy.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test1_iscc.txt	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,1 @@
+K4AOMGOGQJA4Y46PAC4YPPA63GKD5RVFPR7FU3I4OOEW44TYXNYOTMY
\ No newline at end of file
Binary file test-data/test2.tiff has changed
Binary file test-data/test2_similar.tiff has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test3.fasta	Fri Dec 19 15:02:49 2025 +0000
@@ -0,0 +1,58 @@
+>sp|P51587|BRCA2_HUMAN Breast cancer type 2 susceptibility protein OS=Homo sapiens OX=9606 GN=BRCA2 PE=1 SV=4
+MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEPAEESEHKNNNYEPN
+LFKTPQRKPSYNQLASTPIIFKEQGLTLPLYQSPVKELDKFKLDLGRNVPNSRHKSLRTV
+KTKMDQADDVSCPLLNSCLSESPVVLQCTHVTPQRDKSVVCGSLFHTPKFVKGRQTPKHI
+SESLGAEVDPDMSWSSSLATPPTLSSTVLIVRNEEASETVFPHDTTANVKSYFSNHDESL
+KKNDRFIASVTDSENTNQREAASHGFGKTSGNSFKVNSCKDHIGKSMPNVLEDEVYETVV
+DTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEP
+NDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCD
+QNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDC
+ILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGL
+EIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSY
+KGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLT
+SSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCEND
+PKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLS
+NLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYM
+RVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNN
+LALGNTKELHETDLTCVNEPIFKNSTMVLYGDTGDKQATQVSIKKDLVYVLAEENKNSVK
+QHIKMTLGQDLKSDISLNIDKIPEKNNDYMNKWAGLLGPISNHSFGGSFRTASNKEIKLS
+EHNIKKSKMFFKDIEEQYPTSLACVEIVNTLALDNQKKLSKPQSINTVSAHLQSSVVVSD
+CKNSHITPQMLFSKQDFNSNHNLTPSQKAEITELSTILEESGSQFEFTQFRKPSYILQKS
+TFEVPENQMTILKTTSEECRDADLHVIMNAPSIGQVDSSKQFEGTVEIKRKFAGLLKNDC
+NKSASGYLTDENEVGFRGFYSAHGTKLNVSTEALQKAVKLFSDIENISEETSAEVHPISL
+SSSKCHDSVVSMFKIENHNDKTVSEKNNKCQLILQNNIEMTTGTFVEEITENYKRNTENE
+DNKYTAASRNSHNLEFDGSDSSKNDTVCIHKDETDLLFTDQHNICLKLSGQFMKEGNTQI
+KEDLSDLTFLEVAKAQEACHGNTSNKEQLTATKTEQNIKDFETSDTFFQTASGKNISVAK
+ESFNKIVNFFDQKPEELHNFSLNSELHSDIRKNKMDILSYEETDIVKHKILKESVPVGTG
+NQLVTFQGQPERDEKIKEPTLLGFHTASGKKVKIAKESLDKVKNLFDEKEQGTSEITSFS
+HQWAKTLKYREACKDLELACETIEITAAPKCKEMQNSLNNDKNLVSIETVVPPKLLSDNL
+CRQTENLKTSKSIFLKVKVHENVEKETAKSPATCYTNQSPYSVIENSALAFYTSCSRKTS
+VSQTSLLEAKKWLREGIFDGQPERINTADYVGNYLYENNSNSTIAENDKNHLSEKQDTYL
+SNSSMSNSYSYHSDEVYNDSGYLSKNKLDSGIEPVLKNVEDQKNTSFSKVISNVKDANAY
+PQTVNEDICVEELVTSSSPCKNKNAAIKLSISNSNNFEVGPPAFRIASGKIVCVSHETIK
+KVKDIFTDSFSKVIKENNENKSKICQTKIMAGCYEALDDSEDILHNSLDNDECSTHSHKV
+FADIQSEEILQHNQNMSGLEKVSKISPCDVSLETSDICKCSIGKLHKSVSSANTCGIFST
+ASGKSVQVSDASLQNARQVFSEIEDSTKQVFSKVLFKSNEHSDQLTREENTAIRTPEHLI
+SQKGFSYNVVNSSAFSGFSTASGKQVSILESSLHKVKGVLEEFDLIRTEHSLHYSPTSRQ
+NVSKILPRVDKRNPEHCVNSEMEKTCSKEFKLSNNLNVEGGSSENNHSIKVSPYLSQFQQ
+DKQQLVLGTKVSLVENIHVLGKEQASPKNVKMEIGKTETFSDVPVKTNIEVCSTYSKDSE
+NYFETEAVEIAKAFMEDDELTDSKLPSHATHSLFTCPENEEMVLSNSRIGKRRGEPLILV
+GEPSIKRNLLNEFDRIIENQEKSLKASKSTPDGTIKDRRLFMHHVSLEPITCVPFRTTKE
+RQEIQNPNFTAPGQEFLSKSHLYEHLTLEKSSSNLAVSGHPFYQVSATRNEKMRHLITTG
+RPTKVFVPPFKTKSHFHRVEQCVRNINLEENRQKQNIDGHGSDDSKNKINDNEIHQFNKN
+NSNQAVAVTFTKCEEEPLDLITSLQNARDIQDMRIKKKQRQRVFPQPGSLYLAKTSTLPR
+ISLKAAVGGQVPSACSHKQLYTYGVSKHCIKINSKNAESFQFHTEDYFGKESLWTGKGIQ
+LADGGWLIPSNDGKAGKEEFYRALCDTPGVDPKLISRIWVYNHYRWIIWKLAAMECAFPK
+EFANRCLSPERVLLQLKYRYDTEIDRSRRSAIKKIMERDDTAAKTLVLCVSDIISLSANI
+SETSSNKTSSADTQKVAIIELTDGWYAVKAQLDPPLLAVLKNGRLTVGQKIILHGAELVG
+SPDACTPLEAPESLMLKISANSTRPARWYTKLGFFPDPRPFPLPLSSLFSDGGNVGCVDV
+IIQRAYPIQWMEKTSSGLYIFRNEREEEKEAAKYVEAQQKRLEALFTKIQEEFEEHEENT
+TKPYLPSRALTRQQVRALQDGAELYEAVKNAADPAYLEGYFSEEQLRALNNHRQMLNDKK
+QAQIQLEIRKAMESAEQKEQGLSRDVTTVWKLRIVSYSKKEKDSVILSIWRPSSDLYSLL
+TEGKRYRIYHLATSKSKSKSERANIQLAATKKTQYQQLPVSDEILFQIYQPREPLHFSKF
+LDPDFQPSCSEVDLIGFVVSVVKKTGLAPFVYLSDECYNLLAIKFWIDLNEDIIKPHMLI
+AASNLQWRPESKSGLLTLFAGDFSVFSASPKEGHFQETFNKMKNTVENIDILCNEAENKL
+MHILHANDPKWSTPTKDCTSGPYTAQIIPGTGNKLLMSSPNCEIYYQSPLSLCMAKRKSV
+STPVSAQMTSKSCKGEKEIDDQKNCKKRRALDFLSRLPLPPPVSPICTFVSPAAQKAFQP
+PRSCGTKYETPIKKKELNSPQMTPFKKFNEISLLESNSIADEELALINTQALLSGSTGEK
+QFISVSESTRTAPTSSEDYLRLKRRCTTSLIKEQESSQASTEECEKNKQDTITTKKYI