Mercurial > repos > tduigou > create_assembly_picklists

diff CreateAssemblyPicklists_script.py @ 0:4bde3e90ee98 draft
planemo upload for repository https://github.com/Edinburgh-Genome-Foundry/Plateo commit 98d5e65b8008dbca117b2e0655cfdd54655fac48-dirty
author: tduigou
date: Wed, 06 Aug 2025 08:02:58 +0000
children: 196e13c09881
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CreateAssemblyPicklists_script.py	Wed Aug 06 08:02:58 2025 +0000
@@ -0,0 +1,588 @@
+#!/usr/bin/env python
+# coding: utf-8
+# Code copied from CUBA backend tools.py and create_assembly_picklists/CreateAssemblyPicklistsView.py
+# Code modified for running in a script in Galaxy.
+##############################################################################
+##############################################################################
+# App code
+## EGF Galaxy Create assembly picklists -- script
+
+##############################################################################
+# IMPORTS
+import argparse
+import os
+from io import StringIO, BytesIO
+import re
+from base64 import b64encode, b64decode
+from copy import deepcopy
+import sys
+
+from collections import OrderedDict
+from fuzzywuzzy import process
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import pandas
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+
+import bandwagon as bw
+import crazydoc
+from dnachisel.biotools import sequence_to_biopython_record
+import dnacauldron
+import flametree
+from plateo import AssemblyPlan
+from plateo.parsers import plate_from_content_spreadsheet
+from plateo.containers import Plate4ti0960
+from plateo.exporters import AssemblyPicklistGenerator, picklist_to_assembly_mix_report
+from plateo.exporters import (
+    picklist_to_labcyte_echo_picklist_file,
+    picklist_to_tecan_evo_picklist_file,
+    plate_to_platemap_spreadsheet,
+    PlateTextPlotter,
+)
+from plateo.tools import human_volume
+from snapgene_reader import snapgene_file_to_seqrecord
+
+
+##############################################################################
+# FUNCTIONS
+
+def fix_and_rename_paths(paths):
+    """Replace ``__sq__`` markers by single quotes in paths, renaming on disk.
+
+    For each path, every ``__sq__`` occurrence is turned into ``'``. When
+    this changes the path, the file is renamed to the new path with
+    ``os.rename``. The (possibly updated) paths are returned in input order.
+    """
+    def _restore(original):
+        restored = original.replace("__sq__", "'")
+        if restored != original:
+            os.rename(original, restored)
+        return restored
+
+    return [_restore(original) for original in paths]
+
+
+def did_you_mean(name, other_names, limit=5, min_score=50):
+    """Return the entries of ``other_names`` that best resemble ``name``.
+
+    At most ``limit`` candidates are requested from fuzzywuzzy's
+    ``process.extract``; candidates scoring below ``min_score`` are dropped.
+    """
+    candidates = list(other_names)
+    suggestions = []
+    for candidate, score in process.extract(name, candidates, limit=limit):
+        if score >= min_score:
+            suggestions.append(candidate)
+    return suggestions
+
+
+def fix_ice_genbank(genbank_txt):
+    """Pad the first line of a Genbank text with spaces up to 80 characters.
+
+    The text is split into lines, the first line is right-padded to a
+    minimum width of 80, and the lines are joined back with ``"\\n"``.
+    Empty input is returned unchanged (it has no first line to pad).
+
+    NOTE(review): judging by the name, this presumably works around short
+    header lines in ICE exports - confirm against the parser's requirements.
+    """
+    lines = genbank_txt.splitlines()
+    if not lines:
+        # Nothing to pad; indexing lines[0] would raise IndexError.
+        return genbank_txt
+    lines[0] = lines[0].ljust(80)
+    return "\n".join(lines)
+
+
+def write_record(record, target, fmt="genbank"):
+    """Write one record (or a list/tuple of records) via Biopython's SeqIO.
+
+    The input is deep-copied first, so the caller's records are never
+    modified. For the "genbank" format, record names are truncated to their
+    first 20 characters before writing.
+
+    ``target`` is handed to ``SeqIO.write``. If it exposes an ``open``
+    method, ``target.open("w")`` is used as the output handle instead, and
+    that handle is closed once writing is done (including on error), since
+    it was opened here and the caller has no reference to it.
+    """
+    record = deepcopy(record)
+    if fmt == "genbank":
+        to_trim = record if isinstance(record, (list, tuple)) else [record]
+        for rec in to_trim:
+            rec.name = rec.name[:20]
+    if not hasattr(target, "open"):
+        SeqIO.write(record, target, fmt)
+        return
+    handle = target.open("w")
+    try:
+        SeqIO.write(record, handle, fmt)
+    finally:
+        # Only close what this function opened; tolerate handle objects
+        # that do not provide a close() method.
+        close = getattr(handle, "close", None)
+        if callable(close):
+            close()
+
+
+def autoname_genbank_file(record):
+    """Return ``<record.id>.gb`` with every dot of the id turned into ``_``."""
+    safe_id = "_".join(record.id.split("."))
+    return "%s.gb" % safe_id
+
+
+def string_to_records(string):
+    """Parse a sequence text into Biopython records and report its format.
+
+    The following interpretations are tried in order:
+
+    1. a plain sequence made only of the letters A, T, G, C ("ATGC"),
+    2. FASTA ("fasta"),
+    3. Genbank ("genbank"), after padding the first line with
+       ``fix_ice_genbank``,
+    4. SnapGene ("snapgene").
+
+    Returns
+    -------
+    (records, fmt)
+      A list of records and the name of the format that was recognized.
+      Every successful path returns this same 2-tuple, so callers can always
+      unpack the result.
+
+    Raises
+    ------
+    ValueError
+      If none of the interpretations yields at least one record.
+    """
+    if re.fullmatch(r"[ATGC]+", string):
+        return [SeqRecord(Seq(string))], "ATGC"
+
+    for fmt in ("fasta", "genbank"):
+        try:
+            text = fix_ice_genbank(string) if fmt == "genbank" else string
+            records = list(SeqIO.parse(StringIO(text), fmt))
+        except Exception:
+            # Best effort: a parser failure only means "not this format".
+            continue
+        if records:
+            return records, fmt
+    try:
+        # Same keyword as the other snapgene_file_to_seqrecord calls in
+        # this file.
+        record = snapgene_file_to_seqrecord(fileobject=StringIO(string))
+    except Exception:
+        pass
+    else:
+        return [record], "snapgene"
+    raise ValueError("Invalid sequence format")
+
+
+def file_to_filelike_object(file_, type="byte"):
+    """Turn an uploaded file's base64 data URL into an in-memory file object.
+
+    ``file_.content`` must contain the marker ``base64,`` followed by the
+    base64-encoded payload. With ``type="byte"`` (default) a ``BytesIO`` over
+    the decoded bytes is returned; with any other value the bytes are decoded
+    as UTF-8 and a ``StringIO`` is returned (``StringIO`` cannot be built
+    from bytes directly).
+    """
+    payload = b64decode(file_.content.split("base64,")[1])
+    if type == "byte":
+        return BytesIO(payload)
+    return StringIO(payload.decode("utf-8"))
+
+
+def spreadsheet_file_to_dataframe(filedict, header="infer"):
+    """Load an uploaded spreadsheet into a pandas dataframe.
+
+    Files whose name ends with ``.csv`` are read with ``pandas.read_csv``,
+    all others with ``pandas.read_excel``. ``header`` is forwarded to the
+    pandas reader.
+    """
+    filelike = file_to_filelike_object(filedict)
+    is_csv = filedict.name.endswith(".csv")
+    reader = pandas.read_csv if is_csv else pandas.read_excel
+    return reader(filelike, header=header)
+
+
+def records_from_zip_file(zip_file, use_file_names_as_ids=False):
+    """Extract sequence records from every sequence file of an uploaded zip.
+
+    Only files with extension gb, gbk, fa or dna (case-insensitive) are
+    considered. Each file is parsed as text with ``string_to_records``; if
+    that fails it is parsed as a SnapGene file, then as a document with
+    crazydoc.
+
+    Each record receives a ``file_name`` and ``zip_file_name`` attribute.
+    Records without a usable id are named after their file, with a 4-digit
+    index appended when the file holds several records. When
+    ``use_file_names_as_ids`` is true, the id of a file's only record is
+    replaced by the file's base name without extension.
+
+    Raises
+    ------
+    ValueError
+      If a candidate file cannot be parsed by any of the parsers.
+    """
+    placeholder_ids = (None, "", "<unknown id>", ".", " ", "<unknown name>")
+    zip_name = zip_file.name
+    zip_tree = flametree.file_tree(file_to_filelike_object(zip_file))
+    records = []
+    for f in zip_tree._all_files:
+        if f._extension.lower() not in ("gb", "gbk", "fa", "dna"):
+            continue
+        try:
+            new_records, fmt = string_to_records(f.read())
+            if not isinstance(new_records, list):
+                new_records = [new_records]
+        except Exception:
+            content_stream = BytesIO(f.read("rb"))
+            try:
+                record = snapgene_file_to_seqrecord(fileobject=content_stream)
+                new_records, fmt = [record], "snapgene"
+            except Exception:
+                try:
+                    # The failed SnapGene attempt may have consumed part of
+                    # the stream: restart from the beginning.
+                    content_stream.seek(0)
+                    parser = crazydoc.CrazydocParser(
+                        ["highlight_color", "bold", "underline"]
+                    )
+                    new_records = parser.parse_doc_file(content_stream)
+                    fmt = "doc"
+                except Exception as err:
+                    raise ValueError(
+                        "Format not recognized for file " + f._path
+                    ) from err
+
+        single_record = len(new_records) == 1
+        for i, record in enumerate(new_records):
+            name = record.id
+            if name in placeholder_ids:
+                number = "" if single_record else ("%04d" % i)
+                name = f._name_no_extension.replace(" ", "_") + number
+            record.id = name
+            record.name = name
+            record.file_name = f._name_no_extension
+            record.zip_file_name = zip_name
+            if use_file_names_as_ids and single_record:
+                basename = os.path.basename(record.file_name)
+                record.id = os.path.splitext(basename)[0]
+        records += new_records
+    return records
+
+
+def records_from_data_file(data_file):
+    """Parse one uploaded file into sequence records, trying several formats.
+
+    The base64 payload of ``data_file.content`` is decoded and the following
+    parsers are tried in order, the first success winning:
+
+    1. UTF-8 text via ``string_to_records`` (format as reported by it),
+    2. SnapGene ("snapgene"),
+    3. crazydoc document ("doc"),
+    4. headerless two-column spreadsheet of (name, sequence)
+       ("spreadsheet").
+
+    Returns
+    -------
+    (records, fmt)
+      The list of records and the name of the recognized format.
+
+    Raises
+    ------
+    ValueError
+      If no parser succeeds; the last parser error is attached as cause.
+    """
+    content = b64decode(data_file.content.split("base64,")[1])
+
+    def _from_text():
+        return string_to_records(content.decode("utf-8"))
+
+    def _from_snapgene():
+        record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
+        return [record], "snapgene"
+
+    def _from_doc():
+        parser = crazydoc.CrazydocParser(["highlight_color", "bold", "underline"])
+        return parser.parse_doc_file(BytesIO(content)), "doc"
+
+    def _from_spreadsheet():
+        df = spreadsheet_file_to_dataframe(data_file, header=None)
+        parsed = [
+            sequence_to_biopython_record(sequence=seq, id=name, name=name)
+            for name, seq in df.values
+        ]
+        return parsed, "spreadsheet"
+
+    last_error = None
+    for attempt in (_from_text, _from_snapgene, _from_doc, _from_spreadsheet):
+        try:
+            records, fmt = attempt()
+        except Exception as err:
+            # Best effort: a failure only means "not this format".
+            last_error = err
+            continue
+        break
+    else:
+        raise ValueError(
+            "Format not recognized for file " + data_file.name
+        ) from last_error
+    if not isinstance(records, list):
+        records = [records]
+    return records, fmt
+
+
+def record_to_formated_string(record, fmt="genbank", remove_descr=False):
+    """Serialize a record (or list/tuple of records) to UTF-8 encoded bytes.
+
+    The text is produced by ``write_record`` in the requested ``fmt``. With
+    ``remove_descr`` set, the records are deep-copied and the copies'
+    descriptions are blanked before writing.
+    """
+    if remove_descr:
+        record = deepcopy(record)
+        targets = record if isinstance(record, (list, tuple)) else [record]
+        for item in targets:
+            item.description = ""
+    buffer = StringIO()
+    write_record(record, buffer, fmt)
+    text = buffer.getvalue()
+    return text.encode("utf-8")
+
+
+def records_from_data_files(data_files, use_file_names_as_ids=False):
+    """Collect the sequence records of several uploaded files.
+
+    Files whose name ends with "zip" (case-insensitive) are delegated to
+    ``records_from_zip_file``. Other files are parsed with
+    ``records_from_data_file`` and each resulting record gets:
+
+    - ``circular`` / ``linear`` flags (circular unless the file entry has a
+      "circular" key set to a false value),
+    - a ``file_name`` attribute: the file name without its last extension
+      and without dots,
+    - an id and a name derived from the file name when the parsed ones are
+      placeholders; a 4-digit index is appended when the file holds several
+      records, and spaces become underscores.
+
+    When ``use_file_names_as_ids`` is true, the id of a file's only record
+    is replaced by the base name of ``record.file_name``, as
+    ``records_from_zip_file`` does.
+    """
+    # Part names should stay under 20 characters and contain no dots for
+    # the naming below to behave predictably.
+    unknown_ids = (
+        "None",
+        "",
+        "<unknown id>",
+        ".",
+        "EXPORTED",
+        "<unknown name>",
+        "Exported",
+    )
+    records = []
+    for file_ in data_files:
+        circular = ("circular" not in file_) or file_.circular
+        if file_.name.lower().endswith("zip"):
+            records += records_from_zip_file(
+                file_, use_file_names_as_ids=use_file_names_as_ids
+            )
+            continue
+        recs, fmt = records_from_data_file(file_)
+        single_record = len(recs) == 1
+        name_no_extension = "".join(file_.name.split(".")[:-1])
+        for i, record in enumerate(recs):
+            record.circular = circular
+            record.linear = not circular
+            name = name_no_extension + ("" if single_record else ("%04d" % i))
+            name = name.replace(" ", "_")
+            if str(record.id).strip() in unknown_ids:
+                record.id = name
+            if str(record.name).strip() in unknown_ids:
+                record.name = name
+            record.file_name = name_no_extension
+            if use_file_names_as_ids and single_record:
+                # `record.source_file` is never set in this code path; the
+                # file name just stored on the record is the right source.
+                basename = os.path.basename(record.file_name)
+                record.id = os.path.splitext(basename)[0]
+        records += recs
+    return records
+
+
+def data_to_html_data(data, datatype, filename=None):
+    """Encode bytes as a ``data:<mimetype>;...base64,<payload>`` string.
+
+    ``datatype`` may be one of the shortcuts zip, genbank, fasta, pdf or
+    xlsx, which are expanded to their mimetype; any other value is used
+    verbatim. When ``filename`` is given, a ``headers=filename%3D<filename>;``
+    section is inserted before the base64 payload.
+    """
+    mimetypes = {
+        "zip": "application/zip",
+        "genbank": "application/genbank",
+        "fasta": "application/fasta",
+        "pdf": "application/pdf",
+        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    }
+    prefix = "data:%s;" % mimetypes.get(datatype, datatype)
+    payload = "base64,%s" % b64encode(data).decode("utf-8")
+    if filename is None:
+        return prefix + payload
+    return prefix + "headers=filename%3D" + filename + ";" + payload
+
+
+def zip_data_to_html_data(data):
+    """Encode zip bytes as an ``application/zip`` base64 data string."""
+    zip_mimetype = "application/zip"
+    return data_to_html_data(data, datatype=zip_mimetype)
+
+
+# Ladder objects from the bandwagon package (imported as ``bw``), by name.
+LADDERS = {"100_to_4k": bw.ladders.LADDER_100_to_4k}
+
+
+def matplotlib_figure_to_svg_base64_data(fig, **kwargs):
+    """Render a figure as SVG and return it as a base64 data string.
+
+    The result has the form ``data:image/svg+xml;base64,XXX``. Extra keyword
+    arguments are forwarded to ``fig.savefig``. The first four lines of the
+    generated SVG text are dropped and the remaining lines are concatenated
+    without line breaks before encoding.
+    """
+    buffer = BytesIO()
+    fig.savefig(buffer, format="svg", **kwargs)
+    svg_lines = buffer.getvalue().decode("utf-8").split("\n")
+    flattened = "".join(svg_lines[4:])
+    encoded = b64encode(flattened.encode("utf-8")).decode("utf-8")
+    return "data:image/svg+xml;base64," + encoded
+
+
+def matplotlib_figure_to_bitmap_base64_data(fig, fmt="png", **kwargs):
+    """Render a figure as a bitmap and return it as a base64 data string.
+
+    The result has the form ``data:image/<fmt>;base64,XXX`` where XXX is the
+    base64-encoded image saved by ``fig.savefig`` in format ``fmt``. Extra
+    keyword arguments are forwarded to ``fig.savefig``.
+    """
+    buffer = BytesIO()
+    fig.savefig(buffer, format=fmt, **kwargs)
+    header = b"data:image/" + fmt.encode("utf-8") + b";base64,"
+    encoded = header + b64encode(buffer.getvalue())
+    return encoded.decode("utf-8")
+
+
+def figures_to_pdf_report_data(figures, filename="report.pdf"):
+    """Bundle figures into one PDF and return it as a download description.
+
+    Each figure is saved with ``bbox_inches="tight"`` through matplotlib's
+    ``PdfPages``. The returned dict holds the PDF as a base64 data string
+    under "data", plus the "name" (``filename``) and the "mimetype".
+    """
+    buffer = BytesIO()
+    with PdfPages(buffer) as pdf:
+        for figure in figures:
+            pdf.savefig(figure, bbox_inches="tight")
+    encoded = b64encode(buffer.getvalue()).decode("utf-8")
+    report = {}
+    report["data"] = "data:application/pdf;base64," + encoded
+    report["name"] = filename
+    report["mimetype"] = "application/pdf"
+    return report
+
+
+def csv_to_list(csv_string, sep=","):
+    """Flatten a separated text into the list of its non-blank elements.
+
+    The text is split on newlines, then each line on ``sep``. Elements are
+    stripped of surrounding whitespace; elements that are empty after
+    stripping are skipped.
+    """
+    items = []
+    for line in csv_string.split("\n"):
+        for raw in line.split(sep):
+            cleaned = raw.strip()
+            if cleaned:
+                items.append(cleaned)
+    return items
+
+
+def set_record_topology(record, topology):
+    """Store a topology in ``record.annotations['topology']``.
+
+    ``topology`` is one of "circular", "linear", "default_to_circular" or
+    "default_to_linear". The plain values always overwrite the annotation.
+    The ``default_to_`` variants only set it (to "circular" or "linear")
+    when the record has no topology annotation yet.
+
+    Raises ``ValueError`` for any other ``topology`` value.
+    """
+    valid_topologies = (
+        "circular",
+        "linear",
+        "default_to_circular",
+        "default_to_linear",
+    )
+    if topology not in valid_topologies:
+        raise ValueError(
+            "topology (%s) should be one of %s."
+            % (topology, ", ".join(valid_topologies))
+        )
+    annotations = record.annotations
+    default_prefix = "default_to_"
+    only_if_missing = topology.startswith(default_prefix)
+    if only_if_missing and "topology" in annotations:
+        return
+    if only_if_missing:
+        topology = topology[len(default_prefix):]
+    annotations["topology"] = topology
+
+
+##############################################################################
+def _read_assembly_rows(plan_path):
+    """Return the rows (lists of cells) of a CSV or Excel assembly plan."""
+    if plan_path.endswith(".csv"):
+        # "utf-8-sig" also reads plain UTF-8 and drops a leading BOM if any.
+        with open(plan_path, "r", encoding="utf-8-sig") as handle:
+            lines = handle.read().splitlines()
+        return [line.split(",") for line in lines if len(line)]
+    # header=None keeps the first row as data, like the CSV branch does;
+    # header rows are filtered out by name when the plan is built.
+    dataframe = pandas.read_excel(plan_path, header=None)
+    return dataframe.values.tolist()
+
+
+def _load_parts_data(part_paths, use_file_names_as_ids):
+    """Build the parts data dict from a part-size table or sequence files."""
+    table_extensions = (".csv", ".xls", ".xlsx")
+    if len(part_paths) == 1 and part_paths[0].lower().endswith(table_extensions):
+        # Part sizes specified in a table with "part" and "size" columns.
+        table_path = part_paths[0]
+        if table_path.lower().endswith(".csv"):
+            dataframe = pandas.read_csv(table_path)
+        else:
+            dataframe = pandas.read_excel(table_path)
+        return {
+            row["part"]: {"size": row["size"]} for _, row in dataframe.iterrows()
+        }
+    records = dnacauldron.biotools.load_records_from_files(
+        files=part_paths, use_file_names_as_ids=use_file_names_as_ids
+    )
+    # NOTE(review): ids are lower-cased here while part names of the assembly
+    # plan are not - confirm that plan part names are expected in lower case.
+    return {rec.id.replace(" ", "_").lower(): {"record": rec} for rec in records}
+
+
+def _part_amounts(quantity_unit, part_quantity, total_volume):
+    """Return ``(part_mol, part_g)`` for the requested quantity unit."""
+    if quantity_unit == "fmol":
+        return part_quantity * 1e-15, None
+    if quantity_unit == "nM":
+        # nM (1e-9 mol/L) times a volume in uL (1e-6 L) gives 1e-15 mol.
+        return part_quantity * total_volume * 1e-15, None
+    if quantity_unit == "ng":
+        return None, part_quantity * 1e-9
+    raise ValueError("Unknown quantity unit: %s" % quantity_unit)
+
+
+def main():
+    """Command-line entry point: write assembly picklists and reports to a zip.
+
+    Reads the assembly plan (CSV or Excel), the parts (sequence files, or a
+    single CSV/Excel table of part sizes) and the source plate spreadsheet,
+    generates the picklist for a new "Mixplate" destination plate filled by
+    column, and writes into the result zip: the final mixplate map (PDF and
+    spreadsheet), the assembly mix report, the assembly plan summary and the
+    picklist file for the selected dispenser.
+
+    On missing part data or missing source wells, a diagnostic is printed
+    and the process exits with a non-zero status.
+    """
+    parser = argparse.ArgumentParser(description="Generate picklist for DNA assembly.")
+    parser.add_argument("--parts_files", required=True, help="Directory with parts data or file with part sizes")
+    parser.add_argument("--picklist", type=str, required=True, help="Path to the assembly plan CSV or Excel file")
+    parser.add_argument("--source_plate", help="Source plate file (CSV or Excel)")
+    parser.add_argument("--backbone_name", help="Name of the backbone")
+    parser.add_argument("--result_zip", help="Name of the output zip file")
+    parser.add_argument("--part_backbone_ratio", type=float, help="Part to backbone molar ratio")
+    parser.add_argument("--quantity_unit", choices=["fmol", "nM", "ng"], required=True, help="Quantity unit")
+    parser.add_argument("--part_quantity", type=float, required=True, help="Quantity of each part")
+    parser.add_argument("--buffer_volume", type=float, required=True, help="Buffer volume in µL")
+    parser.add_argument("--total_volume", type=float, required=True, help="Total reaction volume in µL")
+    parser.add_argument("--dispenser", choices=["labcyte_echo", "tecan_evo"], help="Dispenser machine")
+
+    args = parser.parse_args()
+
+    # Parameters:
+    plan_path = args.picklist  # assembly plan
+    # Comma-separated sequence files, or one csv/Excel table with part sizes.
+    part_paths = fix_and_rename_paths(args.parts_files.split(","))
+    source_plate_path = args.source_plate
+    result_zip_file = args.result_zip  # output file name "picklist.zip"
+    total_volume = args.total_volume  # (µL)
+    buffer_volume = args.buffer_volume  # (µL)
+    dispenser_machine = args.dispenser
+    use_file_names_as_ids = True
+
+    part_backbone_ratio = args.part_backbone_ratio
+    if args.quantity_unit == "ng" or part_backbone_ratio is None:
+        # With "ng" the backbone:part molar ratio calculation is not
+        # performed; a ratio of 1 leaves the backbone quantity unchanged.
+        part_backbone_ratio = 1
+
+    # Reading the assembly plan
+    header_names = ("nan", "Construct name", "constructs", "construct")
+    ignored_cells = ("-", "nan", "")
+    assemblies = OrderedDict()
+    for row in _read_assembly_rows(plan_path):
+        if str(row[0]).strip() in header_names:
+            continue
+        cells = [str(cell).strip() for cell in row[1:]]
+        assemblies[row[0]] = [cell for cell in cells if cell not in ignored_cells]
+    assembly_plan = AssemblyPlan(assemblies)
+    for assembly, parts in assembly_plan.assemblies.items():
+        assembly_plan.assemblies[assembly] = [part.replace(" ", "_") for part in parts]
+
+    # Reading part infos
+    assembly_plan.parts_data = _load_parts_data(part_paths, use_file_names_as_ids)
+    parts_without_data = assembly_plan.parts_without_data()
+    if len(parts_without_data):
+        print("success: False")
+        print("message: Some parts have no provided record or data.")
+        print("missing_parts: ", parts_without_data)
+        sys.exit(1)
+
+    # Reading protocol
+    part_mol, part_g = _part_amounts(
+        args.quantity_unit, args.part_quantity, total_volume
+    )
+    print("Generating picklist")
+    picklist_generator = AssemblyPicklistGenerator(
+        part_mol=part_mol,
+        part_g=part_g,
+        complement_to=total_volume * 1e-6,  # convert uL to L
+        buffer_volume=buffer_volume * 1e-6,
+        volume_rounding=2.5e-9,  # not using parameter from form
+        minimal_dispense_volume=5e-9,  # Echo machine's minimum dispense -
+    )
+    backbone_names = [
+        name.strip() for name in (args.backbone_name or "").split(",") if name.strip()
+    ]
+    source_plate = plate_from_content_spreadsheet(source_plate_path)
+
+    for well in source_plate.iter_wells():
+        if well.is_empty:
+            continue
+        quantities = well.content.quantities
+        if not quantities:
+            continue
+        # Only the first component of the well is renamed and adjusted.
+        part, quantity = next(iter(quantities.items()))
+        del quantities[part]
+        if part in backbone_names:
+            # This section multiplies the backbone concentration with the
+            # part:backbone molar ratio. This tricks the calculator into making
+            # a picklist with the desired ratio.
+            # For example, a part:backbone = 2:1 will multiply the
+            # backbone concentration by 2, therefore half as much of it will be
+            # added to the well.
+            quantity = quantity * part_backbone_ratio
+        quantities[part.replace(" ", "_")] = quantity
+
+    source_plate.name = "Source"
+    # The destination is always a new plate.
+    destination_plate = Plate4ti0960("Mixplate")
+    destination_wells = (
+        well for well in destination_plate.iter_wells(direction="column") if well.is_empty
+    )
+    picklist, picklist_data = picklist_generator.make_picklist(
+        assembly_plan,
+        source_wells=source_plate.iter_wells(),
+        destination_wells=destination_wells,
+    )
+    if picklist is None:
+        print("success: False")
+        print("message: Some parts in the assembly plan have no corresponding well.")
+        print("picklist_data: ", picklist_data)
+        print("missing_parts:", picklist_data.get("missing_parts", None))
+        sys.exit(1)
+
+    future_plates = picklist.simulate(inplace=False)
+
+    def text(w):
+        # Label of a well: its volume, preceded by the construct name if any.
+        txt = human_volume(w.content.volume)
+        if "construct" in w.data:
+            txt = "\n".join([w.data["construct"], txt])
+        return txt
+
+    plotter = PlateTextPlotter(text)
+    ax, _ = plotter.plot_plate(future_plates[destination_plate], figsize=(20, 8))
+
+    ziproot = flametree.file_tree(result_zip_file, replace=True)
+
+    # MIXPLATE MAP PLOT
+    ax.figure.savefig(
+        ziproot._file("final_mixplate.pdf").open("wb"),
+        format="pdf",
+        bbox_inches="tight",
+    )
+    plt.close(ax.figure)
+    plate_to_platemap_spreadsheet(
+        future_plates[destination_plate],
+        lambda w: w.data.get("construct", ""),
+        filepath=ziproot._file("final_mixplate.xls").open("wb"),
+    )
+
+    # ASSEMBLY REPORT
+    print("Writing report...")
+    picklist_to_assembly_mix_report(
+        picklist,
+        ziproot._file("assembly_mix_picklist_report.pdf").open("wb"),
+        data=picklist_data,
+    )
+    assembly_plan.write_report(ziproot._file("assembly_plan_summary.pdf").open("wb"))
+
+    # MACHINE PICKLIST
+    if dispenser_machine == "labcyte_echo":
+        picklist_to_labcyte_echo_picklist_file(
+            picklist, ziproot._file("ECHO_picklist.csv").open("w")
+        )
+    else:
+        picklist_to_tecan_evo_picklist_file(
+            picklist, ziproot._file("EVO_picklist.gwl").open("w")
+        )
+    # The input source plate is deliberately not copied into the zip.
+    ziproot._close()
+    print("success: True")
+    
+
+# Run the command-line entry point only when the file is executed as a script.
+if __name__ == "__main__":
+    main()
author	tduigou
date	Wed, 06 Aug 2025 08:02:58 +0000
parents
children	196e13c09881