Mercurial > repos > tduigou > create_assembly_picklists
view CreateAssemblyPicklists_script.py @ 9:c3951e41e488 draft default tip
planemo upload for repository https://github.com/Edinburgh-Genome-Foundry/Plateo commit ed922f834f891e6048385803c3004465551c911d-dirty
author | tduigou |
---|---|
date | Thu, 07 Aug 2025 12:37:19 +0000 |
parents | 196e13c09881 |
children |
line wrap: on
line source
#!/usr/bin/env python
# coding: utf-8
# Code copied from CUBA backend tools.py and
# create_assembly_picklists/CreateAssemblyPicklistsView.py
# Code modified for running in a script in Galaxy.
##############################################################################
##############################################################################
# App code
## EGF Galaxy Create assembly picklists -- script

##############################################################################
# IMPORTS
import argparse
import os
from io import StringIO, BytesIO
import re
from base64 import b64encode, b64decode
from copy import deepcopy
import sys

from collections import OrderedDict
from fuzzywuzzy import process
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import pandas
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import bandwagon as bw
import crazydoc
from dnachisel.biotools import sequence_to_biopython_record
import dnacauldron
import flametree
from plateo import AssemblyPlan
from plateo.parsers import plate_from_content_spreadsheet
from plateo.containers import Plate4ti0960
from plateo.exporters import AssemblyPicklistGenerator, picklist_to_assembly_mix_report
from plateo.exporters import (
    picklist_to_labcyte_echo_picklist_file,
    picklist_to_tecan_evo_picklist_file,
    plate_to_platemap_spreadsheet,
    PlateTextPlotter,
)
from plateo.tools import human_volume
from snapgene_reader import snapgene_file_to_seqrecord

##############################################################################
# FUNCTIONS


def fix_and_rename_paths(paths):
    """Rename files whose path contains ``__sq__`` and return the new paths.

    Every ``__sq__`` in a path is replaced by a single quote; when that
    changes the path, the file is renamed on disk accordingly.

    Returns the list of (possibly renamed) paths, in input order.
    """
    fixed_paths = []
    for path in paths:
        new_path = path.replace("__sq__", "'")
        if new_path != path:
            os.rename(path, new_path)
        fixed_paths.append(new_path)
    return fixed_paths


def parse_optional_float(x):
    """Convert a command-line string to float; the empty string gives None."""
    if x == "":
        return None
    return float(x)


def did_you_mean(name, other_names, limit=5, min_score=50):
    """Return the names of ``other_names`` that fuzzily match ``name``.

    At most ``limit`` candidates are requested from fuzzywuzzy, and only
    those scoring at least ``min_score`` are returned.
    """
    results = process.extract(name, list(other_names), limit=limit)
    return [e for (e, score) in results if score >= min_score]


def fix_ice_genbank(genbank_txt):
    """Pad the first line of a Genbank text with spaces up to 80 characters.

    An empty text is returned unchanged (the original raised IndexError).
    """
    lines = genbank_txt.splitlines()
    if not lines:
        return genbank_txt
    lines[0] = lines[0].ljust(80)
    return "\n".join(lines)


def write_record(record, target, fmt="genbank"):
    """Write a record as genbank, fasta, etc. via Biopython, with fixes.

    ``record`` may be a single record or a list/tuple of records. A deep
    copy is written, so the caller's records are not modified. For the
    genbank format, record names are truncated to 20 characters. If
    ``target`` has an ``open`` method, it is opened in "w" mode first.
    """
    record = deepcopy(record)
    if fmt == "genbank":
        if isinstance(record, (list, tuple)):
            for r in record:
                r.name = r.name[:20]
        else:
            record.name = record.name[:20]
    if hasattr(target, "open"):
        target = target.open("w")
    SeqIO.write(record, target, fmt)


def autoname_genbank_file(record):
    """Return a ``.gb`` file name built from the record id (dots -> "_")."""
    return record.id.replace(".", "_") + ".gb"


def string_to_records(string):
    """Convert a string of a fasta, genbank... into records.

    Can also be used to detect a format.

    Returns a ``(records, fmt)`` tuple where ``fmt`` is "ATGC" (the string
    contains only A/T/G/C), "fasta", "genbank" or "snapgene".

    Raises ValueError if no parser yields a record.
    """
    if re.fullmatch("[ATGC]+", string):
        return [SeqRecord(Seq(string))], "ATGC"

    for fmt in ("fasta", "genbank"):
        if fmt == "genbank":
            string = fix_ice_genbank(string)
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Best effort: a parse failure just means "not this format".
            continue
        if len(records) > 0:
            return (records, fmt)

    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
        # Return a (records, fmt) tuple like every other branch: callers
        # unpack two values (the original returned a bare list here).
        return [record], "snapgene"
    except Exception:
        pass
    raise ValueError("Invalid sequence format")


def file_to_filelike_object(file_, type="byte"):
    """Decode the base64 ``content`` of ``file_`` into a file-like object.

    The payload is whatever follows ``"base64,"`` in ``file_.content``.
    Returns a BytesIO when ``type`` is "byte", otherwise a StringIO of the
    payload decoded as UTF-8 (the original passed bytes to StringIO, which
    raises TypeError).
    """
    content = b64decode(file_.content.split("base64,")[1])
    if type == "byte":
        return BytesIO(content)
    return StringIO(content.decode("utf-8"))


def spreadsheet_file_to_dataframe(filedict, header="infer"):
    """Read an uploaded spreadsheet into a dataframe.

    Names ending with ".csv" are read with ``pandas.read_csv``, anything
    else with ``pandas.read_excel``.
    """
    filelike = file_to_filelike_object(filedict)
    if filedict.name.endswith(".csv"):
        return pandas.read_csv(filelike, header=header)
    else:
        return pandas.read_excel(filelike, header=header)


def _records_from_binary_content(content):
    """Parse raw bytes as SnapGene, then as a crazydoc document.

    Returns ``(records, fmt)`` with ``fmt`` "snapgene" or "doc"; raises
    ValueError when both parsers fail. A fresh stream is built for each
    parser so that a failed attempt cannot leave the stream position moved.
    """
    try:
        record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
        return [record], "snapgene"
    except Exception:
        pass
    try:
        parser = crazydoc.CrazydocParser(["highlight_color", "bold", "underline"])
        return parser.parse_doc_file(BytesIO(content)), "doc"
    except Exception as err:
        raise ValueError("Content is neither SnapGene nor a parsable document") from err


def records_from_zip_file(zip_file, use_file_names_as_ids=False):
    """Return the records found in the sequence files of an uploaded zip.

    Only files with extension gb, gbk, fa or dna are read. Records with a
    missing/placeholder id are named after their file (plus a 4-digit index
    when the file holds several records). Each record gets ``file_name`` and
    ``zip_file_name`` attributes. With ``use_file_names_as_ids``, a file's
    single record gets the file's base name as id.

    Raises ValueError if a sequence file cannot be parsed.
    """
    zip_name = zip_file.name
    zip_file = flametree.file_tree(file_to_filelike_object(zip_file))
    placeholder_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        ext = f._extension.lower()
        if ext not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, fmt = string_to_records(f.read())
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            # Not a text format: fall back to the binary parsers.
            try:
                new_records, fmt = _records_from_binary_content(f.read("rb"))
            except Exception as err:
                raise ValueError("Format not recognized for file " + f._path) from err

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in placeholder_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
            record.zip_file_name = zip_name
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.file_name)
                record.id = os.path.splitext(basename)[0]
        records += new_records
    return records


def records_from_data_file(data_file):
    """Parse one uploaded file into records.

    Tries, in order: text formats (see ``string_to_records``), SnapGene,
    crazydoc documents, then a header-less two-column spreadsheet whose
    rows are unpacked as (name, sequence).

    Returns ``(records, fmt)``; raises ValueError if every attempt fails.
    """
    content = b64decode(data_file.content.split("base64,")[1])
    try:
        records, fmt = string_to_records(content.decode("utf-8"))
    except Exception:
        try:
            records, fmt = _records_from_binary_content(content)
        except Exception:
            try:
                df = spreadsheet_file_to_dataframe(data_file, header=None)
                records = [
                    sequence_to_biopython_record(sequence=seq, id=name, name=name)
                    for name, seq in df.values
                ]
                fmt = "spreadsheet"
            except Exception as err:
                raise ValueError(
                    "Format not recognized for file " + data_file.name
                ) from err
    if not isinstance(records, list):
        records = [records]
    return records, fmt


def record_to_formated_string(record, fmt="genbank", remove_descr=False):
    """Return the record(s) written in format ``fmt``, as UTF-8 bytes.

    With ``remove_descr``, descriptions are blanked on a copy of the
    record(s) before writing.
    """
    if remove_descr:
        record = deepcopy(record)
        if isinstance(record, (list, tuple)):
            for r in record:
                r.description = ""
        else:
            record.description = ""
    fileobject = StringIO()
    write_record(record, fileobject, fmt)
    return fileobject.getvalue().encode("utf-8")


def records_from_data_files(data_files, use_file_names_as_ids=False):
    """Return all records found in a list of uploaded files.

    Files whose name ends with "zip" are delegated to
    ``records_from_zip_file``. For other files, each record gets
    ``circular``/``linear`` flags (circular unless the file says otherwise)
    and a ``file_name`` attribute, and placeholder ids/names are replaced
    by a name derived from the file name.
    """
    # Sorry for this part, it took a lot of "whatever works".
    # Keep your part names under 20 characters and without dots, and
    # everything will be good.
    unknown_ids = [
        "None",
        "",
        "<unknown id>",
        ".",
        "EXPORTED",
        "<unknown name>",
        "Exported",
    ]
    records = []
    for file_ in data_files:
        circular = ("circular" not in file_) or file_.circular
        if file_.name.lower().endswith("zip"):
            records += records_from_zip_file(
                file_, use_file_names_as_ids=use_file_names_as_ids
            )
            continue
        recs, fmt = records_from_data_file(file_)
        single_record = len(recs) == 1
        # File name with the last dot-separated piece dropped (the remaining
        # pieces are joined without dots, as in the original code).
        name_no_extension = "".join(file_.name.split(".")[:-1])
        for i, record in enumerate(recs):
            record.circular = circular
            record.linear = not circular
            name = name_no_extension + ("" if single_record else ("%04d" % i))
            name = name.replace(" ", "_")
            if str(record.id).strip() in unknown_ids:
                record.id = name
            if str(record.name).strip() in unknown_ids:
                record.name = name
            record.file_name = name_no_extension
            if use_file_names_as_ids and single_record:
                # Nothing in this file sets ``source_file``; fall back to the
                # ``file_name`` set just above instead of raising
                # AttributeError.
                source = getattr(record, "source_file", record.file_name)
                basename = os.path.basename(source)
                record.id = os.path.splitext(basename)[0]
        records += recs
    return records


def data_to_html_data(data, datatype, filename=None):
    """Return ``data`` (bytes) as a base64 data-URI string.

    Data types: zip, genbank, fasta, pdf, xlsx. Any other ``datatype`` is
    used verbatim as the MIME type. When ``filename`` is given, a
    ``headers=filename%3D<filename>;`` segment is inserted before the data.
    """
    datatype = {
        "zip": "application/zip",
        "genbank": "application/genbank",
        "fasta": "application/fasta",
        "pdf": "application/pdf",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    }.get(datatype, datatype)
    datatype = "data:%s;" % datatype
    data64 = "base64,%s" % b64encode(data).decode("utf-8")
    headers = ""
    if filename is not None:
        headers += "headers=filename%3D" + filename + ";"
    return datatype + headers + data64


def zip_data_to_html_data(data):
    """Return zip ``data`` (bytes) as a base64 data-URI string."""
    return data_to_html_data(data, "application/zip")


LADDERS = {"100_to_4k": bw.ladders.LADDER_100_to_4k}


def matplotlib_figure_to_svg_base64_data(fig, **kwargs):
    """Return the figure as a ``data:image/svg+xml;base64,XXX`` string.

    XXX is the base64-encoded SVG of the figure, with the first four lines
    of the SVG output dropped and all remaining newlines removed. Extra
    keyword arguments are passed to ``fig.savefig``.
    """
    output = BytesIO()
    fig.savefig(output, format="svg", **kwargs)
    svg_txt = output.getvalue().decode("utf-8")
    svg_txt = "".join(svg_txt.split("\n")[4:])
    content = b64encode(svg_txt.encode("utf-8"))
    return (b"data:image/svg+xml;base64," + content).decode("utf-8")


def matplotlib_figure_to_bitmap_base64_data(fig, fmt="png", **kwargs):
    """Return the figure as a ``data:image/<fmt>;base64,XXX`` string.

    XXX is the base64-encoded bitmap of the figure in format ``fmt``. Extra
    keyword arguments are passed to ``fig.savefig``.
    """
    output = BytesIO()
    fig.savefig(output, format=fmt, **kwargs)
    content = b64encode(output.getvalue())
    result = b"data:image/%s;base64,%s" % (fmt.encode("utf-8"), content)
    return result.decode("utf-8")


def figures_to_pdf_report_data(figures, filename="report.pdf"):
    """Bundle figures into a one-figure-per-page PDF.

    Returns a dict with keys "data" (base64 data URI of the PDF), "name"
    (``filename``) and "mimetype".
    """
    pdf_io = BytesIO()
    with PdfPages(pdf_io) as pdf:
        for fig in figures:
            pdf.savefig(fig, bbox_inches="tight")
    return {
        "data": (
            "data:application/pdf;base64,"
            + b64encode(pdf_io.getvalue()).decode("utf-8")
        ),
        "name": filename,
        "mimetype": "application/pdf",
    }


def csv_to_list(csv_string, sep=","):
    """Return all non-blank, stripped elements of a multi-line CSV string."""
    return [
        element.strip()
        for line in csv_string.split("\n")
        for element in line.split(sep)
        if len(element.strip())
    ]


def set_record_topology(record, topology):
    """Set the Biopython record's topology, possibly passing if already set.

    This actually sets the ``record.annotations['topology']``. The
    ``topology`` parameter can be "circular", "linear",
    "default_to_circular" (will default to circular if
    ``annotations['topology']`` is not already set) or "default_to_linear".

    Raises ValueError for any other value.
    """
    valid_topologies = [
        "circular",
        "linear",
        "default_to_circular",
        "default_to_linear",
    ]
    if topology not in valid_topologies:
        raise ValueError(
            "topology (%s) should be one of %s."
            % (topology, ", ".join(valid_topologies))
        )
    annotations = record.annotations
    default_prefix = "default_to_"
    if topology.startswith(default_prefix):
        if "topology" not in annotations:
            annotations["topology"] = topology[len(default_prefix) :]
    else:
        annotations["topology"] = topology


##############################################################################


def _read_assembly_rows(picklist_path):
    """Return the assembly plan rows (lists of cells) from a CSV/Excel file.

    Paths ending with ".csv" are read as UTF-8 text (a leading BOM is
    dropped) and split on commas; blank lines are skipped. Anything else is
    read with ``pandas.read_excel``.
    """
    if picklist_path.endswith(".csv"):
        # The original called .read() on the path string itself, which
        # raises AttributeError; open the file instead.
        with open(picklist_path, encoding="utf-8-sig") as handle:
            return [line.split(",") for line in handle.read().splitlines() if line]
    # NOTE(review): read_excel uses the first sheet row as header, so that row
    # never reaches the header filter in main() -- confirm this is intended.
    # Plain lists make the positional row[0] / row[1:] accesses unambiguous.
    return pandas.read_excel(picklist_path).values.tolist()


def _part_amounts(quantity_unit, part_quantity, total_volume):
    """Return ``(part_mol, part_g)`` in mol / g for the requested quantity.

    Exactly one of the two is not None. ``total_volume`` is in µL, so
    nM x µL gives fmol.
    """
    if quantity_unit == "fmol":
        return part_quantity * 1e-15, None
    if quantity_unit == "nM":
        return part_quantity * total_volume * 1e-15, None
    if quantity_unit == "ng":
        return None, part_quantity * 1e-9
    raise ValueError("Unsupported quantity unit: %r" % (quantity_unit,))


def main():
    """Generate the assembly picklist zip from command-line arguments.

    Reads the assembly plan, the part records and the source plate, builds
    the picklist, and writes a zip with the final mixplate map (PDF and
    spreadsheet), the assembly reports and the dispenser picklist. Progress
    and a final ``success: True/False`` line are printed to stdout; the
    script exits with status 1 when parts are missing.
    """
    parser = argparse.ArgumentParser(description="Generate picklist for DNA assembly.")
    parser.add_argument("--parts_files", help="Directory with parts data or file with part sizes")
    parser.add_argument("--picklist", type=str, help="Path to the assembly plan CSV or Excel file")
    parser.add_argument("--source_plate", help="Source plate file (CSV or Excel)")
    parser.add_argument("--backbone_name", required=False, help="Name of the backbone")
    parser.add_argument("--result_zip", help="Name of the output zip file")
    parser.add_argument("--part_backbone_ratio", type=parse_optional_float, required=False, help="Part to backbone molar ratio")
    parser.add_argument("--quantity_unit", choices=["fmol", "nM", "ng"], help="Quantity unit")
    parser.add_argument("--part_quantity", type=float, help="Quantity of each part")
    parser.add_argument("--buffer_volume", type=float, help="Buffer volume in µL")
    parser.add_argument("--total_volume", type=float, help="Total reaction volume in µL")
    parser.add_argument("--dispenser", choices=["labcyte_echo", "tecan_evo"], help="Dispenser machine")
    args = parser.parse_args()

    # These arguments are dereferenced unconditionally below; report a clear
    # usage error instead of failing later with AttributeError/NameError.
    needed = (
        "parts_files",
        "picklist",
        "source_plate",
        "result_zip",
        "quantity_unit",
        "part_quantity",
        "buffer_volume",
        "total_volume",
    )
    missing = ["--" + name for name in needed if getattr(args, name) is None]
    if missing:
        parser.error("missing required argument(s): " + ", ".join(missing))

    # Parameters:
    picklist = args.picklist  # assembly plan
    # directory or can be a csv/Excel with part sizes
    if isinstance(args.parts_files, str):
        args.parts_files = args.parts_files.split(",")
    parts_dir = fix_and_rename_paths(args.parts_files)
    source_plate_path = args.source_plate
    backbone_name = args.backbone_name
    part_backbone_ratio = args.part_backbone_ratio
    result_zip_file = args.result_zip  # output file name "picklist.zip"

    ##############################################################################
    # Defaults:
    # Form parameters of the original web app that this script does not use:
    # destination type ("new"), destination size (96), fill direction,
    # dispenser min/max volume, resolution and dead volume. The destination is
    # always a new Plate4ti0960 filled column by column, and the rounding and
    # minimal dispense volumes are hard-coded in the generator call below.
    quantity_unit = args.quantity_unit
    part_quantity = args.part_quantity  # 1.3
    buffer_volume = args.buffer_volume  # 0.3  # (µL)
    total_volume = args.total_volume  # 1  # (µL)
    dispenser_machine = args.dispenser
    use_file_names_as_ids = True

    # CODE
    rows = _read_assembly_rows(picklist)
    # Header / empty first cells are compared as stripped text so that a
    # float NaN coming from Excel is filtered too.
    non_construct_cells = ["nan", "Construct name", "constructs", "construct"]
    assembly_plan = AssemblyPlan(
        OrderedDict(
            [
                (
                    row[0],
                    [
                        str(e).strip()
                        for e in row[1:]
                        if str(e).strip() not in ["-", "nan", ""]
                    ],
                )
                for row in rows
                if str(row[0]).strip() not in non_construct_cells
            ]
        )
    )
    for assembly, parts in assembly_plan.assemblies.items():
        assembly_plan.assemblies[assembly] = [part.replace(" ", "_") for part in parts]

    # Reading part infos
    if isinstance(parts_dir, list):  # input records
        records = dnacauldron.biotools.load_records_from_files(
            files=parts_dir, use_file_names_as_ids=use_file_names_as_ids
        )
        parts_data = {rec.id.replace(" ", "_").lower(): {"record": rec} for rec in records}
    elif parts_dir.endswith((".csv", ".xls", ".xlsx")):
        # Part sizes specified in a table. fix_and_rename_paths always returns
        # a list, so this branch is only reachable if that ever changes.
        if parts_dir.endswith(".csv"):
            dataframe = pandas.read_csv(parts_dir)
        else:
            dataframe = pandas.read_excel(parts_dir)
        parts_data = {row.part: {"size": row["size"]} for i, row in dataframe.iterrows()}
    else:
        raise ValueError("Unsupported parts input: %r" % (parts_dir,))
    assembly_plan.parts_data = parts_data
    parts_without_data = assembly_plan.parts_without_data()
    if len(parts_without_data):
        print("success: False")
        print("message: Some parts have no provided record or data.")
        print("missing_parts: ", parts_without_data)
        sys.exit(1)  # non-zero status so the caller can detect the failure

    # Reading protocol
    part_mol, part_g = _part_amounts(quantity_unit, part_quantity, total_volume)
    if quantity_unit == "ng" or part_backbone_ratio is None:
        # Backbone:part molar ratio calculation is not performed for "ng".
        # This ensures no change regardless of form input. An empty ratio
        # likewise means "leave the backbone quantity unchanged".
        part_backbone_ratio = 1

    print("Generating picklist")
    picklist_generator = AssemblyPicklistGenerator(
        part_mol=part_mol,
        part_g=part_g,
        complement_to=total_volume * 1e-6,  # convert uL to L
        buffer_volume=buffer_volume * 1e-6,
        volume_rounding=2.5e-9,  # not using parameter from form
        minimal_dispense_volume=5e-9,  # Echo machine's minimum dispense
    )

    # An omitted argument (None), an empty string and 'Non' all mean that no
    # backbone was specified.
    if backbone_name in (None, "", "Non"):
        backbone_name_list = []
    else:
        backbone_name_list = [name.strip() for name in backbone_name.split(",")]

    source_plate = plate_from_content_spreadsheet(source_plate_path)
    for well in source_plate.iter_wells():
        if well.is_empty:
            continue
        quantities = well.content.quantities
        part, quantity = list(quantities.items())[0]
        quantities.pop(part)
        if part in backbone_name_list:
            # This section multiplies the backbone concentration with the
            # part:backbone molar ratio. This tricks the calculator into
            # making a picklist with the desired ratio.
            # For example, a part:backbone = 2:1 will multiply the backbone
            # concentration by 2, therefore half as much of it will be added
            # to the well.
            quantity = quantity * part_backbone_ratio
        quantities[part.replace(" ", "_")] = quantity
    source_plate.name = "Source"

    destination_plate = Plate4ti0960("Mixplate")
    destination_wells = (
        well
        for well in destination_plate.iter_wells(direction="column")
        if well.is_empty
    )
    picklist, picklist_data = picklist_generator.make_picklist(
        assembly_plan,
        source_wells=source_plate.iter_wells(),
        destination_wells=destination_wells,
    )
    if picklist is None:
        print("success: False")
        print("message: Some parts in the assembly plan have no corresponding well.")
        print("picklist_data: ", picklist_data)
        print("missing_parts:", picklist_data.get("missing_parts", None))
        sys.exit(1)  # non-zero status so the caller can detect the failure

    future_plates = picklist.simulate(inplace=False)

    def text(w):
        """Well label: construct name (when known) above the well volume."""
        txt = human_volume(w.content.volume)
        if "construct" in w.data:
            txt = "\n".join([w.data["construct"], txt])
        return txt

    plotter = PlateTextPlotter(text)
    ax, _ = plotter.plot_plate(future_plates[destination_plate], figsize=(20, 8))

    ziproot = flametree.file_tree(result_zip_file, replace=True)

    # MIXPLATE MAP PLOT
    ax.figure.savefig(
        ziproot._file("final_mixplate.pdf").open("wb"),
        format="pdf",
        bbox_inches="tight",
    )
    plt.close(ax.figure)
    plate_to_platemap_spreadsheet(
        future_plates[destination_plate],
        lambda w: w.data.get("construct", ""),
        filepath=ziproot._file("final_mixplate.xls").open("wb"),
    )

    # ASSEMBLY REPORT
    print("Writing report...")
    picklist_to_assembly_mix_report(
        picklist,
        ziproot._file("assembly_mix_picklist_report.pdf").open("wb"),
        data=picklist_data,
    )
    assembly_plan.write_report(ziproot._file("assembly_plan_summary.pdf").open("wb"))

    # MACHINE PICKLIST
    if dispenser_machine == "labcyte_echo":
        picklist_to_labcyte_echo_picklist_file(
            picklist, ziproot._file("ECHO_picklist.csv").open("w")
        )
    else:
        picklist_to_tecan_evo_picklist_file(
            picklist, ziproot._file("EVO_picklist.gwl").open("w")
        )

    # We'll not write the input source plate.
    ziproot._close()
    print("success: True")


if __name__ == "__main__":
    main()