diff cloning_simulation.py @ 16:fbb241adf6c2 draft default tip
planemo upload for repository https://github.com/Edinburgh-Genome-Foundry/DnaCauldron/tree/master commit af45e5e0e81535ab0423b0bcff8b5b220bb9b4d0-dirty
| author   | tduigou                         |
|----------|---------------------------------|
| date     | Thu, 17 Jul 2025 10:17:24 +0000 |
| parents  | 16ccb36aa8e3                    |
| children |                                 |
--- a/cloning_simulation.py	Mon May 26 13:39:57 2025 +0000
+++ b/cloning_simulation.py	Thu Jul 17 10:17:24 2025 +0000
@@ -1,29 +1,30 @@
+import argparse
 import os
+import json
+import zipfile
+import pandas
 import dnacauldron
-from Bio import SeqIO
-import pandas
-import argparse
-import zipfile
+
 
 def cloning_simulation(files_to_assembly, domesticated_list, csv_file,
                        assembly_type, topology, file_name_mapping,
                        file_name_mapping_dom, use_file_names_as_id,
-                       outdir_simulation, output_simulation,enzyme,outdir_gb):
+                       outdir_simulation, output_simulation, enzyme, outdir_gb):
 
     files_to_assembly = files_to_assembly.split(',')
     repository = dnacauldron.SequenceRepository()
-    repository.import_records(files=files_to_assembly,
-                              use_file_names_as_ids=use_file_names_as_id,
+    repository.import_records(files=files_to_assembly,
+                              use_file_names_as_ids=use_file_names_as_id,
                               topology=topology)
 
     if domesticated_list:
         domesticated_files = domesticated_list.split(',')
-        repository.import_records(files=domesticated_files,
-                                  use_file_names_as_ids=use_file_names_as_id,
-                                  topology=topology)
+        repository.import_records(files=domesticated_files,
+                                  use_file_names_as_ids=use_file_names_as_id,
+                                  topology=topology)
 
-    #refine the real record name dict
+    # refine the real record name dict
     if isinstance(file_name_mapping, str):
         file_name_mapping = dict(
            item.split(":") for item in file_name_mapping.split(",")
@@ -31,11 +32,11 @@
     real_names = {
         os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "")
         for k, v in file_name_mapping.items()
-        }
+        }
 
-    #refine the real record name dict_dom
+    # refine the real record name dict_dom
     if file_name_mapping_dom == "":
-        file_name_mapping_dom={}
+        file_name_mapping_dom = {}
     else:
         if isinstance(file_name_mapping_dom, str):
             file_name_mapping_dom = dict(
@@ -44,10 +45,10 @@
     dom_real_names = {
         os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "")
         for k, v in file_name_mapping_dom.items()
-        }
+        }
     real_names.update(dom_real_names)
-
-    #update the records
+
+    # update the records
     for key, record in list(repository.collections["parts"].items()):
         current_id = record.id
@@ -58,9 +59,9 @@
         record.description = new_id
         repository.collections["parts"][new_id] = repository.collections["parts"].pop(key)
     ########################################################
-    #print (f"repo: {vars(repository)}")
-    #any(pandas.read_csv(csv_file, index_col=0, header=None).duplicated())
-    df=pandas.read_csv(csv_file, index_col=0, header=None)
+    # print (f"repo: {vars(repository)}")
+    # any(pandas.read_csv(csv_file, index_col=0, header=None).duplicated())
+    df = pandas.read_csv(csv_file, index_col=0, header=None)
     if df.duplicated().any():
         raise ValueError("Duplicate rows found in the data!")
 
@@ -78,7 +79,7 @@
         assembly_class = dnacauldron.LigaseCyclingReactionAssembly
     else:
         raise ValueError(f"Unsupported assembly type: {assembly_type}")
-
+
     new_csvname = "assambly.csv"
     os.rename(csv_file, new_csvname)
 
@@ -113,9 +114,9 @@
                 full_path = os.path.join(root, file)
                 arcname = os.path.relpath(full_path, outdir_simulation)
                 zipf.write(full_path, arcname)
-        #print("Files in the zip archive:")
-        #for info in zipf.infolist():
-            #print(info.filename)
+        # print("Files in the zip archive:")
+        # for info in zipf.infolist():
+            # print(info.filename)
         for member in zipf.namelist():
             # Only extract actual files inside 'all_construct_records/' (not subfolders)
             if member.startswith("assambly_simulation/all_construct_records/") and not member.endswith("/"):
@@ -137,40 +138,63 @@
 
 def parse_command_line_args():
     parser = argparse.ArgumentParser(description="Domestication")
-    parser.add_argument("--parts_files", required=True,
+    parser.add_argument("--parts_files", required=True,
                        help="List of GenBank files (Comma-separated)")
-    parser.add_argument("--domesticated_seq", required=True,
+    parser.add_argument("--domesticated_seq", required=True,
                        help="output of domestication (ganbank list)")
-    parser.add_argument("--assembly_csv", required=True,
+    parser.add_argument("--assembly_csv", required=True,
                        help="csv assembly")
-    parser.add_argument('--assembly_plan_name', type=str,
+    parser.add_argument('--assembly_plan_name', type=str,
                        required=False, help='type of assembly')
-    parser.add_argument('--topology', type=str,
+    parser.add_argument('--topology', type=str,
                        required=False, help='"circular" or "linear"')
-    parser.add_argument('--file_name_mapping', type=str,
+    parser.add_argument('--file_name_mapping', type=str,
                        help='Mapping of Galaxy filenames to original filenames')
-    parser.add_argument('--file_name_mapping_dom', type=str,
+    parser.add_argument('--file_name_mapping_dom', type=str,
                        help='Mapping of Galaxy filenames to original domestication filenames')
-    parser.add_argument("--use_file_names_as_id", type=lambda x: x.lower() == 'true', default=True,
+    parser.add_argument("--use_file_names_as_id", type=lambda x: x.lower() == 'true', default=True,
                        help="Use file names as IDs (True/False)")
-    parser.add_argument("--outdir_simulation", required=True,
+    parser.add_argument("--outdir_simulation", required=True,
                        help="dir output for cloning simulation results")
-    parser.add_argument("--output_simulation", required=True,
+    parser.add_argument("--output_simulation", required=True,
                        help="zip output for cloning simulation results")
-    parser.add_argument('--enzyme', type=str,
+    parser.add_argument('--enzyme', type=str,required=False,
                        help='enzyme to use')
-    parser.add_argument("--outdir_gb", required=True,
+    parser.add_argument("--outdir_gb", required=True,
                        help="dir output constructs gb files")
+    parser.add_argument("--use_json_paramers", required=True,
+                       help="Use parameters from JSON: true/false")
+    parser.add_argument("--json_conf", required=False,
+                       help="JSON config file with DB parameters")
     return parser.parse_args()
 
+
 if __name__ == "__main__":
     args = parse_command_line_args()
+    #json param checking
+    config_params = {}
+    use_json = args.use_json_paramers == 'true'
+    if use_json:
+        if not args.json_conf:
+            raise ValueError("You must provide --json_conf when --use_json_paramers is 'true'")
+        with open(args.json_conf, "r") as f:
+            config_params = json.load(f)
+    else:
+        config_params = {
+            "assembly_plan_name": args.assembly_plan_name,
+            "topology": args.topology,
+            "enzyme": args.enzyme
+        }
+    assembly_plan_name = config_params["assembly_plan_name"]
+    topology = config_params["topology"]
+    enzyme = config_params["enzyme"]
+
     cloning_simulation(
         args.parts_files, args.domesticated_seq,
-        args.assembly_csv, args.assembly_plan_name, args.topology,
-        args.file_name_mapping, args.file_name_mapping_dom,
+        args.assembly_csv, assembly_plan_name, topology,
+        args.file_name_mapping, args.file_name_mapping_dom,
         args.use_file_names_as_id, args.outdir_simulation,
-        args.output_simulation, args.enzyme, args.outdir_gb
-    )
\ No newline at end of file
+        args.output_simulation, enzyme, args.outdir_gb
+    )
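The new --use_json_paramers/--json_conf path loads a JSON file and reads the keys "assembly_plan_name", "topology" and "enzyme" from it. A minimal sketch of such a config file, generated from Python; the file name and the example values are illustrative only and are not defined by the tool itself:

    import json

    # Keys must match what the __main__ block reads from config_params.
    config = {
        "assembly_plan_name": "golden_gate",  # illustrative; must be a type handled in cloning_simulation()
        "topology": "circular",               # "circular" or "linear"
        "enzyme": "BsmBI",                    # restriction enzyme name (illustrative)
    }

    # Hypothetical file name; pass it to the script via --json_conf.
    with open("simulation_conf.json", "w") as f:
        json.dump(config, f, indent=2)

When --use_json_paramers is anything other than the string 'true', the script falls back to the individual --assembly_plan_name, --topology and --enzyme arguments instead.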
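The --file_name_mapping (and --file_name_mapping_dom) strings are parsed as comma-separated "galaxy_path:original_name" pairs and reduced to a dict mapping Galaxy dataset basenames to the original part names. A small standalone demo of that parsing, mirroring the code in the diff above; the dataset paths and part names are made up:

    import os

    # Hypothetical Galaxy dataset paths mapped to the user's original file names.
    file_name_mapping = (
        "/galaxy/files/dataset_41.dat:promoter_A.gb,"
        "/galaxy/files/dataset_42.dat:cds_B.gb"
    )

    # Same two steps as in cloning_simulation(): split into pairs, then strip
    # the path/extension from the keys and the ".gb" suffix from the values.
    mapping = dict(item.split(":") for item in file_name_mapping.split(","))
    real_names = {
        os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "")
        for k, v in mapping.items()
    }
    print(real_names)  # {'dataset_41': 'promoter_A', 'dataset_42': 'cds_B'}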