diff cloning_simulation.py @ 16:fbb241adf6c2 draft default tip

planemo upload for repository https://github.com/Edinburgh-Genome-Foundry/DnaCauldron/tree/master commit af45e5e0e81535ab0423b0bcff8b5b220bb9b4d0-dirty
author tduigou
date Thu, 17 Jul 2025 10:17:24 +0000
parents 16ccb36aa8e3
children
line wrap: on
line diff
--- a/cloning_simulation.py	Mon May 26 13:39:57 2025 +0000
+++ b/cloning_simulation.py	Thu Jul 17 10:17:24 2025 +0000
@@ -1,29 +1,30 @@
+import argparse
 import os
+import json
+import zipfile
+import pandas
 import dnacauldron
-from Bio import SeqIO
-import pandas
-import argparse
-import zipfile
+
 
 def cloning_simulation(files_to_assembly, domesticated_list,
                        csv_file, assembly_type, topology,
                        file_name_mapping, file_name_mapping_dom,
                        use_file_names_as_id,
-                       outdir_simulation, output_simulation,enzyme,outdir_gb):
+                       outdir_simulation, output_simulation, enzyme, outdir_gb):
 
     files_to_assembly = files_to_assembly.split(',')
 
     repository = dnacauldron.SequenceRepository()
-    repository.import_records(files=files_to_assembly, 
-                              use_file_names_as_ids=use_file_names_as_id, 
+    repository.import_records(files=files_to_assembly,
+                              use_file_names_as_ids=use_file_names_as_id,
                               topology=topology)
     if domesticated_list:
        domesticated_files = domesticated_list.split(',')
-       repository.import_records(files=domesticated_files, 
-                                    use_file_names_as_ids=use_file_names_as_id, 
-                                    topology=topology)
+       repository.import_records(files=domesticated_files,
+                                 use_file_names_as_ids=use_file_names_as_id,
+                                 topology=topology)
 
-    #refine the real record name dict
+    # Build real_names: Galaxy file stem -> original record name (".gb" removed)
     if isinstance(file_name_mapping, str):
         file_name_mapping = dict(
             item.split(":") for item in file_name_mapping.split(",")
@@ -31,11 +32,11 @@
     real_names = {
         os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "")
         for k, v in file_name_mapping.items()
-    } 
+    }
 
-    #refine the real record name dict_dom
+    # Build the equivalent stem -> name mapping for domesticated files and merge it into real_names
     if file_name_mapping_dom == "":
-        file_name_mapping_dom={}
+        file_name_mapping_dom = {}
     else:
         if isinstance(file_name_mapping_dom, str):
             file_name_mapping_dom = dict(
@@ -44,10 +45,10 @@
         dom_real_names = {
             os.path.splitext(os.path.basename(k))[0]: v.replace(".gb", "")
             for k, v in file_name_mapping_dom.items()
-        } 
+        }
         real_names.update(dom_real_names)
-        
-    #update the records 
+
+    # Rename the records: re-key each part in the repository under its new ID
 
     for key, record in list(repository.collections["parts"].items()):
         current_id = record.id
@@ -58,9 +59,9 @@
             record.description = new_id
             repository.collections["parts"][new_id] = repository.collections["parts"].pop(key)
     ########################################################
-    #print (f"repo: {vars(repository)}")
-    #any(pandas.read_csv(csv_file, index_col=0, header=None).duplicated())
-    df=pandas.read_csv(csv_file, index_col=0, header=None)
+    # print (f"repo: {vars(repository)}")
+    # any(pandas.read_csv(csv_file, index_col=0, header=None).duplicated())
+    df = pandas.read_csv(csv_file, index_col=0, header=None)
     if df.duplicated().any():
         raise ValueError("Duplicate rows found in the data!")
 
@@ -78,7 +79,7 @@
         assembly_class = dnacauldron.LigaseCyclingReactionAssembly
     else:
         raise ValueError(f"Unsupported assembly type: {assembly_type}")
-    
+
     new_csvname = "assambly.csv"
     os.rename(csv_file, new_csvname)
 
@@ -113,9 +114,9 @@
                 full_path = os.path.join(root, file)
                 arcname = os.path.relpath(full_path, outdir_simulation)
                 zipf.write(full_path, arcname)
-        #print("Files in the zip archive:")
-        #for info in zipf.infolist():
-            #print(info.filename)
+        # print("Files in the zip archive:")
+        # for info in zipf.infolist():
+        #     print(info.filename)
         for member in zipf.namelist():
             # Only extract actual files inside 'all_construct_records/' (not subfolders)
             if member.startswith("assambly_simulation/all_construct_records/") and not member.endswith("/"):
@@ -137,40 +138,63 @@
 def parse_command_line_args():
     parser = argparse.ArgumentParser(description="Domestication")
 
-    parser.add_argument("--parts_files", required=True, 
+    parser.add_argument("--parts_files", required=True,
                         help="List of GenBank files (Comma-separated)")
-    parser.add_argument("--domesticated_seq", required=True, 
+    parser.add_argument("--domesticated_seq", required=True,
                         help="output of domestication (ganbank list)")
-    parser.add_argument("--assembly_csv", required=True, 
+    parser.add_argument("--assembly_csv", required=True,
                         help="csv assembly")
-    parser.add_argument('--assembly_plan_name', type=str, 
+    parser.add_argument('--assembly_plan_name', type=str, required=False,
                         help='type of assembly')
-    parser.add_argument('--topology', type=str, 
+    parser.add_argument('--topology', type=str, required=False,
                         help='"circular" or "linear"')
-    parser.add_argument('--file_name_mapping', type=str, 
+    parser.add_argument('--file_name_mapping', type=str,
                         help='Mapping of Galaxy filenames to original filenames')
-    parser.add_argument('--file_name_mapping_dom', type=str, 
+    parser.add_argument('--file_name_mapping_dom', type=str,
                         help='Mapping of Galaxy filenames to original domestication filenames')
-    parser.add_argument("--use_file_names_as_id", type=lambda x: x.lower() == 'true', default=True, 
+    parser.add_argument("--use_file_names_as_id", type=lambda x: x.lower() == 'true', default=True,
                         help="Use file names as IDs (True/False)")
-    parser.add_argument("--outdir_simulation", required=True, 
+    parser.add_argument("--outdir_simulation", required=True,
                         help="dir output for cloning simulation results")
-    parser.add_argument("--output_simulation", required=True, 
+    parser.add_argument("--output_simulation", required=True,
                         help="zip output for cloning simulation results")
-    parser.add_argument('--enzyme', type=str, 
+    parser.add_argument('--enzyme', type=str,required=False,
                         help='enzyme to use')
-    parser.add_argument("--outdir_gb", required=True, 
+    parser.add_argument("--outdir_gb", required=True,
                         help="dir output constructs gb files")
+    parser.add_argument("--use_json_paramers", required=True,
+                         help="Use parameters from JSON: true/false")
+    parser.add_argument("--json_conf", required=False,
+                         help="JSON config file with DB parameters")
  
     return parser.parse_args()
 
+
 if __name__ == "__main__":
     args = parse_command_line_args()
 
+    # Resolve assembly_plan_name, topology and enzyme from the JSON config or from the CLI flags
+    config_params = {}
+    use_json = args.use_json_paramers == 'true'
+    if use_json:
+        if not args.json_conf:
+            raise ValueError("You must provide --json_conf when --use_json_paramers is 'true'")
+        with open(args.json_conf, "r") as f:
+            config_params = json.load(f)
+    else:
+        config_params = {
+            "assembly_plan_name": args.assembly_plan_name,
+            "topology": args.topology,
+            "enzyme": args.enzyme
+        }
+    assembly_plan_name = config_params["assembly_plan_name"]
+    topology = config_params["topology"]
+    enzyme = config_params["enzyme"]
+
     cloning_simulation(
         args.parts_files, args.domesticated_seq,
-        args.assembly_csv, args.assembly_plan_name, args.topology,
-        args.file_name_mapping, args.file_name_mapping_dom, 
+        args.assembly_csv, assembly_plan_name, topology,
+        args.file_name_mapping, args.file_name_mapping_dom,
         args.use_file_names_as_id, args.outdir_simulation,
-        args.output_simulation, args.enzyme, args.outdir_gb
-    )
\ No newline at end of file
+        args.output_simulation, enzyme, args.outdir_gb
+    )