diff cut_trajectory.py @ 5:e504457035e5 draft

"planemo upload for repository https://github.com/mesocentre-clermont-auvergne/aubi_piaf commit b6488400d4478d46697019485e912c38ea2202a5-dirty"
author agpetit
date Mon, 30 May 2022 14:21:51 +0000
parents
children f1dd5d99ea2d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cut_trajectory.py	Mon May 30 14:21:51 2022 +0000
@@ -0,0 +1,364 @@
+#!/usr/bin/env python3
+
+# coding: utf-8
+"""
+The script allows to estimate the number of sub-trajectories to obtain.
+It also allows to split the trajectory into a number of sub-trajectories.
+# USAGE to estimate number of sub-trajectories :
+# USAGE : cut_trajectory.py
+  -s : number of desired sub-trajectories (number or file)
+  -gro : .gro file -xtc : .xtc file -log : name of log file (optional)
+  -d : output directory (optional) -o : name of output file (optional)
+  -g : group for output (optional)
+  -start : start time of the trajectory (optional)
+  -end : end time of the trajectory (optional)
+"""
+
+__all__ = []
+__author__ = "Agnès-Elisabeth Petit"
+__date__ = "29/04/2022"
+__version__ = "0.8"
+__copyright__ = "(c) 2022 CC-BY-NC-SA"
+
+# Library import
+import argparse
+import logging
+import os
+import subprocess
+import sys
+
+from joblib import Parallel, delayed
+
+
+def test_setup():
+    global args
+    args = parse_arguments()
+    args.verbose = True
+
+
+def parse_arguments():
+    list_choices = []
+    [list_choices.append(str(x)) for x in range(17)]
+    parser = argparse.ArgumentParser(
+        description="The script allows to estimate the number of "
+                    "sub-trajectories to obtain. It also allows to split"
+                    " the trajectory into a number of sub-trajectories.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        prefix_chars="-",
+        add_help=True,
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="""Information messages to stderr""",
+    )
+    parser.add_argument(
+        "-c",
+        "--input_check",
+        type=str,
+        help=""".txt file obtained with gmx check -f.
+        It contains information about the trajectory""",
+    )
+    parser.add_argument(
+        "-s",
+        "--nbr_sub_traj",
+        type=str,
+        help="""numbers of sub_trajectories""",
+    )
+    parser.add_argument(
+        "-gro", "--gro_file", type=str, help="""My input .gro file"""
+    )
+    parser.add_argument(
+        "-xtc", "--xtc_file", type=str, help="""My input .xtc file"""
+    )
+    parser.add_argument(
+        "-log", "--log_output", type=str,
+        default="log/cut_trajectory.log",
+        help="""Output for log file. Default :
+        log/cut_trajectory.log"""
+    )
+    parser.add_argument(
+        "-d", "--output_directory", type=str,
+        default="out/",
+        help="""It's output Directory. Default : out/"""
+    )
+    parser.add_argument(
+        "-g",
+        "--group_output",
+        type=str,
+        default="0",
+        choices=list_choices,
+        help="""Select group for output. 0 : system, 1: protein, 2: protein-H,
+         3: C-alpha, 4: Backbone, 5: MainChain, 6: MainChain+Cb,
+         7: MainChain+H, 8: SideChain, 9: SideChain-H, 10: Prot-Masses,
+         11: non-Protein, 12: Other, 13: POPC, 14: POT, 15: CLA,
+         16: TIP3""",
+    )
+    parser.add_argument(
+        "-start", "--start_traj", type=str,
+        help="""Start of the trajectory to be cut"""
+    )
+    parser.add_argument(
+        "-end", "--end_traj", type=str,
+        help="""End of the trajectory to be cut"""
+    )
+    parser.add_argument(
+        "-cpus", "--number_cpus", type=int,
+        default=1,
+        help="""Number of cpus. Default : 1"""
+    )
+    return parser.parse_args()
+
+
+def search_nbr_steps_time_step(txt_file):
+    """
+    Description : Keeping the number of frames of the complete trajectory and
+     the time between each frame.
+    param txt_file: file obtained with gmx check.
+    return: list that contains the number of frames of the complete trajectory
+     and time between each frame.
+    """
+    if args.verbose:
+        logging.info("\nFunction search_nbr_steps_time_step")
+        logging.info("The input file is " + txt_file)
+    with open(txt_file, "r") as f:
+        len_traj = ""
+        time_step = ""
+        for li in f:
+            li = li.rstrip()
+            if li.startswith("Step"):
+                li2 = li.split()
+                len_traj = int(li2[1])
+                time_step = int(li2[2])
+    if args.verbose:
+        logging.info("The length of the trajectory is " + str(len_traj))
+        logging.info(
+            "The elapsed time between two steps is : " + str(time_step) + " ps"
+        )
+        logging.info("End search_nbr_steps_time_step functions")
+    return len_traj, time_step
+
+
+def search_nbr_sub_traj(tsv_nb_sub_traj_file):
+    """
+    Description: Obtaining the number of frames of the complete trajectory,
+      the number of sub-trajectories to create, the time between each
+      frame and the number of frames per sub-trajectory.
+    param tsv_nb_sub_traj_file: tsv file obtained with the estimation of
+      the number of sub-trajectories
+    return: list that contains the number of images of the complete trajectory,
+      the number of sub-trajectories to create, the time between each image and
+      the number of images per sub-trajectory.
+    """
+    if args.verbose:
+        logging.info("\nFunction search_nbr_sub_traj")
+        logging.info("The input file is " + tsv_nb_sub_traj_file)
+    list_number_sub_traj = []
+    with open(tsv_nb_sub_traj_file, "r") as f:
+        for li in f:
+            li = li.rstrip()
+            if not li.startswith("Length"):
+                list_number_sub_traj = li.split()
+    if args.verbose:
+        logging.info("The length of complete trajectory is "
+                     + list_number_sub_traj[0])
+        logging.info(
+            "The number of sub-trajectories required is  "
+            + list_number_sub_traj[1]
+        )
+        logging.info(
+            "The elapsed time between two steps is : "
+            + list_number_sub_traj[2] + "ps"
+        )
+        logging.info(
+            "The number of frames per sub-trajectory is "
+            + list_number_sub_traj[3]
+        )
+        logging.info("End search_nbr_sub_traj function")
+    list_number_sub_traj = [int(v) for v in list_number_sub_traj]
+    return list_number_sub_traj
+
+
+def launch_cut_traj(list_file, gro_file, xtc_file, out_dir,
+                    logging_file, n_group, n_cpus):
+    """
+    Description: function that allows to split the trajectory
+      into a number of sub-trajectories
+    param list_file: list that contains the number of images of the complete
+      trajectory, the number of sub-trajectories to create, the time between
+      each image and the number of images per sub-trajectory.
+    param gro_file: .gro file
+    param xtc_file: .xtc file
+    param out_dir: output directory
+    param logging_file: name of log file
+    return: None
+    output: create a number of sub-trajectories
+    """
+    if args.verbose:
+        logging.info("\nFunction cut_traj")
+        logging.info("The length of the trajectory is " + str(list_file[0])
+                     + " frames")
+        logging.info(
+            "The elapsed time between two steps is : " + str(list_file[2])
+            + " ps"
+        )
+        logging.info("The number of sub-trajectories required is  "
+                     + str(list_file[1]))
+        logging.info(".gro file is " + gro_file)
+        logging.info(".xtc file is " + xtc_file)
+        logging.info("Output directory is " + str(out_dir))
+        logging.info("The name of .log file is " + logging_file)
+        logging.info("The number of cpus used is " + str(n_cpus))
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+    n_sub_traj = list_file[1]
+    times_step = list_file[2]
+    nb_frame_sub_traj = list_file[3]
+    prefix_name_file = os.path.basename(gro_file).rsplit(".", 1)[0]
+    start = list_file[4]
+    end = list_file[4] - 1
+    dict_sub_traj = {}
+    for nb_traj in range(n_sub_traj):
+        end += nb_frame_sub_traj
+        value = str(start * times_step) + "," + str(end * times_step)
+        dict_sub_traj[nb_traj + 1] = value
+        start += nb_frame_sub_traj
+    if args.verbose:
+        for k, v in dict_sub_traj.items():
+            logging.info("Sub_trajectory " + str(k) + " starts at "
+                         + v.split(",")[0] + " ps and ends at "
+                         + v.split(",")[1] + " ps")
+        logging.info("Launch gmx trjconv\n")
+    if args.verbose:
+        list_log_files = Parallel(n_jobs=n_cpus)(
+            delayed(cut_traj)(out_dir, prefix_name_file, k, v, n_group,
+                              xtc_file, gro_file, logging_file, args)
+            for k, v in dict_sub_traj.items())
+        log_file_complete = open(logging_file, "a")
+        for f in list_log_files:
+            open_f = open(f, "r")
+            [log_file_complete.write(li) for li in open_f.readlines()]
+            open_f.close()
+            os.remove(f)
+        log_file_complete.close()
+    else:
+        logging_file = ""
+        Parallel(n_jobs=n_cpus)(delayed(cut_traj)(out_dir, prefix_name_file, k,
+                                                  v, n_group, xtc_file,
+                                                  gro_file, logging_file, args)
+                                for k, v in dict_sub_traj.items())
+    if args.verbose:
+        logging.info("End cut_traj function")
+
+
+def cut_traj(out_dir, prefix_name_file, k, v, n_group, xtc_file, gro_file,
+             logging_file, arguments):
+    out_traj = (str(out_dir) + prefix_name_file + "_traj_" + str(k) + ".xtc")
+    bash_command = ("echo " + n_group + " | gmx trjconv -f " + xtc_file
+                    + " -s " + gro_file + " -b " + str(v.split(",")[0])
+                    + " -e " + str(v.split(",")[1]) + " -o " + out_traj)
+    if arguments.verbose:
+        log_directory = "log/tmp/"
+        logging_file = (log_directory
+                        + logging_file.rsplit("/", 1)[1].split(".")[0]
+                        + "_" + str(k) + ".log")
+        if not os.path.exists(log_directory):
+            os.makedirs(log_directory)
+        f_log = open(logging_file, "w")
+        subprocess.run(bash_command, shell=True, stdout=f_log, stderr=f_log)
+        f_log.close()
+        return logging_file
+    else:
+        subprocess.run(bash_command, shell=True)
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    log_file = ""
+    if args.output_directory.endswith("/"):
+        out_directory = args.output_directory
+    else:
+        out_directory = args.output_directory + "/"
+    if args.verbose:
+        if "/" in args.log_output:
+            log_dir = args.log_output.rsplit("/", 1)[0]
+        else:
+            log_dir = "log/"
+        log_file = args.log_output
+        if not os.path.exists(log_dir):
+            os.makedirs(log_dir)
+        if os.path.isfile(log_file):
+            os.remove(log_file)
+        if args.log_output:
+            logging.basicConfig(
+                filename=log_file,
+                format="%(levelname)s - %(message)s",
+                level=logging.INFO,
+            )
+        else:
+            logging.basicConfig(
+                filename=log_file,
+                format="%(levelname)s - %(message)s",
+                level=logging.INFO,
+            )
+        logging.info("verbose mode on")
+    gro = ""
+    xtc = ""
+    list_nbr_sub_traj = []
+    if args.verbose:
+        logging.info("Start cut trajectory")
+    if args.gro_file and ".gro" in args.gro_file:
+        gro = os.path.basename(args.gro_file)
+    else:
+        if args.verbose:
+            logging.error("The entry is not a file or is not a .gro file")
+        sys.exit(1)
+    if args.xtc_file and ".xtc" in args.xtc_file:
+        xtc = os.path.basename(args.xtc_file)
+    else:
+        if args.verbose:
+            logging.error("The entry is not a file or is not a .xtc file")
+        sys.exit(1)
+    if ".tsv" in args.nbr_sub_traj:
+        list_nbr_sub_traj = search_nbr_sub_traj(args.nbr_sub_traj)
+        nb_sub_traj = list_nbr_sub_traj[1]
+    else:
+        nb_sub_traj = int(args.nbr_sub_traj)
+        nb_steps_time_step = search_nbr_steps_time_step(
+            args.input_check)
+        if args.start_traj and args.start_traj != "":
+            start_trajectory = int(args.start_traj)
+        else:
+            start_trajectory = 0
+        if args.end_traj and args.end_traj != "":
+            end_trajectory = int(args.end_traj)
+        else:
+            end_trajectory = nb_steps_time_step[0] - 1
+        len_trajectory = end_trajectory - start_trajectory + 1
+        list_nbr_sub_traj = [
+            len_trajectory,
+            nb_sub_traj,
+            nb_steps_time_step[1],
+            len_trajectory // nb_sub_traj,
+            start_trajectory,
+            end_trajectory,
+        ]
+    if nb_sub_traj >= (int(list_nbr_sub_traj[0]) // 2):
+        if args.verbose:
+            logging.error(
+                "The number of averages requested is greater than "
+                "half the trajectory size"
+            )
+        print(
+            "Number of requested sub-trajectories too large compared "
+            "to the size of the trajectory"
+        )
+        sys.exit(2)
+    nb_cpus = args.number_cpus
+    launch_cut_traj(
+        list_nbr_sub_traj, gro, xtc, out_directory, log_file,
+        args.group_output, nb_cpus
+    )
+
+