Mercurial > repos > agpetit > calculate_diameter

diff calculate_pore_diameter_aqp.py @ 0:e227d80e53be draft
"planemo upload for repository https://github.com/mesocentre-clermont-auvergne/aubi_piaf commit 48a10de1b21f94ab8019d9d0e4a43e0bd9d0c31e-dirty"
author: agpetit
date: Wed, 25 May 2022 07:17:24 +0000
children: a4097272ade0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/calculate_pore_diameter_aqp.py	Wed May 25 07:17:24 2022 +0000
@@ -0,0 +1,1064 @@
+#!/usr/bin/env python3
+
+# coding: utf-8
+"""
+This script allows to launch a calculation of the pore diameter of an aquaporin
+  as a function of time from a .gro file and an .xtc file.
+At the end of the script, a table containing the evolution of the distance over
+  time is created.
+# USAGE : calculate_pore_diameter_aqp.py -xtc xtc_file -gro gro_file
+  -npao npa1 Pattern (optional) -npas npa2 Pattern (optional)
+  -arro arR1 Pattern on H5 (optional) -arrs arR2 Pattern on HE (optional)
+  -v verbose (optional) -w work_directory (optional)
+  -o output_directory (optional) --gro_file_ext extension of gro file
+  (optional) --xtc_file_ext extension of xtc file (optional)
+  -log name of log file (optional) -o name of output file (optional).
+"""
+
+
+__all__ = [
+    "calculate_length_proto",
+    "search_pattern",
+    "run_one_frame_npa_arr",
+    "run_one_frame_arr",
+    "run_all_frames_npa_arr",
+    "run_all_frames_arr",
+    "create_config_file",
+]
+__author__ = "Agnès-Elisabeth Petit"
+__date__ = "29/04/2021"
+__version__ = "1.0"
+__copyright__ = "(c) 2022 CC-BY-NC-SA"
+
+# Library import
+import argparse
+import configparser
+import logging
+import os
+import re
+import sys
+
+import MDAnalysis as Mda
+
+import numpy as np
+
+import pandas as pd
+
+# Global variables
+
+# Functions
+
+
+def test_setup():
+    global args
+    args = parse_arguments()
+    args.verbose = True
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="This script allows to launch a calculation of the pore"
+                    + " diameter of an aquaporin as a function of time from"
+                    + " a .gro file and an .xtc file.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        prefix_chars="-",
+        add_help=True,
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="""Information messages to stderr""",
+    )
+    parser.add_argument(
+        "-gro", "--gro_file", type=str, nargs=1, help="""My input .gro file"""
+    )
+    parser.add_argument(
+        "-xtc", "--xtc_file", type=str, nargs=1, help="""My input .xtc file"""
+    )
+    parser.add_argument(
+        "--gro_file_ext", type=str, nargs=1, help="""My input .gro file"""
+    )
+    parser.add_argument(
+        "--xtc_file_ext", type=str, nargs=1, help="""My input .xtc file"""
+    )
+    parser.add_argument(
+        "-w", "--work_directory", type=str, nargs=1,
+        help="""It's work Directory"""
+    )
+    parser.add_argument(
+        "-od",
+        "--output_directory",
+        default="stdout",
+        type=str,
+        nargs=1,
+        help="""It's output Directory""",
+    )
+    parser.add_argument(
+        "-npao",
+        "--npa_one",
+        type=str,
+        nargs=1,
+        help="""3-letter pattern of the 3 amino acids of npa1""",
+    )
+    parser.add_argument(
+        "-npas",
+        "--npa_second",
+        type=str,
+        nargs=1,
+        help="""3-letter pattern of the 3 amino acids of npa2""",
+    )
+    parser.add_argument(
+        "-arro",
+        "--arr_one",
+        type=str,
+        default="",
+        nargs=1,
+        help="""3-letter pattern of the 4 amino acids of first ArR on H5""",
+    )
+    parser.add_argument(
+        "-arrs",
+        "--arr_second",
+        type=str,
+        default="",
+        nargs=1,
+        help="""3-letter pattern of the 4 amino acids of second ArR on HE""",
+    )
+    parser.add_argument("-o", "--output", help="""Output for files""")
+    parser.add_argument("-log", "--log_output", help="""Output for files""")
+    return parser.parse_args()
+
+
+def search_amino_acids_pattern(list_pattern, uni):
+    """
+    Description:Creates a list that contains two lists.
+      The first one contains the residue numbers and the second one the residue
+      names for each amino acid in the pattern.
+    param list_pattern: list of the names of each amino acid of the pattern.
+    param uni: MDAnalysis universe.
+    return: list containing two lists. One containing residue numbers
+      and the other containing residue names.
+    """
+    if args.verbose:
+        logging.info("\nFunction search_amino_acids_pattern")
+        logging.info("The input list is " + str(list_pattern))
+        logging.info("The input universe is " + str(uni))
+    if len(list_pattern) == 3:
+        slct_pattern = (
+            "(resname "
+            + list_pattern[0]
+            + " or resname "
+            + list_pattern[1]
+            + " or resname "
+            + list_pattern[2]
+            + ") and name CA"
+        )
+    else:
+        slct_pattern = (
+            "(resname "
+            + list_pattern[0]
+            + " or resname "
+            + list_pattern[1]
+            + " or resname "
+            + list_pattern[2]
+            + " or resname "
+            + list_pattern[3]
+            + ") and name CA"
+        )
+    if args.verbose:
+        logging.info("The selected pattern is : " + slct_pattern)
+    list_atm_npa = uni.select_atoms(slct_pattern).residues
+    list_all_num_resid = list_atm_npa.resids
+    if args.verbose:
+        logging.info(
+            "The list containing all the residue numbers associated"
+            + " with the pattern is : " + str(list_all_num_resid)
+        )
+    list_all_name_resid = list_atm_npa.resnames
+    if args.verbose:
+        logging.info(
+            "The list containing all the residue names associated with the "
+            + "pattern is : " + str(list_all_name_resid)
+        )
+    list_all_resid = [list_all_num_resid, list_all_name_resid]
+    if args.verbose:
+        logging.info(
+            "The list containing the lists of numbers and names of residues"
+            + " associated with the pattern is : " + str(list_all_resid)
+        )
+    return list_all_resid
+
+
+def calculate_length_proto(list_num_resid):
+    """
+    Description : calculate the length of each protomer according to the number
+      of amino acids corresponding to the pattern.
+    param list_num_resid: list of amino acid numbers corresponding to the
+      pattern in the protein.
+    return: list of the length of each protomer.
+    """
+    if args.verbose:
+        logging.info("\nFunction calculate_length_proto")
+        logging.info("The input list is " + str(list_num_resid))
+    list_length_proto = [1, 0, 0, 0, 0, 0, 0, 0]
+    ind_list_length_proto = 0
+    for i, v in enumerate(list_num_resid[1:]):
+        if int(v) < int(list_num_resid[i]):
+            ind_list_length_proto += 1
+            list_length_proto[ind_list_length_proto] += 1
+        else:
+            list_length_proto[ind_list_length_proto] += 1
+    if args.verbose:
+        for i, v in enumerate(list_length_proto):
+            logging.info(
+                "The length of the protomer "
+                + str(i + 1)
+                + " is "
+                + str(v)
+                + " residues"
+            )
+    return list_length_proto
+
+
+def create_config_file(list_length_proto):
+    """
+    Description : creates a Configparser object that describes the start and
+      end indices of each protomer.
+    param list_length_proto: list of the length of each protomer.
+    return: Configparser object that describes the start and end indices
+      of each protomer.
+    """
+    if args.verbose:
+        logging.info("\nFunction create_config_file")
+        logging.info("The input list is " + str(list_length_proto))
+    config = configparser.ConfigParser()
+    proto_prev = ""
+    for aqp in range(1, 3):
+        for p in range(1, 5):
+            if aqp == 1 and p == 1:
+                proto_current = "AQP" + str(aqp) + ",P" + str(p)
+                config[proto_current] = {}
+                config.set(proto_current, "start", str(0))
+                config.set(proto_current, "end", str(list_length_proto[0] - 1))
+                proto_prev = proto_current
+                if args.verbose:
+                    logging.info(proto_current + " starts at index "
+                                 + str(config["AQP1,P1"]["start"])
+                                 + " and finish at index "
+                                 + str(config["AQP1,P1"]["end"]))
+            else:
+                proto_current = "AQP" + str(aqp) + ",P" + str(p)
+                config[proto_current] = {}
+                if aqp == 1 and p == 2:
+                    config.set(proto_current, "start",
+                               str(int(config[proto_prev]["start"]) +
+                                   list_length_proto[p-2] - 1))
+                else:
+                    config.set(proto_current, "start",
+                               str(int(config[proto_prev]["start"])
+                                   + list_length_proto[p-2]))
+                config.set(proto_current, "end",
+                           str(int(config[proto_prev]["end"])
+                               + list_length_proto[p-1]))
+                proto_prev = proto_current
+                if args.verbose:
+                    logging.info(
+                        proto_current
+                        + " starts at index "
+                        + config[proto_current]["start"]
+                        + " and finish at index "
+                        + config[proto_current]["end"]
+                    )
+    if args.verbose:
+        logging.info(
+            "The sections of the settings object are : "
+            + str(config.sections())
+        )
+        logging.info(
+            "The keys for each section of the settings object are: "
+            + str(list(config["AQP1,P3"].keys()))
+        )
+    return config
+
+
+def search_pattern(list_all_resid, config, list_pattern, pattern_resid, uni):
+    """
+    Description : creates the dictionary that contains the name, number and
+      position of the residue and the key for the following dictionary.
+    param list_all_resid: list containing two lists. One containing residue
+      numbers and the other containing residue names.
+    param config: Configparser object that describes the start and end indices
+      of each protomer.
+    param list_pattern: list of the names of each amino acid of the pattern.
+    param pattern_resid: string that gives the pattern (NPA1,NPA2,ArR1 ou ArR2)
+      in which we are.
+    param uni: MDAnalysis universe
+    return: dictionary which contains as key the name of the
+      aquaporin, the protomer and the residue, as well as the residue number.
+      The value contains the name, the number and the position of the residue,
+      as well as the key of the next dictionary.
+    """
+    if args.verbose:
+        logging.info("\nFunction search_pattern")
+        logging.info("The first input argument is " + str(list_all_resid))
+        logging.info("The second input argument is a configParser object")
+        logging.info("The third input argument is " + str(list_pattern))
+        logging.info("The fourth input argument is " + str(pattern_resid))
+        logging.info("The fourth input argument is " + str(uni))
+    num_ca_npa = 0
+    num_ca_arr = 0
+    dict_resid_pattern = {}
+    str_pattern = "".join(list_pattern)
+    for aqp in range(1, 3):
+        for p in range(1, 5):
+            list_ind_temp_resid = []
+            list_ind_final_resid = []
+            proto = "AQP" + str(aqp) + ",P" + str(p)
+            ss_name_list = list(
+                list_all_resid[1][
+                    int(config[proto]["start"]): int(config[proto]["end"])
+                ]
+            )
+            ss_name_str = "".join(ss_name_list)
+            pattern = re.compile(str_pattern)
+            for v in pattern.finditer(ss_name_str):
+                if args.verbose:
+                    logging.info(
+                        "The pattern " + str(pattern) + " is searched in "
+                        + ss_name_str
+                    )
+                if aqp == 1 and p == 1:
+                    list_ind_temp_resid.append(int(v.start() / 3))
+                else:
+                    list_ind_temp_resid.append(
+                        int(v.start() / 3) + int(config[proto]["start"])
+                    )
+            if args.verbose:
+                logging.info("The temporary index list is "
+                             + str(list_ind_temp_resid))
+            for ind_current in list_ind_temp_resid:
+                val_one = list_all_resid[0][ind_current]
+                val_two = list_all_resid[0][ind_current + 1]
+                val_three = list_all_resid[0][ind_current + 2]
+                if val_one + 1 == val_two and val_one + 2 == val_three:
+                    if pattern_resid == "ArR1" or pattern_resid == "ArR2":
+                        val_four = list_all_resid[0][ind_current + 3]
+                        if val_one + 3 == val_four:
+                            list_ind_final_resid.append(ind_current)
+                    else:
+                        list_ind_final_resid.append(ind_current)
+                        list_ind_final_resid.append(ind_current + 1)
+                        list_ind_final_resid.append(ind_current + 2)
+            if len(list_ind_final_resid) == 0:
+                if args.verbose:
+                    logging.critical("The pattern is not found "
+                                     + "in the protein sequence")
+                print("The pattern is not found in the protein sequence")
+                sys.exit(1)
+            if args.verbose:
+                logging.info("The final index list is "
+                             + str(list_ind_final_resid))
+            ind_list = []
+            for ind, val_ind in enumerate(list_ind_final_resid):
+                ind_list.append(ind)
+                key = (
+                    proto
+                    + ","
+                    + list_all_resid[1][val_ind]
+                    + ","
+                    + str(list_all_resid[0][val_ind])
+                )
+                if args.verbose:
+                    logging.info("The key of dictionary is " + key)
+                slct_resid = (
+                    "(resid " + str(list_all_resid[0][val_ind])
+                    + ") and name CA"
+                )
+                if args.verbose:
+                    logging.info("The selected residue is : " + slct_resid)
+                resid = uni.select_atoms(slct_resid)
+                if args.verbose:
+                    logging.info("The AtomGroup is " + str(resid))
+                if pattern_resid == "ArR1":
+                    new_key = "ArR1," + proto
+                    if args.verbose:
+                        logging.info("The key of next dictionary is "
+                                     + new_key)
+                    position_ca = resid.positions[num_ca_arr]
+                    num_ca_arr += 1
+                elif pattern_resid == "ArR2":
+                    new_key = "ArR2," + proto
+                    if args.verbose:
+                        logging.info("The key of next dictionary is "
+                                     + new_key)
+                    position_ca = resid.positions[num_ca_arr]
+                    num_ca_arr += 1
+                else:
+                    if (ind - 3) in ind_list or pattern_resid == "NPA2":
+                        new_key = "NPA2," + proto
+                        if args.verbose:
+                            logging.info("The key of next dictionary is "
+                                         + new_key)
+                        position_ca = resid.positions[num_ca_npa]
+                    else:
+                        new_key = "NPA1," + proto
+                        if args.verbose:
+                            logging.info("The key of next dictionary is "
+                                         + new_key)
+                        position_ca = resid.positions[num_ca_npa]
+                dict_resid_pattern[key] = [
+                    list_all_resid[1][val_ind],
+                    list_all_resid[0][val_ind],
+                    position_ca,
+                    new_key,
+                ]
+                if args.verbose:
+                    logging.info(
+                        "The value associated at the key "
+                        + key
+                        + " is "
+                        + str(dict_resid_pattern[key])
+                    )
+            if len(list_ind_final_resid) > 1:
+                num_ca_npa += 1
+    if args.verbose:
+        logging.info("The dictionary contains" + str(dict_resid_pattern))
+    return dict_resid_pattern
+
+
+def search_resid_positions(dict_resid_pattern, uni):
+    """
+    Description : creates the dictionary that contains the position of the
+      barycenter of the alpha carbons of NPA and ArR.
+    param dict_resid_pattern: dictionary which contains as key the name of the
+      aquaporin, the protomer and the residue, as well as the residue number.
+      The value contains the name, the number and the position of the residue,
+      as well as the key of the next dictionary.
+    param uni: MDAnalysis universe.
+    return: dictionary which contains as key the name of the pattern,
+      the aquaporin and the protomer. The value associated with the key contain
+      the position of the barycenter of the pattern.
+    """
+    if args.verbose:
+        logging.info("\nFunction search_resid_positions")
+        logging.info("The input dictionary is " + str(dict_resid_pattern))
+        logging.info("The input universe is " + str(uni))
+    dict_pos_resid = {}
+    for npa in range(1, 3):
+        num_proto = 0
+        for aqp in range(1, 3):
+            for p in range(1, 5):
+                list_atm_npa = []
+                key_npa = "NPA" + str(npa) + ",AQP" + str(aqp) + ",P" + str(p)
+                key_arr1 = "ArR1,AQP" + str(aqp) + ",P" + str(p)
+                key_arr2 = "ArR2,AQP" + str(aqp) + ",P" + str(p)
+                for k, v in dict_resid_pattern.items():
+                    if key_npa == v[3]:
+                        list_atm_npa.append(v[1])
+                    elif key_arr1 == v[3]:
+                        dict_pos_resid[key_arr1] = v[2]
+                    elif key_arr2 == v[3]:
+                        dict_pos_resid[key_arr2] = v[2]
+                if len(list_atm_npa) > 0:
+                    slct_pattern = (
+                        "(resid "
+                        + str(list_atm_npa[0])
+                        + " or resid "
+                        + str(list_atm_npa[1])
+                        + " or resid "
+                        + str(list_atm_npa[2])
+                        + ") and name CA"
+                    )
+                    if args.verbose:
+                        logging.info("The selected pattern is " + slct_pattern)
+                    atom_group_pattern = uni.select_atoms(slct_pattern)
+                    if args.verbose:
+                        logging.info(str(atom_group_pattern))
+                    deb = 0 + 3 * num_proto
+                    fin = +3 + 3 * num_proto
+                    atm_proto_aqp = atom_group_pattern[deb:fin]
+                    if args.verbose:
+                        logging.info("deb : " + str(deb) + "   fin : "
+                                     + str(fin))
+                        logging.info("The selected AtomGroup is "
+                                     + str(atm_proto_aqp))
+                    dict_pos_resid[key_npa] = atm_proto_aqp.centroid()
+                    num_proto += 1
+    if args.verbose:
+        logging.info("The dictionary contains " + str(dict_pos_resid))
+    return dict_pos_resid
+
+
+def calculate_euclidean_distance(dict_pos_resid, choice_measure):
+    """
+    Description : creates the dictionary that contains the Euclidean distance
+      between ArR and NPA1 or NPA2.
+    param dict_pos_resid: dictionary which contains as key the name of the
+      pattern, the aquaporin and the protomer.
+      The value contains the position of the barycenter of the pattern.
+    param choice_measure: variable that contains ArR-ArR or NPA-ArR.
+    return: dictionary which contains as key the name of the couple,
+      the aquaporin and the protomer.
+      The value contains the euclidean distance between the barycenters of
+      the two motifs of the couple.
+    """
+    if args.verbose:
+        logging.info("\nFunction calculate_euclidean_distance")
+        logging.info("The input dictionary is " + str(dict_pos_resid))
+        logging.info("The distance will be calculated between"
+                     + choice_measure)
+    dict_pair_dist = {}
+    if choice_measure == "NPA-ArR":
+        for npa in range(1, 3):
+            for aqp in range(1, 3):
+                for p in range(1, 5):
+                    key_npa = "NPA" + str(npa) + ",AQP" + str(aqp) \
+                              + ",P" + str(p)
+                    key_arr1 = "ArR1,AQP" + str(aqp) + ",P" + str(p)
+                    new_key = "ArR-NPA" + str(npa) + ",AQP" + str(aqp) \
+                              + ",P" + str(p)
+                    dict_pair_dist[new_key] = np.linalg.norm(
+                        dict_pos_resid[key_npa] - dict_pos_resid[key_arr1]
+                    )
+                    if args.verbose:
+                        logging.info(
+                            "Key NPA : "
+                            + key_npa
+                            + "  value : "
+                            + str(dict_pos_resid[key_npa])
+                        )
+                        logging.info(
+                            "Key ArR : "
+                            + key_arr1
+                            + "  value : "
+                            + str(dict_pos_resid[key_arr1])
+                        )
+                        logging.info(
+                            "The value associated at the key "
+                            + new_key
+                            + " is "
+                            + str(dict_pair_dist[new_key])
+                        )
+    else:
+        for aqp in range(1, 3):
+            for p in range(1, 5):
+                key_arr1 = "ArR1,AQP" + str(aqp) + ",P" + str(p)
+                key_arr2 = "ArR2,AQP" + str(aqp) + ",P" + str(p)
+                new_key = "ArR1-ArR2,AQP" + str(aqp) + ",P" + str(p)
+                dict_pair_dist[new_key] = np.linalg.norm(
+                    dict_pos_resid[key_arr1] - dict_pos_resid[key_arr2]
+                )
+                if args.verbose:
+                    logging.info(
+                        "Key ArR1 : "
+                        + key_arr1
+                        + "  value : "
+                        + str(dict_pos_resid[key_arr1])
+                    )
+                    logging.info(
+                        "Key ArR2 : "
+                        + key_arr2
+                        + "  value : "
+                        + str(dict_pos_resid[key_arr2])
+                    )
+                    logging.info(
+                        "The value associated at the key "
+                        + new_key
+                        + " is "
+                        + str(dict_pair_dist[new_key])
+                    )
+    if args.verbose:
+        logging.info("The  dictionary contains " + str(dict_pair_dist))
+    return dict_pair_dist
+
+
+def run_one_frame_npa_arr(list_resid_npa1, list_resid_npa2, list_resid_arr,
+                          uni):
+    """
+    Description: compiles the different functions to create the dict_pair_dist
+      dictionary for one frame.
+    param list_resid_npa1: list containing the names of the residues
+      constituting the NPA1.
+    param list_resid_npa2: list containing the names of the residues
+      constituting the NPA2.
+    param list_resid_arr: list containing the names of the residues
+      constituting the ArR.
+    param uni: MDAnalysis universe.
+    return: dictionary which contains as key the name of the couple,
+      the aquaporin and the protomer.
+      The value contains the euclidean distance between the barycenters of
+      the two motifs of the couple.
+    """
+    if args.verbose:
+        logging.info("\nFunction runForOneFrame")
+        logging.info("The first input parameter is " + str(list_resid_npa1))
+        logging.info("The second input parameter is " + str(list_resid_npa2))
+        logging.info("The third input parameter is " + str(list_resid_arr))
+        logging.info("The fourth input parameter is " + str(uni))
+    choice_measure = "NPA-ArR"
+    if list_resid_npa1 == list_resid_npa2:
+        if args.verbose:
+            logging.info("Pattern NPA1 identical to pattern NPA2")
+        pattern = "NPA1"
+        list_all_resid_npa = search_amino_acids_pattern(list_resid_npa1, uni)
+        list_length_proto_npa = \
+            calculate_length_proto(list(list_all_resid_npa[0]))
+        config_npa = create_config_file(list_length_proto_npa)
+        dict_resid_pattern = search_pattern(
+            list_all_resid_npa, config_npa, list_resid_npa1, pattern, uni
+        )
+    else:
+        if args.verbose:
+            logging.info("Pattern NPA1 different from the NPA2 pattern")
+        # NPA1
+        pattern = "NPA1"
+        list_all_resid_npa = search_amino_acids_pattern(list_resid_npa1, uni)
+        list_length_proto_npa = \
+            calculate_length_proto(list(list_all_resid_npa[0]))
+        config_npa1 = create_config_file(list_length_proto_npa)
+        dict_resid_pattern = search_pattern(
+            list_all_resid_npa, config_npa1, list_resid_npa1, pattern, uni
+        )
+        # NPA2
+        pattern = "NPA2"
+        list_all_resid_npa = search_amino_acids_pattern(list_resid_npa2, uni)
+        list_length_proto_npa = \
+            calculate_length_proto(list(list_all_resid_npa[0]))
+        config_npa2 = create_config_file(list_length_proto_npa)
+        dict_resid_pattern.update(
+            search_pattern(
+                list_all_resid_npa, config_npa2, list_resid_npa2, pattern, uni
+            )
+        )
+    # ArR
+    pattern = "ArR1"
+    list_all_resid_arr = search_amino_acids_pattern(list_resid_arr, uni)
+    list_length_proto_arr = \
+        calculate_length_proto(list(list_all_resid_arr[0]))
+    config_arr = create_config_file(list_length_proto_arr)
+    dict_resid_pattern.update(
+        search_pattern(list_all_resid_arr, config_arr, list_resid_arr,
+                       pattern, uni)
+    )
+    # dict position
+    dict_pos_resid = search_resid_positions(dict_resid_pattern, uni)
+    # dict distance
+    dict_pair_dist = \
+        calculate_euclidean_distance(dict_pos_resid, choice_measure)
+    return dict_pair_dist
+
+
+def run_one_frame_arr(list_resid_arr1, list_resid_arr2, uni):
+    """
+    Description: compiles the different functions to create the dict_pair_dist
+      dictionary for one frame.
+    param list_resid_arr1: list containing the names of the residues
+      constituting the ArR1.
+    param list_resid_arr2: list_resid_arr2, list containing the names of
+      the residues constituting the ArR2.
+    param uni: MDAnalysis universe.
+    return: dictionary which contains as key the name of the couple,
+      the aquaporin and the protomer.
+      The value contains the euclidean distance between the barycenters of
+      the two motifs of the couple.
+    """
+    if args.verbose:
+        logging.info("\nFunction runForOneFrame")
+        logging.info("The first input parameter is " + str(list_resid_arr1))
+        logging.info("The second input parameter is " + str(list_resid_arr2))
+        logging.info("The third input parameter is " + str(uni))
+    dict_resid_pattern = {}
+    # ArR1
+    pattern = "ArR1"
+    list_all_resid_arr1 = search_amino_acids_pattern(list_resid_arr1, uni)
+    list_length_proto_arr1 = \
+        calculate_length_proto(list(list_all_resid_arr1[0]))
+    config_arr1 = create_config_file(list_length_proto_arr1)
+    dict_resid_pattern.update(
+        search_pattern(list_all_resid_arr1, config_arr1, list_resid_arr1,
+                       pattern, uni)
+    )
+    # ArR2
+    pattern = "ArR2"
+    list_all_resid_arr2 = search_amino_acids_pattern(list_resid_arr2, uni)
+    list_length_proto_arr2 = \
+        calculate_length_proto(list(list_all_resid_arr2[0]))
+    config_arr2 = create_config_file(list_length_proto_arr2)
+    dict_resid_pattern.update(
+        search_pattern(list_all_resid_arr2, config_arr2, list_resid_arr2,
+                       pattern, uni)
+    )
+    # dict position
+    dict_pos_resid = search_resid_positions(dict_resid_pattern, uni)
+    # dict distance
+    choice_measure = "ArR-ArR"
+    dict_pair_dist = \
+        calculate_euclidean_distance(dict_pos_resid, choice_measure)
+    return dict_pair_dist
+
+
+def run_all_frames_npa_arr(
+    list_resid_npa1, list_resid_npa2, list_resid_arr, col_number, uni
+):
+    """
+    Description: creates the table containing the pore diameter of an aquaporin
+      over time.
+    param list_resid_npa1: list containing the names of the residues
+      constituting the NPA1.
+    param list_resid_npa2: list containing the names of the residues
+      constituting the NPA2.
+    param list_resid_arr: list containing the names of the residues
+      constituting the ArR.
+    param col_number: number of columns in the table
+    param uni: MDAnalysis universe.
+    return: list containing an array and a key list. The table containing the
+      pore diameter of an aquaporin over time.
+    """
+    if args.verbose:
+        logging.info("\nFunction runForAllFrame")
+        logging.info("The first input parameter is " + str(list_resid_npa1))
+        logging.info("The second input parameter is " + str(list_resid_npa2))
+        logging.info("The third input parameter is " + str(list_resid_arr))
+        logging.info("The fourth input parameter is " + str(col_number))
+        logging.info("The fifth input parameter is " + str(uni))
+    nbr_frame = len(uni.trajectory)
+    if args.verbose:
+        logging.info("The number of frame is " + str(nbr_frame))
+    tab_pair_dist = np.empty((nbr_frame, col_number))
+    if args.verbose:
+        logging.info(
+            "The dimensions of the table of means and standard deviation are"
+            + " as follows " + str(tab_pair_dist.shape)
+        )
+    list_key = []
+    pattern_arr = ""
+    pattern_npa1 = ""
+    pattern_npa2 = ""
+    dict_prot = {
+        "ALA": "A",
+        "CYS": "C",
+        "ASP": "D",
+        "GLU": "E",
+        "PHE": "F",
+        "GLY": "G",
+        "HIS": "H",
+        "ILE": "I",
+        "LYS": "K",
+        "LEU": "L",
+        "MET": "M",
+        "ASN": "N",
+        "PRO": "P",
+        "GLN": "Q",
+        "ARG": "R",
+        "SER": "S",
+        "THR": "T",
+        "VAL": "V",
+        "TRP": "W",
+        "TYR": "Y",
+        "HSD": "H",
+        "HSP": "H",
+    }
+    for aa in list_resid_arr:
+        pattern_arr += dict_prot[aa]
+    for aa in list_resid_npa1:
+        pattern_npa1 += dict_prot[aa]
+    for aa in list_resid_npa2:
+        pattern_npa2 += dict_prot[aa]
+    for ts in uni.trajectory:
+        if args.verbose:
+            logging.info("Frame " + str(uni.trajectory.frame))
+            logging.info(str(ts))
+        dict_pair_dist = run_one_frame_npa_arr(
+            list_resid_npa1, list_resid_npa2, list_resid_arr, uni
+        )
+        tab_pair_dist[uni.trajectory.frame][0] = uni.trajectory.time
+        list_key = ["Time in ps"]
+        num_npa_proto = 0
+        for npa in range(1, 3):
+            if npa == 1:
+                pattern_complete = "," + pattern_arr + "-" + pattern_npa1 + "1"
+            else:
+                pattern_complete = "," + pattern_arr + "-" + pattern_npa2 + "2"
+            for aqp in range(1, 3):
+                for p in range(1, 5):
+                    num_npa_proto += 1
+                    key1 = "ArR-NPA" + str(npa) + ",AQP" + str(aqp)\
+                           + ",P" + str(p)
+                    key2 = "AQP" + str(aqp) + ",P" + str(p) + pattern_complete\
+                           + " (Angstroms)"
+                    tab_pair_dist[uni.trajectory.frame][num_npa_proto] = \
+                        dict_pair_dist[key1]
+                    list_key.append(key2)
+    list_out = [tab_pair_dist, list_key]
+    return list_out
+
+
+def run_all_frames_arr(list_resid_arr1, list_resid_arr2, col_number, uni):
+    """
+    Description: creates the table containing the pore diameter of an aquaporin
+      over time.
+    param list_resid_arr1: list containing the names of the residues
+      constituting the ArR1.
+    param list_resid_arr2: list containing the names of the residues
+      constituting the ArR2.
+    param col_number: number of columns in the table
+      param uni: MDAnalysis universe.
+    return: list containing an array and a key list. The table containing the
+      pore diameter of an aquaporin over time.
+    """
+    if args.verbose:
+        logging.info("\nFunction runForAllFrame")
+        logging.info("The first input parameter is " + str(list_resid_arr1))
+        logging.info("The second input parameter is " + str(list_resid_arr2))
+        logging.info("The third input parameter is " + str(col_number))
+        logging.info("The fourth input parameter is " + str(uni))
+    nbr_frame = len(uni.trajectory)
+    if args.verbose:
+        logging.info("The number of frame is " + str(nbr_frame))
+    tab_pair_dist = np.empty((nbr_frame, col_number))
+    if args.verbose:
+        logging.info(
+            "The dimensions of the table of means and standard deviation"
+            + " are as follows " + str(tab_pair_dist.shape)
+        )
+    list_key = []
+    pattern1 = ""
+    pattern2 = ""
+    dict_prot = {
+        "ALA": "A",
+        "CYS": "C",
+        "ASP": "D",
+        "GLU": "E",
+        "PHE": "F",
+        "GLY": "G",
+        "HIS": "H",
+        "ILE": "I",
+        "LYS": "K",
+        "LEU": "L",
+        "MET": "M",
+        "ASN": "N",
+        "PRO": "P",
+        "GLN": "Q",
+        "ARG": "R",
+        "SER": "S",
+        "THR": "T",
+        "VAL": "V",
+        "TRP": "W",
+        "TYR": "Y",
+        "HSD": "H",
+        "HSP": "H",
+    }
+    for aa in list_resid_arr1:
+        pattern1 += dict_prot[aa]
+    for aa in list_resid_arr2:
+        pattern2 += dict_prot[aa]
+    for ts in uni.trajectory:
+        if args.verbose:
+            logging.info("Frame " + str(uni.trajectory.frame))
+            logging.info(str(ts))
+        dict_pair_dist = run_one_frame_arr(list_resid_arr1, list_resid_arr2,
+                                           uni)
+        tab_pair_dist[uni.trajectory.frame][0] = uni.trajectory.time
+        list_key = ["Time (ps)"]
+        num_proto = 0
+        for aqp in range(1, 3):
+            for p in range(1, 5):
+                num_proto += 1
+                key1 = "ArR1-ArR2,AQP" + str(aqp) + ",P" + str(p)
+                key2 = (
+                    "AQP" + str(aqp) + ",P" + str(p) + "," + pattern1 +
+                    "-" + pattern2 + " (Angstroms)"
+                )
+                tab_pair_dist[uni.trajectory.frame][num_proto] = \
+                    dict_pair_dist[key1]
+                list_key.append(key2)
+    list_out = [tab_pair_dist, list_key]
+    return list_out
+
+
+def create_final_dataframe(
+        list_resid_arr1, list_resid_arr2, list_resid_npa1, list_resid_npa2, uni
+):
+    """
+    Description: creates the final dataframe.
+    param list_resid_arr1: list containing the arr1 pattern
+    param list_resid_arr2: list containing the arr2 pattern
+    param list_resid_npa1: list containing the npa1 pattern
+    param list_resid_npa2: list containing the npa2 pattern
+    param uni: MDAnalysis universe
+    return: final dataframe
+    """
+    if args.verbose:
+        logging.info("\nFunction runForAllFrame")
+        logging.info("The first input parameter is " + str(list_resid_arr1))
+        logging.info("The second input parameter is " + str(list_resid_arr2))
+        logging.info("The third input parameter is " + str(list_resid_npa1))
+        logging.info("The fourth input parameter is " + str(list_resid_npa2))
+        logging.info("The fifth input parameter is " + str(uni))
+    if list_resid_npa1 is None and list_resid_npa2 is None:
+        if len(list_resid_arr1) >= 4 and len(list_resid_arr2) >= 4:
+            columns_number = 9
+            list_out = run_all_frames_arr(
+                list_resid_arr1, list_resid_arr2, columns_number, uni
+            )
+        else:
+            if args.verbose:
+                if len(list_resid_arr1) < 4 <= len(list_resid_arr2):
+                    logging.critical("Pattern1 must be provided")
+                elif len(list_resid_arr1) < 4 and len(list_resid_arr2) < 4:
+                    logging.critical("Pattern1 and Pattern2 must be provided")
+                elif len(list_resid_arr1) >= 4 > len(list_resid_arr2):
+                    logging.critical("Pattern2 must be provided")
+            print("Pattern error : please read the log file or help")
+            sys.exit(2)
+    elif list_resid_npa1 is None and list_resid_npa2 is not None:
+        if args.verbose:
+            if len(list_resid_arr1) < 4 <= len(list_resid_arr2):
+                logging.critical("PatternArr1 must be provided")
+                logging.critical("Please, do not put patternArr2")
+                logging.critical("PatternNPA1 must be provided")
+            elif len(list_resid_arr1) < 4 and len(list_resid_arr2) < 4:
+                logging.critical("PatternArr1 must be provided")
+                logging.critical("PatternNPA1 must be provided")
+            elif len(list_resid_arr1) >= 4 > len(list_resid_arr2):
+                logging.critical("PatternNPA1 must be provided")
+            else:
+                logging.critical("Please, do not put patternArr2")
+                logging.critical("PatternNPA1 must be provided")
+        print("Pattern error : please read the log file or help")
+        sys.exit(2)
+    elif list_resid_npa1 is not None and list_resid_npa2 is None:
+        if args.verbose:
+            if len(list_resid_arr1) < 4 <= len(list_resid_arr2):
+                logging.critical("PatternArr1 must be provided")
+                logging.critical("Please, do not put patternArr2")
+                logging.critical("PatternNPA2 must be provided")
+            elif len(list_resid_arr1) < 4 and len(list_resid_arr2) < 4:
+                logging.critical("PatternArr1 must be provided")
+                logging.critical("PatternNPA2 must be provided")
+            elif len(list_resid_arr1) >= 4 > len(list_resid_arr2):
+                logging.critical("PatternNPA1 must be provided")
+            else:
+                logging.critical("Please, do not put patternArr2")
+                logging.critical("PatternNPA2 must be provided")
+        print("Pattern error : please read the log file or help")
+        sys.exit(2)
+    else:
+        if (
+            list_resid_npa1 is not None
+            and list_resid_npa2 is not None
+            and len(list_resid_arr1) >= 4 > len(list_resid_arr2)
+        ):
+            columns_number = 17
+            list_out = run_all_frames_npa_arr(
+                list_resid_npa1, list_resid_npa2, list_resid_arr1,
+                columns_number, uni
+            )
+        else:
+            if len(list_resid_arr1) < 4 <= len(list_resid_arr2):
+                logging.critical("PatternArr1 must be provided")
+                logging.critical("Please, do not put patternArr2")
+            elif len(list_resid_arr1) < 4 and len(list_resid_arr2) < 4:
+                logging.critical("Pattern1 must be provided")
+            print("Pattern error : please read the log file or help")
+            sys.exit(2)
+    list_std = list(np.std(list_out[0], axis=0))
+    list_mean = list(np.mean(list_out[0], axis=0))
+    np_mean_around = list(np.around(list_mean, decimals=3))
+    np_std_around = list(np.around(list_std, decimals=3))
+    np_mean_around[0] = "Mean"
+    np_std_around[0] = "Std"
+    np_mean_around = np.array([np_mean_around])
+    np_std_around = np.array([np_std_around])
+    np_mean_std_around = np.concatenate((np_mean_around, np_std_around),
+                                        axis=0)
+    tab_pair_dist = np.concatenate((np_mean_std_around, list_out[0]), axis=0)
+    df_dist = pd.DataFrame(tab_pair_dist, columns=list_out[1])
+    return df_dist
+
+
+# Class
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    out_file_name = "test"
+    file_name_log = out_file_name + ".log"
+    if args.verbose:
+        if args.log_output:
+            if os.path.isfile(args.log_output):
+                os.remove(args.log_output)
+            logging.basicConfig(
+                filename=args.log_output,
+                format="%(levelname)s - %(message)s",
+                level=logging.INFO,
+            )
+        else:
+            if os.path.isfile(file_name_log):
+                os.remove(file_name_log)
+            logging.basicConfig(
+                filename=file_name_log,
+                format="%(levelname)s - %(message)s",
+                level=logging.INFO,
+            )
+        logging.info("verbose mode on")
+    gro_file = ''
+    xtc_file = ''
+    if args.gro_file:
+        gro_file = os.path.basename(args.gro_file[0])
+    else:
+        logging.critical("A .gro file must be provided")
+    if args.xtc_file:
+        xtc_file = os.path.basename(args.xtc_file[0])
+    else:
+        logging.critical("A .xtc file must be provided")
+    if args.work_directory:
+        work_directory = args.work_directory[0]
+    else:
+        work_directory = os.path.dirname(args.gro_file[0])
+        if not work_directory.endswith("/"):
+            work_directory += "/"
+    output_directory = args.output_directory[0]
+    if args.output:
+        output = args.output
+    else:
+        output = ""
+    os.chdir(work_directory)
+    if args.gro_file_ext and args.xtc_file_ext:
+        u = Mda.Universe(
+            args.gro_file[0],
+            args.xtc_file[0],
+            topology_format=args.gro_file_ext[0],
+            format=args.xtc_file_ext[0],
+        )
+    else:
+        u = Mda.Universe(args.gro_file[0], args.xtc_file[0])
+    if args.verbose:
+        logging.info("Load universe : " + str(u))
+    if args.npa_one:
+        resid_npa1 = args.npa_one[0].split()
+    else:
+        resid_npa1 = args.npa_one
+    if args.npa_second:
+        resid_npa2 = args.npa_second[0].split()
+    else:
+        resid_npa2 = args.npa_second
+    if args.arr_one:
+        resid_arr1 = args.arr_one[0].split()
+    else:
+        resid_arr1 = args.arr_one
+    if args.arr_second:
+        resid_arr2 = args.arr_second[0].split()
+    else:
+        resid_arr2 = args.arr_second
+    df_final = create_final_dataframe(resid_arr1, resid_arr2,
+                                      resid_npa1, resid_npa2, u)
+    if len(output_directory) > 2:
+        if not os.path.exists(output_directory):
+            os.makedirs(output_directory)
+        os.chdir(output_directory)
+    if len(output) > 2:
+        df_final.to_csv(output, sep="\t", index=False)
+    else:
+        file_name_tabular = out_file_name + "PairDist.tabular"
+        df_final.to_csv(file_name_tabular, sep="\t", index=False)
author	agpetit
date	Wed, 25 May 2022 07:17:24 +0000
parents
children	a4097272ade0