#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2020:
#   Francesco Ambrosetti
#
# Provenance: this file was added as ``ab_haddock_format.py`` by the HG
# changeset 6a4d5446c12357ff99d6940471becbf91aece7ac ("Uploaded python
# script", user p.lucas, date 1620290813 0).

"""
Formats the antibody to fit the HADDOCK requirements with the
specified chain id and returns the list of residues belonging
to the HV loops defined according to the HADDOCK friendly format.

*** The antibody has to be numbered according to the Chothia scheme ***

Usage:
    python ab_haddock_format.py <pdb> <out> <chain>

Example:
    python ab_haddock_format.py 4G6K_ch.pdb 4G6K-HADDOCK.pdb A

Author: {0}
Email: {1}
"""

import argparse
import copy as cp
import os
import sys

__author__ = "Francesco Ambrosetti"
__email__ = "ambrosetti.francesco@gmail.com"
# ``__doc__`` is None when Python runs with -OO (docstrings stripped); fall
# back to an empty description instead of crashing at import time.
USAGE = (__doc__ or "").format(__author__, __email__)


def check_input():
    """
    Check and collect the script inputs

    Exits with status 1 (after writing a message to stderr) when the input
    path is not an existing file or does not end with ".pdb".

    Returns:
        args.pdb (str): path to the pdb-file
        args.out (str): path to the output pdb-file
        args.chain (str): chain id to use for the HADDOCK-formatted structure
    """

    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=USAGE,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('pdb', help='Path to the Chothia numbered antibody PDB structure', type=str)
    parser.add_argument('out', help='Path to the output PDB file', type=str)
    parser.add_argument('chain', help='Chain id to use for the HADDOCK-formatted PDB structure', type=str)

    args = parser.parse_args()

    if not os.path.isfile(args.pdb):
        emsg = 'ERROR!! File {0} not found or not readable\n'.format(args.pdb)
        sys.stderr.write(emsg)
        sys.exit(1)

    if not args.pdb.endswith(".pdb"):
        emsg = 'ERROR!! File {0} not recognize as a PDB file\n'.format(args.pdb)
        sys.stderr.write(emsg)
        sys.exit(1)

    return args.pdb, args.out, args.chain


def unique(sequence):
    """
    Return the items of `sequence` without duplicates, keeping the order of
    first appearance

    Args:
        sequence (iterable): hashable items

    Returns:
        list: de-duplicated items in first-seen order
    """
    seen = set()
    # ``seen.add`` returns None, so the ``or`` records the item as a side
    # effect while keeping the filter condition falsy for unseen items.
    return [x for x in sequence if not (x in seen or seen.add(x))]


class AbHaddockFormat:
    """Class to renumber a Chothia antibody to make it HADDOCK-ready"""

    # Loops
    # Each entry is '<residue number><insertion code>_<chain id>'.
    # L chain loops
    l1 = ['26_L', '27_L', '28_L', '29_L', '30_L', '30A_L', '30B_L', '30C_L', '30D_L', '30E_L', '30F_L', '31_L', '32_L']
    l2 = ['50_L', '50A_L', '50B_L', '50C_L', '50D_L', '50E_L', '50F_L', '51_L', '52_L']
    l3 = ['91_L', '92_L', '93_L', '94_L', '95_L', '95A_L', '95B_L', '95C_L', '95D_L', '95E_L', '95F_L', '96_L']
    loops_l = l1 + l2 + l3

    # H chain loops
    h1 = ['26_H', '27_H', '28_H', '29_H', '30_H', '31_H', '31A_H', '31B_H', '31C_H', '31D_H', '31E_H', '31F_H', '32_H']
    # NOTE(review): '52_H' itself is not listed here - confirm this is intended.
    h2 = ['52A_H', '52B_H', '52C_H', '52D_H', '52E_H', '52F_H', '53_H', '54_H', '55_H']
    h3 = ['96_H', '97_H', '98_H', '99_H', '100_H', '100A_H', '100B_H', '100C_H', '100D_H', '100E_H', '100F_H', '100G_H',
          '100H_H', '100I_H', '100J_H', '100K_H', '101_H']
    loops_h = h1 + h2 + h3

    def __init__(self, pdbfile, chain):
        """
        Constructor for the AbHaddockFormat class
        Args:
            pdbfile (str): path to the antibody .pdb file
            chain (str): chain id to use for the HADDOCK-ready structure
        """
        # Imported here rather than at module level so that ``--help`` and
        # the input validation still work when biopandas is not installed.
        import biopandas.pdb as bp

        self.file = pdbfile
        self.pdb = bp.PandasPdb().read_pdb(self.file)
        self.chain = chain

    def check_chain(self):
        """
        Check if the antibody contains the light and heavy chain

        Exits with status 1 (after writing a message to stderr) when chain
        'H' or chain 'L' is missing from the ATOM records; the heavy chain
        is checked first.

        Returns:
            0
        """
        chain_ids = self.pdb.df['ATOM']['chain_id'].values

        for chain_id, label in (('H', 'heavy'), ('L', 'light')):
            if chain_id not in chain_ids:
                emsg = 'ERROR!! File {0} does not contain the {1} chain\n'.format(self.file, label)
                sys.stderr.write(emsg)
                sys.exit(1)

        return 0

    def ab_format(self):
        """
        Renumbers the antibody and extract the HV residues

        Every run of consecutive ATOM records sharing the same
        (residue number, insertion code, chain id) gets the next number of a
        single counter starting at 1; the chain id is replaced by
        `self.chain` and insertion codes are cleared. `self.pdb` itself is
        not modified.

        Returns:
            hv_list (list): sorted list of the HV residue numbers (new numbering)
            new_pdb (biopandas.pdb.pandas_pdb.PandasPdb): HADDOCK-ready pdb
        """

        # Check antibody chain ids
        self.check_chain()

        # Modify resno to include insertions and chain id
        atoms = self.pdb.df['ATOM']
        resno = atoms['residue_number'].values
        ins = atoms['insertion'].values
        chain = atoms['chain_id'].values
        ch_resno = ['{0}{1}_{2}'.format(i, j, c) for i, j, c in zip(resno, ins, chain)]

        # Renumber, recording the old -> new mapping while numbering. Building
        # the mapping here (instead of zipping two de-duplicated lists
        # afterwards) keeps it consistent with the written numbers even when
        # a residue id shows up again later in the file; in that case the
        # number given to its first occurrence is kept.
        count = 0
        prev_resid = None
        new_resno = []
        resno_dict = {}

        for resid in ch_resno:
            if resid != prev_resid:
                count += 1
                prev_resid = resid
                resno_dict.setdefault(resid, count)
            new_resno.append(count)

        # Update pdb
        new_pdb = cp.deepcopy(self.pdb)
        new_pdb.df['ATOM']['chain_id'] = self.chain
        new_pdb.df['ATOM']['residue_number'] = new_resno
        new_pdb.df['ATOM']['insertion'] = ''  # Remove insertions

        # Collect HV residues (heavy and light chain) with the new numbering
        hv_list = sorted(resno_dict[hv] for hv in self.loops_h + self.loops_l if hv in resno_dict)

        return hv_list, new_pdb


def main():
    """
    Command-line entry point: renumber the input antibody, write the
    HADDOCK-ready ATOM records to the output file and print the HV residue
    numbers as a comma-separated list
    """

    # Get inputs
    pdb_file, out_file, chain_id = check_input()

    # Renumber pdb file and get HV residues
    pdb_format = AbHaddockFormat(pdb_file, chain_id)
    hv_resno, pdb_ren = pdb_format.ab_format()

    # Write pdb into a file
    pdb_ren.to_pdb(path=out_file, records=['ATOM'], append_newline=True)

    # Print HV residues
    print(','.join(map(str, hv_resno)))


if __name__ == '__main__':
    main()