Mercurial > repos > p.lucas > abhaddock
view ab_haddock_format.py @ 0:6a4d5446c123 draft
Uploaded python script
author | p.lucas |
---|---|
date | Thu, 06 May 2021 08:46:53 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright 2020: # Francesco Ambrosetti # """ Formats the antibody to fit the HADDOCK requirements with the specified chain id and returns the list of residues belonging to the HV loops defined according to the HADDOCK friendly format. *** The antibody has to be numbered according to the Chothia scheme *** Usage: python haddock-format.py <chothia numbered antibody> <output .pdb file> <chain_id> Example: python 4G6K_ch.pdb 4G6K-HADDOCK.pdb A Author: {0} Email: {1} """ import argparse import biopandas.pdb as bp import copy as cp import os import sys __author__ = "Francesco Ambrosetti" __email__ = "ambrosetti.francesco@gmail.com" USAGE = __doc__.format(__author__, __email__) def check_input(): """ Check and collect the script inputs Returns: args.pdb (str): path to the pdb-file args.chain (str): chain id to use for the HADDOCK-formatted structure """ # Parse command line arguments parser = argparse.ArgumentParser( description=USAGE, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('pdb', help='Path to the Chothia numbered antibody PDB structure', type=str) parser.add_argument('out', help='Path to the output PDB file', type=str) parser.add_argument('chain', help='Chain id to use for the HADDOCK-formatted PDB structure', type=str) args = parser.parse_args() if not os.path.isfile(args.pdb): emsg = 'ERROR!! File {0} not found or not readable\n'.format(args.pdb) sys.stderr.write(emsg) sys.exit(1) if not args.pdb.endswith(".pdb"): emsg = 'ERROR!! File {0} not recognize as a PDB file\n'.format(args.pdb) sys.stderr.write(emsg) sys.exit(1) return args.pdb, args.out, args.chain def unique(sequence): seen = set() return [x for x in sequence if not (x in seen or seen.add(x))] class AbHaddockFormat: """Class to renumber a Chothia antibody to make it HADDOCK-ready""" # Loops # L chain loops l1 = ['26_L', '27_L', '28_L', '29_L', '30_L', '30A_L', '30B_L', '30C_L', '30D_L', '30E_L', '30F_L', '31_L', '32_L'] l2 = ['50_L', '50A_L', '50B_L', '50C_L', '50D_L', '50E_L', '50F_L', '51_L', '52_L'] l3 = ['91_L', '92_L', '93_L', '94_L', '95_L', '95A_L', '95B_L', '95C_L', '95D_L', '95E_L', '95F_L', '96_L'] loops_l = l1 + l2 + l3 # H chain loops h1 = ['26_H', '27_H', '28_H', '29_H', '30_H', '31_H', '31A_H', '31B_H', '31C_H', '31D_H', '31E_H', '31F_H', '32_H'] h2 = ['52A_H', '52B_H', '52C_H', '52D_H', '52E_H', '52F_H', '53_H', '54_H', '55_H'] h3 = ['96_H', '97_H', '98_H', '99_H', '100_H', '100A_H', '100B_H', '100C_H', '100D_H', '100E_H', '100F_H', '100G_H', '100H_H', '100I_H', '100J_H', '100K_H', '101_H'] loops_h = h1 + h2 + h3 def __init__(self, pdbfile, chain): """ Constructor for the AbHaddockFormat class Args: pdbfile (str): path to the antibody .pdb file chain (str): chain id to use for the HADDOCK-ready structure """ self.file = pdbfile self.pdb = bp.PandasPdb().read_pdb(self.file) self.chain = chain def check_chain(self): """ Check if the antibody contains the light and heavy chain Returns: 0 """ chain_ids = self.pdb.df['ATOM']['chain_id'].values if 'H' not in chain_ids: emsg = 'ERROR!! File {0} does not contain the heavy chain\n'.format(self.file) sys.stderr.write(emsg) sys.exit(1) elif 'L' not in chain_ids: emsg = 'ERROR!! File {0} does not contain the light chain\n'.format(self.file) sys.stderr.write(emsg) sys.exit(1) return 0 def ab_format(self): """ Renumbers the antibody and extract the HV residues Returns: hv_list (list): list of the HV residue numbers new_pdb (biopandas.pdb.pandas_pdb.PandasPdb): HADDOCK-ready pdb """ # Check antibody chain ids self.check_chain() # Modify resno to include insertions and chain id resno = self.pdb.df['ATOM']['residue_number'].values ins = self.pdb.df['ATOM']['insertion'].values chain = self.pdb.df['ATOM']['chain_id'].values ch_resno = ['{0}{1}_{2}'.format(i, j, c) for i, j, c in zip(resno, ins, chain)] # Create new resno count = 0 prev_resid = None new_resno = [] # Renumber for r in ch_resno: if r != prev_resid: count += 1 new_resno.append(count) prev_resid = r elif r == prev_resid: new_resno.append(count) prev_resid = r # Update pdb new_pdb = cp.deepcopy(self.pdb) new_pdb.df['ATOM']['chain_id'] = self.chain new_pdb.df['ATOM']['residue_number'] = new_resno new_pdb.df['ATOM']['insertion'] = '' # Remove insertions # Create dictionary with old and new numbering resno_dict = dict(zip(unique(ch_resno), unique(new_resno))) # Collect HV residues with the new numbering hv_list = [] # Heavy chain for hv_heavy in self.loops_h: if hv_heavy in resno_dict.keys(): hv_list.append(resno_dict[hv_heavy]) # Light chain for hv_light in self.loops_l: if hv_light in resno_dict.keys(): hv_list.append(resno_dict[hv_light]) hv_list.sort() return hv_list, new_pdb if __name__ == '__main__': # Get inputs pdb_file, out_file, chain_id = check_input() # Renumber pdb file and get HV residues pdb_format = AbHaddockFormat(pdb_file, chain_id) hv_resno, pdb_ren = pdb_format.ab_format() # Write pdb into a file pdb_ren.to_pdb(path=out_file, records=['ATOM'], append_newline=True) # Print HV residues print(','.join(map(str, hv_resno)))