comparison ab_haddock_format.py @ 0:6a4d5446c123 draft

Uploaded python script
author p.lucas
date Thu, 06 May 2021 08:46:53 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6a4d5446c123
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2020:
5 # Francesco Ambrosetti
6 #
7
8 """
9 Formats the antibody to fit the HADDOCK requirements with the
10 specified chain id and returns the list of residues belonging
11 to the HV loops defined according to the HADDOCK friendly format.
12
13 *** The antibody has to be numbered according to the Chothia scheme ***
14
15 Usage:
16 python haddock-format.py <chothia numbered antibody> <output .pdb file> <chain_id>
17
18 Example:
19 python 4G6K_ch.pdb 4G6K-HADDOCK.pdb A
20
21 Author: {0}
22 Email: {1}
23 """
24
25 import argparse
26 import biopandas.pdb as bp
27 import copy as cp
28 import os
29 import sys
30
31 __author__ = "Francesco Ambrosetti"
32 __email__ = "ambrosetti.francesco@gmail.com"
33 USAGE = __doc__.format(__author__, __email__)
34
35
36 def check_input():
37 """
38 Check and collect the script inputs
39 Returns:
40 args.pdb (str): path to the pdb-file
41 args.chain (str): chain id to use for the HADDOCK-formatted structure
42 """
43
44 # Parse command line arguments
45 parser = argparse.ArgumentParser(
46 description=USAGE,
47 formatter_class=argparse.RawTextHelpFormatter)
48 parser.add_argument('pdb', help='Path to the Chothia numbered antibody PDB structure', type=str)
49 parser.add_argument('out', help='Path to the output PDB file', type=str)
50 parser.add_argument('chain', help='Chain id to use for the HADDOCK-formatted PDB structure', type=str)
51
52 args = parser.parse_args()
53
54 if not os.path.isfile(args.pdb):
55 emsg = 'ERROR!! File {0} not found or not readable\n'.format(args.pdb)
56 sys.stderr.write(emsg)
57 sys.exit(1)
58
59 if not args.pdb.endswith(".pdb"):
60 emsg = 'ERROR!! File {0} not recognize as a PDB file\n'.format(args.pdb)
61 sys.stderr.write(emsg)
62 sys.exit(1)
63
64 return args.pdb, args.out, args.chain
65
66
67 def unique(sequence):
68 seen = set()
69 return [x for x in sequence if not (x in seen or seen.add(x))]
70
71
72 class AbHaddockFormat:
73 """Class to renumber a Chothia antibody to make it HADDOCK-ready"""
74
75 # Loops
76 # L chain loops
77 l1 = ['26_L', '27_L', '28_L', '29_L', '30_L', '30A_L', '30B_L', '30C_L', '30D_L', '30E_L', '30F_L', '31_L', '32_L']
78 l2 = ['50_L', '50A_L', '50B_L', '50C_L', '50D_L', '50E_L', '50F_L', '51_L', '52_L']
79 l3 = ['91_L', '92_L', '93_L', '94_L', '95_L', '95A_L', '95B_L', '95C_L', '95D_L', '95E_L', '95F_L', '96_L']
80 loops_l = l1 + l2 + l3
81
82 # H chain loops
83 h1 = ['26_H', '27_H', '28_H', '29_H', '30_H', '31_H', '31A_H', '31B_H', '31C_H', '31D_H', '31E_H', '31F_H', '32_H']
84 h2 = ['52A_H', '52B_H', '52C_H', '52D_H', '52E_H', '52F_H', '53_H', '54_H', '55_H']
85 h3 = ['96_H', '97_H', '98_H', '99_H', '100_H', '100A_H', '100B_H', '100C_H', '100D_H', '100E_H', '100F_H', '100G_H',
86 '100H_H', '100I_H', '100J_H', '100K_H', '101_H']
87 loops_h = h1 + h2 + h3
88
89 def __init__(self, pdbfile, chain):
90 """
91 Constructor for the AbHaddockFormat class
92 Args:
93 pdbfile (str): path to the antibody .pdb file
94 chain (str): chain id to use for the HADDOCK-ready structure
95 """
96 self.file = pdbfile
97 self.pdb = bp.PandasPdb().read_pdb(self.file)
98 self.chain = chain
99
100 def check_chain(self):
101 """
102 Check if the antibody contains the light and heavy chain
103 Returns:
104 0
105 """
106 chain_ids = self.pdb.df['ATOM']['chain_id'].values
107
108 if 'H' not in chain_ids:
109 emsg = 'ERROR!! File {0} does not contain the heavy chain\n'.format(self.file)
110 sys.stderr.write(emsg)
111 sys.exit(1)
112
113 elif 'L' not in chain_ids:
114 emsg = 'ERROR!! File {0} does not contain the light chain\n'.format(self.file)
115 sys.stderr.write(emsg)
116 sys.exit(1)
117
118 return 0
119
120 def ab_format(self):
121 """
122 Renumbers the antibody and extract the HV residues
123
124 Returns:
125 hv_list (list): list of the HV residue numbers
126 new_pdb (biopandas.pdb.pandas_pdb.PandasPdb): HADDOCK-ready pdb
127 """
128
129 # Check antibody chain ids
130 self.check_chain()
131
132 # Modify resno to include insertions and chain id
133 resno = self.pdb.df['ATOM']['residue_number'].values
134 ins = self.pdb.df['ATOM']['insertion'].values
135 chain = self.pdb.df['ATOM']['chain_id'].values
136 ch_resno = ['{0}{1}_{2}'.format(i, j, c) for i, j, c in zip(resno, ins, chain)]
137
138 # Create new resno
139 count = 0
140 prev_resid = None
141 new_resno = []
142
143 # Renumber
144 for r in ch_resno:
145 if r != prev_resid:
146 count += 1
147 new_resno.append(count)
148 prev_resid = r
149 elif r == prev_resid:
150 new_resno.append(count)
151 prev_resid = r
152
153 # Update pdb
154 new_pdb = cp.deepcopy(self.pdb)
155 new_pdb.df['ATOM']['chain_id'] = self.chain
156 new_pdb.df['ATOM']['residue_number'] = new_resno
157 new_pdb.df['ATOM']['insertion'] = '' # Remove insertions
158
159 # Create dictionary with old and new numbering
160 resno_dict = dict(zip(unique(ch_resno), unique(new_resno)))
161
162 # Collect HV residues with the new numbering
163 hv_list = []
164
165 # Heavy chain
166 for hv_heavy in self.loops_h:
167 if hv_heavy in resno_dict.keys():
168 hv_list.append(resno_dict[hv_heavy])
169
170 # Light chain
171 for hv_light in self.loops_l:
172 if hv_light in resno_dict.keys():
173 hv_list.append(resno_dict[hv_light])
174
175 hv_list.sort()
176 return hv_list, new_pdb
177
178
179 if __name__ == '__main__':
180
181 # Get inputs
182 pdb_file, out_file, chain_id = check_input()
183
184 # Renumber pdb file and get HV residues
185 pdb_format = AbHaddockFormat(pdb_file, chain_id)
186 hv_resno, pdb_ren = pdb_format.ab_format()
187
188 # Write pdb into a file
189 pdb_ren.to_pdb(path=out_file, records=['ATOM'], append_newline=True)
190
191 # Print HV residues
192 print(','.join(map(str, hv_resno)))