comparison server/transfs.py @ 0:69fe50d03b80 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/transfs commit d9a9e2f0e12fe9d2c37f632d99f2164df577b4af"
author bgruening
date Fri, 27 Mar 2020 13:11:29 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:69fe50d03b80
1 # Create dir containing ligands.sdf and protein.pdb
2 # Enter docker container like this:
3 # docker run -it --rm --gpus all -v $PWD:/root/train/fragalysis_test_files/work:Z informaticsmatters/deep-app-ubuntu-1604:latest bash
4 #
5 # Now inside the container run like this:
6 # mkdir /tmp/work
7 # rm -rf /tmp/work/* && python3 work/transfs.py -i work/test-data/ligands.sdf -r work/test-data/receptor.pdb -d 2 -w /tmp/work
8 #
9 # If testing with no GPU you can use the --mock option to generate random scores
10 #
11 # Start container for testing like this:
12 # docker run -it --rm -v $PWD:$PWD:Z -w $PWD informaticsmatters/deep-app-ubuntu-1604:latest bash
13 # Inside container test like this:
14 # mkdir /tmp/work
15 # cd chemicaltoolbox/xchem-deep
16 # rm -rf /tmp/work/* && python3 transfs.py -i test-data/ligands.sdf -r test-data/receptor.pdb -d 2 -w /tmp/work --mock
17 #
18
19 import argparse, os, sys, math
20 import subprocess
21 import random
22 from openbabel import pybel
23
24 types_file_name = 'inputs.types'
25 types_file_name = 'inputs.types'
26 predict_file_name = 'predictions.txt'
27 work_dir = '.'
28 paths = None
29 inputs_protein = []
30 inputs_ligands = []
31
32
33 def log(*args, **kwargs):
34 """Log output to STDERR
35 """
36 print(*args, file=sys.stderr, ** kwargs)
37
38 def write_raw_inputs(receptor_pdb, ligands_sdf, distance):
39 """
40 Analyses the PDB file for waters that clash with each ligand in the SDF and writes out:
41 1. a PDB file named like receptor-123-543.pdb where the numeric parts are the waters that have been omitted
42 2. a corresponding directory named like receptor-123-543
43 3. an SDF named like receptor-123-543/ligands.sdf containing those ligands that correspond to that receptor.
44 :param receptor_pdb: A PDB file without the ligand but with the crystallographic waters
45 :param ligands_sdf: A SDF with the docked poses
46 :param distance: The distance to consider when removing waters. Only heavy atoms in the ligand are considered.
47 :return:
48 """
49
50 global work_dir
51 global inputs_protein
52 global inputs_ligands
53 global paths
54
55
56 log("Writing data to", work_dir)
57 if not os.path.isdir(work_dir):
58 os.mkdir(work_dir)
59
60 receptor_file = os.path.basename(receptor_pdb)
61
62 sdf_writers = {}
63 paths = []
64
65 # read the receptor once as we'll need to process it many times
66 with open(receptor_pdb, 'r') as f:
67 lines = f.readlines()
68
69 count = 0
70 for mol in pybel.readfile("sdf", ligands_sdf):
71 count += 1
72 if count % 50000 == 0:
73 log('Processed', count)
74
75 try:
76 # print("Processing mol", mol.title)
77
78 clone = pybel.Molecule(mol)
79 clone.removeh()
80
81 coords = []
82 for atom in clone.atoms:
83 coords.append(atom.coords)
84
85 watnumcode = ''
86
87 # getting receptor without waters that will clash with ligand
88 new_receptor_pdb = []
89 for line in lines:
90 if line[17:20] == 'HOH':
91 x, y, z = float(line[30:39]), float(line[39:46]), float(line[46:55])
92 distances = []
93 for i in coords:
94 distances.append(math.sqrt((x-i[0])**2 + (y-i[1])**2 + (z-i[2])**2)) # calculates distance based on cartesian coordinates
95 if min(distances) > distance: # if all distances are larger than 2.0A, then molecule makes it to new file
96 new_receptor_pdb.append(line)
97 else:
98 watnum = line[23:28].strip()
99 # print("Skipped water " + watnum)
100 watnumcode += '-' + watnum
101 if line[17:20] != 'LIG' and line[17:20] != 'HOH': # ligand lines are also removed
102 new_receptor_pdb.append(line)
103
104
105 name = receptor_file[0:-4] + watnumcode
106 # print('CODE:', name)
107 mol.data['TransFSReceptor'] = name
108
109
110 if watnumcode not in sdf_writers:
111 # we've not yet encountered this combination of waters so need to write the PDB file
112
113 dir = os.path.sep.join([work_dir, name])
114 log('WRITING to :', dir)
115
116 os.mkdir(dir)
117 paths.append(dir)
118
119 sdf_writers[watnumcode] = pybel.Outputfile("sdf", os.path.sep.join([dir, 'ligands.sdf']))
120
121 # writing into new pdb file
122 receptor_writer = open(os.path.sep.join([work_dir, name + '.pdb']), "w+")
123 for line in new_receptor_pdb:
124 receptor_writer.write(str(line))
125 receptor_writer.close()
126
127 # write the molecule to the corresponding SDF file
128 sdf_out = sdf_writers[watnumcode]
129 sdf_out.write(mol)
130
131 except Exception as e:
132 log('Failed to handle molecule: '+ str(e))
133 continue
134
135 for writer in sdf_writers.values():
136 writer.close()
137
138 log('Wrote', count, 'molecules and', len(sdf_writers), 'proteins')
139
140 def write_inputs(protein_file, ligands_file, distance):
141 """
142 Runs gninatyper on the proteins and ligands and generates the input.types file that will tell the predictor
143 what ligands correspond to what proteins.
144
145 :param protein_file:
146 :param ligands_file:
147 :param distance:
148 :return:
149 """
150
151 global types_file_name
152 global work_dir
153 global inputs_protein
154 global inputs_ligands
155 global prepared_ligands
156
157 write_raw_inputs(protein_file, ligands_file, distance)
158
159 types_path = os.path.sep.join([work_dir, types_file_name])
160 log("Writing types to", types_path)
161 with open(types_path, 'w') as types_file:
162
163 for path in paths:
164
165 log("Gninatyping ligands in", path)
166
167 ligands_dir = os.path.sep.join([path, 'ligands'])
168 os.mkdir(ligands_dir)
169
170 cmd1 = ['gninatyper', os.path.sep.join([path, 'ligands.sdf']), os.path.sep.join([ligands_dir, 'ligand'])]
171 log('CMD:', cmd1)
172 exit_code = subprocess.call(cmd1)
173 log("Status:", exit_code)
174 if exit_code:
175 raise Exception("Failed to write ligands")
176 ligand_gninatypes = os.listdir(os.path.sep.join([path, 'ligands']))
177
178 log("Gninatyping proteins in", path)
179
180 proteins_dir = os.path.sep.join([path, 'proteins'])
181 os.mkdir(proteins_dir)
182
183 cmd2 = ['gninatyper', path + '.pdb', os.path.sep.join([proteins_dir, 'protein'])]
184 log('CMD:', cmd2)
185 exit_code = subprocess.call(cmd2)
186 log("Status:", exit_code)
187 if exit_code:
188 raise Exception("Failed to write proteins")
189 protein_gninatypes = os.listdir(os.path.sep.join([path, 'proteins']))
190
191 num_proteins = 0
192 num_ligands = 0
193
194 for protein in protein_gninatypes:
195 num_proteins += 1
196 num_ligands = 0
197 inputs_protein.append(protein)
198 inputs_protein.append(os.path.sep.join([path, 'proteins', protein]))
199 for ligand in ligand_gninatypes:
200 num_ligands += 1
201 log("Handling", protein, ligand)
202 inputs_ligands.append(os.path.sep.join([path, 'ligands', ligand]))
203 line = "0 {0}{3}proteins{3}{1} {0}{3}ligands{3}{2}\n".format(path, protein, ligand, os.path.sep)
204 types_file.write(line)
205
206 return num_proteins, num_ligands
207
208
209 def generate_predictions_filename(work_dir, predict_file_name):
210 return "{0}{1}{2}".format(work_dir, os.path.sep, predict_file_name)
211
212
213 def run_predictions():
214 global types_file_name
215 global predict_file_name
216 global work_dir
217 # python3 scripts/predict.py -m resources/dense.prototxt -w resources/weights.caffemodel -i work_0/test_set.types >> work_0/caffe_output/predictions.txt
218 cmd1 = ['python3', '/train/fragalysis_test_files/scripts/predict.py',
219 '-m', '/train/fragalysis_test_files/resources/dense.prototxt',
220 '-w', '/train/fragalysis_test_files/resources/weights.caffemodel',
221 '-i', os.path.sep.join([work_dir, types_file_name]),
222 '-o', os.path.sep.join([work_dir, predict_file_name])]
223 log("CMD:", cmd1)
224 subprocess.call(cmd1)
225
226
227 def mock_predictions():
228 global work_dir
229 global predict_file_name
230
231 log("WARNING: generating mock results instead of running on GPU")
232 outfile = generate_predictions_filename(work_dir, predict_file_name)
233 count = 0
234 with open(outfile, 'w') as predictions:
235 for path in paths:
236 log("Reading", path)
237 protein_gninatypes = os.listdir(os.path.sep.join([path, 'proteins']))
238 ligand_gninatypes = os.listdir(os.path.sep.join([path, 'ligands']))
239 for protein in protein_gninatypes:
240 for ligand in ligand_gninatypes:
241 count += 1
242 score = random.random()
243 line = "{0} | 0 {1}{4}proteins{4}{2} {1}{4}ligands{4}{3}\n".format(score, path, protein, ligand,
244 os.path.sep)
245 # log("Writing", line)
246 predictions.write(line)
247
248 log('Wrote', count, 'mock predictions')
249
250
251 def read_predictions():
252 global predict_file_name
253 global work_dir
254 scores = {}
255 with open("{0}{1}{2}".format(work_dir, os.path.sep, predict_file_name), 'r') as input:
256 for line in input:
257 # log(line)
258 tokens = line.split()
259 if len(tokens) == 5 and tokens[1] == '|':
260 # log(len(tokens), tokens[0], tokens[3], tokens[4])
261 record_no = inputs_ligands.index(tokens[4])
262 if record_no is not None:
263 # log(record_no, tokens[0])
264 scores[record_no] = tokens[0]
265 log("Found", len(scores), "scores")
266 return scores
267
268
269 def patch_scores_sdf(outfile, scores):
270
271 counter = 0
272 sdf_path = "{0}{1}{2}".format(work_dir, os.path.sep, outfile)
273 log("Writing results to {0}".format(sdf_path))
274 sdf_file = pybel.Outputfile("sdf", sdf_path)
275
276 for path in paths:
277 for mol in pybel.readfile("sdf", os.path.sep.join([path, 'ligands.sdf'])):
278 if counter in scores:
279 score = scores[counter]
280 # og("Score for record {0} is {1}".format(counter, score))
281 mol.data['TransFSScore'] = score
282 sdf_file.write(mol)
283 else:
284 log("No score found for record", counter)
285 counter += 1
286 sdf_file.close()
287
288
289 def execute(ligands_sdf, protein, outfile, distance, mock=False):
290
291 write_inputs(protein, ligands_sdf, distance)
292 if mock:
293 mock_predictions()
294 else:
295 run_predictions()
296 scores = read_predictions()
297 patch_scores_sdf(outfile, scores)
298
299
300 def main():
301 global work_dir
302
303 parser = argparse.ArgumentParser(description='XChem deep - pose scoring')
304
305 parser.add_argument('-i', '--input', help="SDF containing the poses to score)")
306 parser.add_argument('-r', '--receptor', help="Receptor file for scoring (PDB format)")
307 parser.add_argument('-d', '--distance', type=float, default=2.0, help="Cuttoff for removing waters")
308 parser.add_argument('-o', '--outfile', default='output.sdf', help="File name for results")
309 parser.add_argument('-w', '--work-dir', default=".", help="Working directory")
310 parser.add_argument('--mock', action='store_true', help='Generate mock scores rather than run on GPU')
311
312 args = parser.parse_args()
313 log("XChem deep args: ", args)
314
315 work_dir = args.work_dir
316
317 execute(args.input, args.receptor, args.outfile, args.distance, mock=args.mock)
318
319
320 if __name__ == "__main__":
321 main()
322