Mercurial > repos > bgruening > xchem_transfs_scoring
comparison server/transfs.py @ 0:69fe50d03b80 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/transfs commit d9a9e2f0e12fe9d2c37f632d99f2164df577b4af"
author | bgruening |
---|---|
date | Fri, 27 Mar 2020 13:11:29 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:69fe50d03b80 |
---|---|
1 # Create dir containing ligands.sdf and protein.pdb | |
2 # Enter docker container like this: | |
3 # docker run -it --rm --gpus all -v $PWD:/root/train/fragalysis_test_files/work:Z informaticsmatters/deep-app-ubuntu-1604:latest bash | |
4 # | |
5 # Now inside the container run like this: | |
6 # mkdir /tmp/work | |
7 # rm -rf /tmp/work/* && python3 work/transfs.py -i work/test-data/ligands.sdf -r work/test-data/receptor.pdb -d 2 -w /tmp/work | |
8 # | |
9 # If testing with no GPU you can use the --mock option to generate random scores | |
10 # | |
11 # Start container for testing like this: | |
12 # docker run -it --rm -v $PWD:$PWD:Z -w $PWD informaticsmatters/deep-app-ubuntu-1604:latest bash | |
13 # Inside container test like this: | |
14 # mkdir /tmp/work | |
15 # cd chemicaltoolbox/xchem-deep | |
16 # rm -rf /tmp/work/* && python3 transfs.py -i test-data/ligands.sdf -r test-data/receptor.pdb -d 2 -w /tmp/work --mock | |
17 # | |
18 | |
19 import argparse, os, sys, math | |
20 import subprocess | |
21 import random | |
22 from openbabel import pybel | |
23 | |
24 types_file_name = 'inputs.types' | |
25 types_file_name = 'inputs.types' | |
26 predict_file_name = 'predictions.txt' | |
27 work_dir = '.' | |
28 paths = None | |
29 inputs_protein = [] | |
30 inputs_ligands = [] | |
31 | |
32 | |
33 def log(*args, **kwargs): | |
34 """Log output to STDERR | |
35 """ | |
36 print(*args, file=sys.stderr, ** kwargs) | |
37 | |
38 def write_raw_inputs(receptor_pdb, ligands_sdf, distance): | |
39 """ | |
40 Analyses the PDB file for waters that clash with each ligand in the SDF and writes out: | |
41 1. a PDB file named like receptor-123-543.pdb where the numeric parts are the waters that have been omitted | |
42 2. a corresponding directory named like receptor-123-543 | |
43 3. an SDF named like receptor-123-543/ligands.sdf containing those ligands that correspond to that receptor. | |
44 :param receptor_pdb: A PDB file without the ligand but with the crystallographic waters | |
45 :param ligands_sdf: A SDF with the docked poses | |
46 :param distance: The distance to consider when removing waters. Only heavy atoms in the ligand are considered. | |
47 :return: | |
48 """ | |
49 | |
50 global work_dir | |
51 global inputs_protein | |
52 global inputs_ligands | |
53 global paths | |
54 | |
55 | |
56 log("Writing data to", work_dir) | |
57 if not os.path.isdir(work_dir): | |
58 os.mkdir(work_dir) | |
59 | |
60 receptor_file = os.path.basename(receptor_pdb) | |
61 | |
62 sdf_writers = {} | |
63 paths = [] | |
64 | |
65 # read the receptor once as we'll need to process it many times | |
66 with open(receptor_pdb, 'r') as f: | |
67 lines = f.readlines() | |
68 | |
69 count = 0 | |
70 for mol in pybel.readfile("sdf", ligands_sdf): | |
71 count += 1 | |
72 if count % 50000 == 0: | |
73 log('Processed', count) | |
74 | |
75 try: | |
76 # print("Processing mol", mol.title) | |
77 | |
78 clone = pybel.Molecule(mol) | |
79 clone.removeh() | |
80 | |
81 coords = [] | |
82 for atom in clone.atoms: | |
83 coords.append(atom.coords) | |
84 | |
85 watnumcode = '' | |
86 | |
87 # getting receptor without waters that will clash with ligand | |
88 new_receptor_pdb = [] | |
89 for line in lines: | |
90 if line[17:20] == 'HOH': | |
91 x, y, z = float(line[30:39]), float(line[39:46]), float(line[46:55]) | |
92 distances = [] | |
93 for i in coords: | |
94 distances.append(math.sqrt((x-i[0])**2 + (y-i[1])**2 + (z-i[2])**2)) # calculates distance based on cartesian coordinates | |
95 if min(distances) > distance: # if all distances are larger than 2.0A, then molecule makes it to new file | |
96 new_receptor_pdb.append(line) | |
97 else: | |
98 watnum = line[23:28].strip() | |
99 # print("Skipped water " + watnum) | |
100 watnumcode += '-' + watnum | |
101 if line[17:20] != 'LIG' and line[17:20] != 'HOH': # ligand lines are also removed | |
102 new_receptor_pdb.append(line) | |
103 | |
104 | |
105 name = receptor_file[0:-4] + watnumcode | |
106 # print('CODE:', name) | |
107 mol.data['TransFSReceptor'] = name | |
108 | |
109 | |
110 if watnumcode not in sdf_writers: | |
111 # we've not yet encountered this combination of waters so need to write the PDB file | |
112 | |
113 dir = os.path.sep.join([work_dir, name]) | |
114 log('WRITING to :', dir) | |
115 | |
116 os.mkdir(dir) | |
117 paths.append(dir) | |
118 | |
119 sdf_writers[watnumcode] = pybel.Outputfile("sdf", os.path.sep.join([dir, 'ligands.sdf'])) | |
120 | |
121 # writing into new pdb file | |
122 receptor_writer = open(os.path.sep.join([work_dir, name + '.pdb']), "w+") | |
123 for line in new_receptor_pdb: | |
124 receptor_writer.write(str(line)) | |
125 receptor_writer.close() | |
126 | |
127 # write the molecule to the corresponding SDF file | |
128 sdf_out = sdf_writers[watnumcode] | |
129 sdf_out.write(mol) | |
130 | |
131 except Exception as e: | |
132 log('Failed to handle molecule: '+ str(e)) | |
133 continue | |
134 | |
135 for writer in sdf_writers.values(): | |
136 writer.close() | |
137 | |
138 log('Wrote', count, 'molecules and', len(sdf_writers), 'proteins') | |
139 | |
140 def write_inputs(protein_file, ligands_file, distance): | |
141 """ | |
142 Runs gninatyper on the proteins and ligands and generates the input.types file that will tell the predictor | |
143 what ligands correspond to what proteins. | |
144 | |
145 :param protein_file: | |
146 :param ligands_file: | |
147 :param distance: | |
148 :return: | |
149 """ | |
150 | |
151 global types_file_name | |
152 global work_dir | |
153 global inputs_protein | |
154 global inputs_ligands | |
155 global prepared_ligands | |
156 | |
157 write_raw_inputs(protein_file, ligands_file, distance) | |
158 | |
159 types_path = os.path.sep.join([work_dir, types_file_name]) | |
160 log("Writing types to", types_path) | |
161 with open(types_path, 'w') as types_file: | |
162 | |
163 for path in paths: | |
164 | |
165 log("Gninatyping ligands in", path) | |
166 | |
167 ligands_dir = os.path.sep.join([path, 'ligands']) | |
168 os.mkdir(ligands_dir) | |
169 | |
170 cmd1 = ['gninatyper', os.path.sep.join([path, 'ligands.sdf']), os.path.sep.join([ligands_dir, 'ligand'])] | |
171 log('CMD:', cmd1) | |
172 exit_code = subprocess.call(cmd1) | |
173 log("Status:", exit_code) | |
174 if exit_code: | |
175 raise Exception("Failed to write ligands") | |
176 ligand_gninatypes = os.listdir(os.path.sep.join([path, 'ligands'])) | |
177 | |
178 log("Gninatyping proteins in", path) | |
179 | |
180 proteins_dir = os.path.sep.join([path, 'proteins']) | |
181 os.mkdir(proteins_dir) | |
182 | |
183 cmd2 = ['gninatyper', path + '.pdb', os.path.sep.join([proteins_dir, 'protein'])] | |
184 log('CMD:', cmd2) | |
185 exit_code = subprocess.call(cmd2) | |
186 log("Status:", exit_code) | |
187 if exit_code: | |
188 raise Exception("Failed to write proteins") | |
189 protein_gninatypes = os.listdir(os.path.sep.join([path, 'proteins'])) | |
190 | |
191 num_proteins = 0 | |
192 num_ligands = 0 | |
193 | |
194 for protein in protein_gninatypes: | |
195 num_proteins += 1 | |
196 num_ligands = 0 | |
197 inputs_protein.append(protein) | |
198 inputs_protein.append(os.path.sep.join([path, 'proteins', protein])) | |
199 for ligand in ligand_gninatypes: | |
200 num_ligands += 1 | |
201 log("Handling", protein, ligand) | |
202 inputs_ligands.append(os.path.sep.join([path, 'ligands', ligand])) | |
203 line = "0 {0}{3}proteins{3}{1} {0}{3}ligands{3}{2}\n".format(path, protein, ligand, os.path.sep) | |
204 types_file.write(line) | |
205 | |
206 return num_proteins, num_ligands | |
207 | |
208 | |
209 def generate_predictions_filename(work_dir, predict_file_name): | |
210 return "{0}{1}{2}".format(work_dir, os.path.sep, predict_file_name) | |
211 | |
212 | |
213 def run_predictions(): | |
214 global types_file_name | |
215 global predict_file_name | |
216 global work_dir | |
217 # python3 scripts/predict.py -m resources/dense.prototxt -w resources/weights.caffemodel -i work_0/test_set.types >> work_0/caffe_output/predictions.txt | |
218 cmd1 = ['python3', '/train/fragalysis_test_files/scripts/predict.py', | |
219 '-m', '/train/fragalysis_test_files/resources/dense.prototxt', | |
220 '-w', '/train/fragalysis_test_files/resources/weights.caffemodel', | |
221 '-i', os.path.sep.join([work_dir, types_file_name]), | |
222 '-o', os.path.sep.join([work_dir, predict_file_name])] | |
223 log("CMD:", cmd1) | |
224 subprocess.call(cmd1) | |
225 | |
226 | |
227 def mock_predictions(): | |
228 global work_dir | |
229 global predict_file_name | |
230 | |
231 log("WARNING: generating mock results instead of running on GPU") | |
232 outfile = generate_predictions_filename(work_dir, predict_file_name) | |
233 count = 0 | |
234 with open(outfile, 'w') as predictions: | |
235 for path in paths: | |
236 log("Reading", path) | |
237 protein_gninatypes = os.listdir(os.path.sep.join([path, 'proteins'])) | |
238 ligand_gninatypes = os.listdir(os.path.sep.join([path, 'ligands'])) | |
239 for protein in protein_gninatypes: | |
240 for ligand in ligand_gninatypes: | |
241 count += 1 | |
242 score = random.random() | |
243 line = "{0} | 0 {1}{4}proteins{4}{2} {1}{4}ligands{4}{3}\n".format(score, path, protein, ligand, | |
244 os.path.sep) | |
245 # log("Writing", line) | |
246 predictions.write(line) | |
247 | |
248 log('Wrote', count, 'mock predictions') | |
249 | |
250 | |
251 def read_predictions(): | |
252 global predict_file_name | |
253 global work_dir | |
254 scores = {} | |
255 with open("{0}{1}{2}".format(work_dir, os.path.sep, predict_file_name), 'r') as input: | |
256 for line in input: | |
257 # log(line) | |
258 tokens = line.split() | |
259 if len(tokens) == 5 and tokens[1] == '|': | |
260 # log(len(tokens), tokens[0], tokens[3], tokens[4]) | |
261 record_no = inputs_ligands.index(tokens[4]) | |
262 if record_no is not None: | |
263 # log(record_no, tokens[0]) | |
264 scores[record_no] = tokens[0] | |
265 log("Found", len(scores), "scores") | |
266 return scores | |
267 | |
268 | |
269 def patch_scores_sdf(outfile, scores): | |
270 | |
271 counter = 0 | |
272 sdf_path = "{0}{1}{2}".format(work_dir, os.path.sep, outfile) | |
273 log("Writing results to {0}".format(sdf_path)) | |
274 sdf_file = pybel.Outputfile("sdf", sdf_path) | |
275 | |
276 for path in paths: | |
277 for mol in pybel.readfile("sdf", os.path.sep.join([path, 'ligands.sdf'])): | |
278 if counter in scores: | |
279 score = scores[counter] | |
280 # og("Score for record {0} is {1}".format(counter, score)) | |
281 mol.data['TransFSScore'] = score | |
282 sdf_file.write(mol) | |
283 else: | |
284 log("No score found for record", counter) | |
285 counter += 1 | |
286 sdf_file.close() | |
287 | |
288 | |
289 def execute(ligands_sdf, protein, outfile, distance, mock=False): | |
290 | |
291 write_inputs(protein, ligands_sdf, distance) | |
292 if mock: | |
293 mock_predictions() | |
294 else: | |
295 run_predictions() | |
296 scores = read_predictions() | |
297 patch_scores_sdf(outfile, scores) | |
298 | |
299 | |
300 def main(): | |
301 global work_dir | |
302 | |
303 parser = argparse.ArgumentParser(description='XChem deep - pose scoring') | |
304 | |
305 parser.add_argument('-i', '--input', help="SDF containing the poses to score)") | |
306 parser.add_argument('-r', '--receptor', help="Receptor file for scoring (PDB format)") | |
307 parser.add_argument('-d', '--distance', type=float, default=2.0, help="Cuttoff for removing waters") | |
308 parser.add_argument('-o', '--outfile', default='output.sdf', help="File name for results") | |
309 parser.add_argument('-w', '--work-dir', default=".", help="Working directory") | |
310 parser.add_argument('--mock', action='store_true', help='Generate mock scores rather than run on GPU') | |
311 | |
312 args = parser.parse_args() | |
313 log("XChem deep args: ", args) | |
314 | |
315 work_dir = args.work_dir | |
316 | |
317 execute(args.input, args.receptor, args.outfile, args.distance, mock=args.mock) | |
318 | |
319 | |
320 if __name__ == "__main__": | |
321 main() | |
322 |