Mercurial > repos > bgruening > sdf_to_tab
changeset 5:48c536413a2f draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 4d0bfcf37bfbedafc7ff0672dfe452766ca8a606"
author | bgruening |
---|---|
date | Wed, 17 Feb 2021 12:58:34 +0000 |
parents | 67b9a5b22bf0 |
children | 3aa5a03c3b32 |
files | dimorphite_dl.py rdkit_descriptors.py sdf_to_tab.py test-data/mol.pdb test-data/mol_pdb_charges.tab test-data/rdkit_descriptors_result1.tab test-data/rdkit_descriptors_subset.tab |
diffstat | 7 files changed, 124 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
--- a/dimorphite_dl.py Tue Jul 28 12:05:30 2020 +0000 +++ b/dimorphite_dl.py Wed Feb 17 12:58:34 2021 +0000 @@ -1,3 +1,4 @@ +# flake8: noqa # Copyright 2018 Jacob D. Durrant # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,7 +19,6 @@ """ from __future__ import print_function -import copy import os import argparse import sys
--- a/rdkit_descriptors.py Tue Jul 28 12:05:30 2020 +0000 +++ b/rdkit_descriptors.py Wed Feb 17 12:58:34 2021 +0000 @@ -1,44 +1,49 @@ #!/usr/bin/env python -from rdkit.Chem import Descriptors -from rdkit import Chem -import sys, os, re import argparse import inspect +import sys -def get_supplier( infile, format = 'smiles' ): +from rdkit import Chem +from rdkit.Chem import Descriptors + + +def get_supplier(infile, format='smiles'): """ - Returns a generator over a SMILES or InChI file. Every element is of RDKit + Returns a generator over a SMILES or InChI file. Every element is of RDKit molecule and has its original string as _Name property. """ with open(infile) as handle: for line in handle: line = line.strip() if format == 'smiles': - mol = Chem.MolFromSmiles( line, sanitize=True ) + mol = Chem.MolFromSmiles(line, sanitize=True) elif format == 'inchi': - mol = Chem.inchi.MolFromInchi( line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False ) + mol = Chem.inchi.MolFromInchi(line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False) if mol is None: yield False else: - mol.SetProp( '_Name', line.split('\t')[0] ) + mol.SetProp('_Name', line.split('\t')[0]) yield mol + def get_rdkit_descriptor_functions(): """ Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function) """ - ret = [ (name, f) for name, f in inspect.getmembers( Descriptors ) if inspect.isfunction( f ) and not name.startswith( '_' ) ] + ret = [(name, f) for name, f in inspect.getmembers(Descriptors) if inspect.isfunction(f) and not name.startswith('_')] + # some which are not in the official Descriptors module we need to add manually + ret.extend([('FormalCharge', Chem.GetFormalCharge), ('SSSR', Chem.GetSSSR)]) ret.sort() return ret -def descriptors( mol, functions ): +def descriptors(mol, functions): """ Calculates the descriptors of a given molecule. """ for name, function in functions: - yield (name, function( mol )) + yield (name, function(mol)) if __name__ == "__main__": @@ -46,31 +51,44 @@ parser.add_argument('-i', '--infile', required=True, help='Path to the input file.') parser.add_argument("--iformat", help="Specify the input file format.") - parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), - default=sys.stdout, help="path to the result file, default it sdtout") + parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), + default=sys.stdout, + help="path to the result file, default is stdout") + + parser.add_argument('-s', '--select', default=None, + help="select a subset of comma-separated descriptors to use") parser.add_argument("--header", dest="header", action="store_true", - default=False, - help="Write header line.") + default=False, + help="Write header line.") args = parser.parse_args() if args.iformat == 'sdf': - supplier = Chem.SDMolSupplier( args.infile ) - elif args.iformat =='smi': - supplier = get_supplier( args.infile, format = 'smiles' ) + supplier = Chem.SDMolSupplier(args.infile) + elif args.iformat == 'smi': + supplier = get_supplier(args.infile, format='smiles') elif args.iformat == 'inchi': - supplier = get_supplier( args.infile, format = 'inchi' ) + supplier = get_supplier(args.infile, format='inchi') + elif args.iformat == 'pdb': + supplier = [Chem.MolFromPDBFile(args.infile)] + elif args.iformat == 'mol2': + supplier = [Chem.MolFromMol2File(args.infile)] functions = get_rdkit_descriptor_functions() + if args.select and args.select != 'None': + selected = args.select.split(',') + functions = [(name, f) for name, f in functions if name in selected] if args.header: - args.outfile.write( '%s\n' % '\t'.join( ['MoleculeID'] + [name for name, f in functions] ) ) + args.outfile.write('%s\n' % '\t'.join(['MoleculeID'] + [name for name, f in functions])) for mol in supplier: if not mol: continue - descs = descriptors( mol, functions ) - molecule_id = mol.GetProp("_Name") - args.outfile.write( "%s\n" % '\t'.join( [molecule_id]+ [str(round(res, 6)) for name, res in descs] ) ) - + descs = descriptors(mol, functions) + try: + molecule_id = mol.GetProp("_Name") + except KeyError: + molecule_id = Chem.MolToSmiles(mol) + args.outfile.write("%s\n" % '\t'.join([molecule_id] + [str(round(res, 6)) for name, res in descs]))
--- a/sdf_to_tab.py Tue Jul 28 12:05:30 2020 +0000 +++ b/sdf_to_tab.py Wed Feb 17 12:58:34 2021 +0000 @@ -1,8 +1,10 @@ #!/usr/bin/env python3 import argparse + import pandas as pd from rdkit import Chem + def sdf_to_tab(vars): mols = Chem.SDMolSupplier(vars.inp, sanitize=False) df = pd.DataFrame() # for output @@ -29,6 +31,7 @@ sorted_cols = sorted(df.columns.values.tolist()) df.to_csv(vars.out, sep='\t', header=vars.header, columns=sorted_cols) + def main(): parser = argparse.ArgumentParser(description="Convert SDF to tabular") parser.add_argument('--inp', '-i', help="The input file", required=True) @@ -41,7 +44,7 @@ parser.add_argument('--name', '-n', action='store_true', help="Include molecule name in output.") sdf_to_tab(parser.parse_args()) - + if __name__ == "__main__": main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mol.pdb Wed Feb 17 12:58:34 2021 +0000 @@ -0,0 +1,72 @@ +COMPND CNCC(O)CCCc1ccccc1 +AUTHOR GENERATED BY OPEN BABEL 3.1.0 +HETATM 1 C UNL 1 9.206 6.617 23.375 1.00 0.00 C +HETATM 2 N UNL 1 9.288 5.239 22.843 1.00 0.00 N1+ +HETATM 3 C UNL 1 9.901 4.245 23.787 1.00 0.00 C +HETATM 4 C UNL 1 9.552 2.837 23.275 1.00 0.00 C +HETATM 5 O UNL 1 10.280 2.666 22.057 1.00 0.00 O +HETATM 6 C UNL 1 9.877 1.780 24.331 1.00 0.00 C +HETATM 7 C UNL 1 9.398 0.387 23.922 1.00 0.00 C +HETATM 8 C UNL 1 10.424 -0.687 24.293 1.00 0.00 C +HETATM 9 C UNL 1 11.616 -0.605 23.380 1.00 0.00 C +HETATM 10 C UNL 1 11.773 -1.516 22.327 1.00 0.00 C +HETATM 11 C UNL 1 12.918 -1.487 21.533 1.00 0.00 C +HETATM 12 C UNL 1 13.916 -0.552 21.786 1.00 0.00 C +HETATM 13 C UNL 1 13.767 0.367 22.824 1.00 0.00 C +HETATM 14 C UNL 1 12.623 0.342 23.620 1.00 0.00 C +HETATM 15 H UNL 1 8.759 7.256 22.643 1.00 0.00 H +HETATM 16 H UNL 1 10.189 6.970 23.605 1.00 0.00 H +HETATM 17 H UNL 1 8.609 6.620 24.264 1.00 0.00 H +HETATM 18 H UNL 1 9.849 5.259 21.991 1.00 0.00 H +HETATM 19 H UNL 1 8.329 4.932 22.679 1.00 0.00 H +HETATM 20 H UNL 1 9.504 4.384 24.771 1.00 0.00 H +HETATM 21 H UNL 1 10.962 4.375 23.832 1.00 0.00 H +HETATM 22 H UNL 1 8.505 2.722 23.087 1.00 0.00 H +HETATM 23 H UNL 1 11.228 2.771 22.229 1.00 0.00 H +HETATM 24 H UNL 1 9.401 2.052 25.249 1.00 0.00 H +HETATM 25 H UNL 1 10.941 1.741 24.440 1.00 0.00 H +HETATM 26 H UNL 1 9.242 0.370 22.864 1.00 0.00 H +HETATM 27 H UNL 1 8.487 0.178 24.443 1.00 0.00 H +HETATM 28 H UNL 1 9.974 -1.653 24.199 1.00 0.00 H +HETATM 29 H UNL 1 10.746 -0.530 25.301 1.00 0.00 H +HETATM 30 H UNL 1 11.037 -2.214 22.138 1.00 0.00 H +HETATM 31 H UNL 1 13.025 -2.159 20.758 1.00 0.00 H +HETATM 32 H UNL 1 14.769 -0.538 21.204 1.00 0.00 H +HETATM 33 H UNL 1 14.504 1.066 23.003 1.00 0.00 H +HETATM 34 H UNL 1 12.517 1.022 24.389 1.00 0.00 H +CONECT 1 2 15 16 17 +CONECT 2 1 3 18 19 +CONECT 3 2 4 20 21 +CONECT 4 3 5 6 22 +CONECT 5 4 23 +CONECT 6 4 7 24 25 +CONECT 7 6 8 26 27 +CONECT 8 7 9 28 29 +CONECT 9 8 10 10 14 +CONECT 10 9 9 11 30 +CONECT 11 10 12 12 31 +CONECT 12 11 11 13 32 +CONECT 13 12 14 14 33 +CONECT 14 9 13 13 34 +CONECT 15 1 +CONECT 16 1 +CONECT 17 1 +CONECT 18 2 +CONECT 19 2 +CONECT 20 3 +CONECT 21 3 +CONECT 22 4 +CONECT 23 5 +CONECT 24 6 +CONECT 25 6 +CONECT 26 7 +CONECT 27 7 +CONECT 28 8 +CONECT 29 8 +CONECT 30 10 +CONECT 31 11 +CONECT 32 12 +CONECT 33 13 +CONECT 34 14 +MASTER 0 0 0 0 0 0 0 0 34 0 34 0 +END
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mol_pdb_charges.tab Wed Feb 17 12:58:34 2021 +0000 @@ -0,0 +1,1 @@ +CNCC(O)CCCc1ccccc1 1
--- a/test-data/rdkit_descriptors_result1.tab Tue Jul 28 12:05:30 2020 +0000 +++ b/test-data/rdkit_descriptors_result1.tab Wed Feb 17 12:58:34 2021 +0000 @@ -1,2 +1,2 @@ -MoleculeID BalabanJ BertzCT Chi0 Chi0n Chi0v Chi1 Chi1n Chi1v Chi2n Chi2v Chi3n Chi3v Chi4n Chi4v EState_VSA1 EState_VSA10 EState_VSA11 EState_VSA2 EState_VSA3 EState_VSA4 EState_VSA5 EState_VSA6 EState_VSA7 EState_VSA8 EState_VSA9 ExactMolWt FpDensityMorgan1 FpDensityMorgan2 FpDensityMorgan3 FractionCSP3 HallKierAlpha HeavyAtomCount HeavyAtomMolWt Ipc Kappa1 Kappa2 Kappa3 LabuteASA MaxAbsEStateIndex MaxAbsPartialCharge MaxEStateIndex MaxPartialCharge MinAbsEStateIndex MinAbsPartialCharge MinEStateIndex MinPartialCharge MolLogP MolMR MolWt NHOHCount NOCount NumAliphaticCarbocycles NumAliphaticHeterocycles NumAliphaticRings NumAromaticCarbocycles NumAromaticHeterocycles NumAromaticRings NumHAcceptors NumHDonors NumHeteroatoms NumRadicalElectrons NumRotatableBonds NumSaturatedCarbocycles NumSaturatedHeterocycles NumSaturatedRings NumValenceElectrons PEOE_VSA1 PEOE_VSA10 PEOE_VSA11 PEOE_VSA12 PEOE_VSA13 PEOE_VSA14 PEOE_VSA2 PEOE_VSA3 PEOE_VSA4 PEOE_VSA5 PEOE_VSA6 PEOE_VSA7 PEOE_VSA8 PEOE_VSA9 RingCount SMR_VSA1 SMR_VSA10 SMR_VSA2 SMR_VSA3 SMR_VSA4 SMR_VSA5 SMR_VSA6 SMR_VSA7 SMR_VSA8 SMR_VSA9 SlogP_VSA1 SlogP_VSA10 SlogP_VSA11 SlogP_VSA12 SlogP_VSA2 SlogP_VSA3 SlogP_VSA4 SlogP_VSA5 SlogP_VSA6 SlogP_VSA7 SlogP_VSA8 SlogP_VSA9 TPSA VSA_EState1 VSA_EState10 VSA_EState2 VSA_EState3 VSA_EState4 VSA_EState5 VSA_EState6 VSA_EState7 VSA_EState8 VSA_EState9 fr_Al_COO fr_Al_OH fr_Al_OH_noTert fr_ArN fr_Ar_COO fr_Ar_N fr_Ar_NH fr_Ar_OH fr_COO fr_COO2 fr_C_O fr_C_O_noCOO fr_C_S fr_HOCCN fr_Imine fr_NH0 fr_NH1 fr_NH2 fr_N_O fr_Ndealkylation1 fr_Ndealkylation2 fr_Nhpyrrole fr_SH fr_aldehyde fr_alkyl_carbamate fr_alkyl_halide fr_allylic_oxid fr_amide fr_amidine fr_aniline fr_aryl_methyl fr_azide fr_azo fr_barbitur fr_benzene fr_benzodiazepine fr_bicyclic fr_diazo fr_dihydropyridine fr_epoxide fr_ester fr_ether fr_furan fr_guanido fr_halogen fr_hdrzine fr_hdrzone fr_imidazole fr_imide fr_isocyan fr_isothiocyan fr_ketone fr_ketone_Topliss fr_lactam fr_lactone fr_methoxy fr_morpholine fr_nitrile fr_nitro fr_nitro_arom fr_nitro_arom_nonortho fr_nitroso fr_oxazole fr_oxime fr_para_hydroxylation fr_phenol fr_phenol_noOrthoHbond fr_phos_acid fr_phos_ester fr_piperdine fr_piperzine fr_priamide fr_prisulfonamd fr_pyridine fr_quatN fr_sulfide fr_sulfonamd fr_sulfone fr_term_acetylene fr_tetrazole fr_thiazole fr_thiocyan fr_thiophene fr_unbrch_alkane fr_urea qed -3037 2.370228 503.61088 12.413849 8.821565 10.333422 8.058551 5.008353 5.764282 3.722845 4.595717 2.463985 2.934179 1.596526 1.985926 0.0 10.213055 0.0 11.499024 27.592991 0.0 12.132734 24.265468 0.0 0.0 23.20188 268.005785 0.764706 1.176471 1.588235 0.076923 -1.38 17 259.047 6943.4452 12.086867 4.861181 2.842672 109.048439 9.683208 0.507662 9.683208 0.118709 0.147014 0.118709 0.147014 -0.507662 3.9954 69.0396 269.127 2 2 0 0 0 2 0 2 2 2 4 0 2 0 0 0 88 10.213055 11.499024 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 23.20188 47.525105 16.466088 0.0 2 10.213055 23.20188 0.0 0.0 0.0 6.420822 0.0 57.570372 0.0 11.499024 0.0 0.0 11.499024 23.20188 10.213055 6.420822 0.0 11.126903 36.398202 10.045267 0.0 0.0 40.46 0.0 11.70887 0.0 20.448487 1.29642 0.294029 9.600621 0.373796 0.0 0.0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.864713 +MoleculeID BalabanJ BertzCT Chi0 Chi0n Chi0v Chi1 Chi1n Chi1v Chi2n Chi2v Chi3n Chi3v Chi4n Chi4v EState_VSA1 EState_VSA10 EState_VSA11 EState_VSA2 EState_VSA3 EState_VSA4 EState_VSA5 EState_VSA6 EState_VSA7 EState_VSA8 EState_VSA9 ExactMolWt FormalCharge FpDensityMorgan1 FpDensityMorgan2 FpDensityMorgan3 FractionCSP3 HallKierAlpha HeavyAtomCount HeavyAtomMolWt Ipc Kappa1 Kappa2 Kappa3 LabuteASA MaxAbsEStateIndex MaxAbsPartialCharge MaxEStateIndex MaxPartialCharge MinAbsEStateIndex MinAbsPartialCharge MinEStateIndex MinPartialCharge MolLogP MolMR MolWt NHOHCount NOCount NumAliphaticCarbocycles NumAliphaticHeterocycles NumAliphaticRings NumAromaticCarbocycles NumAromaticHeterocycles NumAromaticRings NumHAcceptors NumHDonors NumHeteroatoms NumRadicalElectrons NumRotatableBonds NumSaturatedCarbocycles NumSaturatedHeterocycles NumSaturatedRings NumValenceElectrons PEOE_VSA1 PEOE_VSA10 PEOE_VSA11 PEOE_VSA12 PEOE_VSA13 PEOE_VSA14 PEOE_VSA2 PEOE_VSA3 PEOE_VSA4 PEOE_VSA5 PEOE_VSA6 PEOE_VSA7 PEOE_VSA8 PEOE_VSA9 RingCount SMR_VSA1 SMR_VSA10 SMR_VSA2 SMR_VSA3 SMR_VSA4 SMR_VSA5 SMR_VSA6 SMR_VSA7 SMR_VSA8 SMR_VSA9 SSSR SlogP_VSA1 SlogP_VSA10 SlogP_VSA11 SlogP_VSA12 SlogP_VSA2 SlogP_VSA3 SlogP_VSA4 SlogP_VSA5 SlogP_VSA6 SlogP_VSA7 SlogP_VSA8 SlogP_VSA9 TPSA VSA_EState1 VSA_EState10 VSA_EState2 VSA_EState3 VSA_EState4 VSA_EState5 VSA_EState6 VSA_EState7 VSA_EState8 VSA_EState9 fr_Al_COO fr_Al_OH fr_Al_OH_noTert fr_ArN fr_Ar_COO fr_Ar_N fr_Ar_NH fr_Ar_OH fr_COO fr_COO2 fr_C_O fr_C_O_noCOO fr_C_S fr_HOCCN fr_Imine fr_NH0 fr_NH1 fr_NH2 fr_N_O fr_Ndealkylation1 fr_Ndealkylation2 fr_Nhpyrrole fr_SH fr_aldehyde fr_alkyl_carbamate fr_alkyl_halide fr_allylic_oxid fr_amide fr_amidine fr_aniline fr_aryl_methyl fr_azide fr_azo fr_barbitur fr_benzene fr_benzodiazepine fr_bicyclic fr_diazo fr_dihydropyridine fr_epoxide fr_ester fr_ether fr_furan fr_guanido fr_halogen fr_hdrzine fr_hdrzone fr_imidazole fr_imide fr_isocyan fr_isothiocyan fr_ketone fr_ketone_Topliss fr_lactam fr_lactone fr_methoxy fr_morpholine fr_nitrile fr_nitro fr_nitro_arom fr_nitro_arom_nonortho fr_nitroso fr_oxazole fr_oxime fr_para_hydroxylation fr_phenol fr_phenol_noOrthoHbond fr_phos_acid fr_phos_ester fr_piperdine fr_piperzine fr_priamide fr_prisulfonamd fr_pyridine fr_quatN fr_sulfide fr_sulfonamd fr_sulfone fr_term_acetylene fr_tetrazole fr_thiazole fr_thiocyan fr_thiophene fr_unbrch_alkane fr_urea qed +3037 2.370228 503.61088 12.413849 8.821565 10.333422 8.058551 5.008353 5.764282 3.722845 4.595717 2.463985 2.934179 1.596526 1.985926 0.0 10.213055 0.0 11.499024 27.592991 0.0 12.132734 24.265468 0.0 0.0 23.20188 268.005785 0 0.764706 1.176471 1.588235 0.076923 -1.38 17 259.047 6943.4452 12.086867 4.861181 2.842672 109.048439 9.683208 0.507662 9.683208 0.118709 0.147014 0.118709 0.147014 -0.507662 3.9954 69.0396 269.127 2 2 0 0 0 2 0 2 2 2 4 0 2 0 0 0 88 10.213055 11.499024 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 23.20188 47.525105 16.466088 0.0 2 10.213055 23.20188 0.0 0.0 0.0 6.420822 0.0 57.570372 0.0 11.499024 2 0.0 0.0 11.499024 23.20188 10.213055 6.420822 0.0 11.126903 36.398202 10.045267 0.0 0.0 40.46 0.0 11.70887 0.0 20.448487 1.29642 0.294029 9.600621 0.373796 0.0 0.0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.864713