Mercurial > repos > recetox > biotransformer
view wrapper_biotransformer.py @ 3:6080aee7c4f6 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/biotransformer commit 5cdd2628a1a509b3e0ccc599eaab63d664bf031a"
author | recetox |
---|---|
date | Wed, 13 Jan 2021 11:17:53 +0000 |
parents | 362a66a3889c |
children | 77f693bb14ac |
line wrap: on
line source
import subprocess import sys import tempfile import re import pandas from openbabel import openbabel, pybel openbabel.obErrorLog.StopLogging() # function for translating inchi to smiles def InchiToSmiles(df): sm = [] for item in df['InChI']: tmp = pybel.readstring("inchi", item) sm.append(tmp.write("smi")) return(sm) executable = ["biotransformer"] # executable_r = ["Rscript", "inchi_to_smiles.r"] argv = sys.argv[1:] if "-icsv" in argv: icsv = argv.pop(argv.index("-icsv") + 1) argv.remove("-icsv") if "-ocsv" not in argv: sys.stderr.write("excpected -ocsv parameter\n") sys.exit(1) ocsv = argv.pop(argv.index("-ocsv") + 1) argv.remove("-ocsv") ocsv_dup = argv.pop(argv.index("-ocsvDup") + 1) argv.remove("-ocsvDup") ocsv_dup2 = argv.pop(argv.index("-ocsvDup2") + 1) argv.remove("-ocsvDup2") in_df = pandas.read_csv(icsv, header=None) out_df1 = pandas.DataFrame() # all results out_df2 = pandas.DataFrame() # filtered results based on 6 columns out_df3 = pandas.DataFrame() # filtered results based on 3 columns tmp2 = pandas.DataFrame() tmp3 = pandas.DataFrame() smList1 = [] # list with smiles string smList2 = [] smList3 = [] for _, (smiles,) in in_df.iterrows(): with tempfile.NamedTemporaryFile() as out: print("Working on compound: " + smiles) if not re.search(r'\.', smiles): subprocess.run(executable + argv + ["-ismi", smiles] + ["-ocsv", out.name]) try: tmp2 = pandas.read_csv(out.name) tmp3 = pandas.read_csv(out.name) tmp2.drop_duplicates(inplace=True, subset=["InChI", "InChIKey", "Synonyms", "Molecular formula", "Major Isotope Mass", "ALogP"]) tmp3.drop_duplicates(inplace=True, subset=["Molecular formula", "Major Isotope Mass", "ALogP"]) smList2.append([smiles] * tmp2.shape[0]) smList3.append([smiles] * tmp3.shape[0]) out_df1 = pandas.concat([out_df1, pandas.read_csv(out.name)]) out_df2 = pandas.concat([out_df2, tmp2]) out_df3 = pandas.concat([out_df3, tmp3]) smList1.append([smiles] * pandas.read_csv(out.name).shape[0]) except pandas.errors.EmptyDataError: continue else: print("ERROR: Input compound cannot be a mixture.") smList1 = sum(smList1, []) # merge sublists into one list smList2 = sum(smList2, []) smList3 = sum(smList3, []) out_df1.insert(0, "SMILES query", smList1) out_df1.drop_duplicates(inplace=True) out_df1.insert(1, "SMILES target", InchiToSmiles(out_df1)) out_df1.to_csv(ocsv) out_df2.insert(0, "SMILES query", smList2) out_df3.insert(0, "SMILES query", smList3) out_df2.drop_duplicates(inplace=True) out_df3.drop_duplicates(inplace=True) out_df2.insert(1, "SMILES target", InchiToSmiles(out_df2)) out_df3.insert(1, "SMILES target", InchiToSmiles(out_df3)) # out_df.drop_duplicates(inplace=True, subset=["InChI", "InChIKey", "Synonyms", "Molecular formula", "Major Isotope Mass", "ALogP"]) out_df2.to_csv(ocsv_dup) out_df3.to_csv(ocsv_dup2) else: # code = subprocess.run(executable + argv).returncode # sys.exit(code) subprocess.run(executable + argv) smile = argv.pop(argv.index("-ismi") + 1) tmp = pandas.DataFrame() out = argv.pop(argv.index("-ocsv") + 1) tmp = pandas.read_csv(out) # reads created output file tmp.insert(0, "SMILES query", smile) # add SMILES string for query tmp.insert(1, "SMILES target", InchiToSmiles(tmp)) # add SMILES string for target tmp.to_csv(out)