Mercurial > repos > bgruening > sygma

--- a/sygma.xml	Mon Sep 30 17:37:58 2019 -0400
+++ b/sygma.xml	Sun Mar 15 17:18:19 2020 +0000
@@ -1,27 +1,31 @@
-<tool id="sygma" name="Generate possible metabolites with SyGMa" version="@VERSION@">
+<tool id="sygma" name="Generate possible metabolites with SyGMa" version="@VERSION@+galaxy1">
+    <description>by performing common reactions on one or more parent molecule(s)</description>
     <macros>
         <token name="@VERSION@">1.1.1</token>
     </macros>
-    <description>by performing common reactions on one or more parent molecule(s)</description>
     <requirements>
         <requirement type="package" version="@VERSION@">sygma</requirement>
         <requirement type="package" version="2019.03.4">rdkit</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        python '$__tool_directory__/sygma_metabolites.py'
-            -i '$input'
-            --iformat '$input.ext'
-            -o '$output'
-            --phase1 '$phase1'
+        python '$__tool_directory__/sygma_metabolites.py'
+            -i '$input'
+            --iformat '$input.ext'
+            -o '$output'
+            --phase1 '$phase1'
             --phase2 '$phase2'
+            #if $detailed
+              --detailed
+            #end if
     ]]></command>
     <inputs>
         <param type="data" name="input" format="smi,sdf" label="Parent molecule(s)" help="Upload an SDF or SMILES file."/>
         <param type="integer" name="phase1" value="1" min="0" label="Number of reaction cycles to apply for phase 1" help="Phase 1 metabolism rules include different types of oxidation, reduction, hydrolysis and condensation reactions."/>
         <param type="integer" name="phase2" value="1" min="0" label="Number of reaction cycles to apply for phase 2" help="Phase 2 metabolism rules include several conjugation reactions, e.g. with glucuronyl, sulfate, methyl or acetyl."/>
+        <param type="boolean" name="detailed" label="Include more detailed information" help="Will return detailed information including molecular formula of the chemical structure and transformation pathway involved."/>
     </inputs>
     <outputs>
-        <data name="output" format="smi"/>
+        <data name="output" format="tsv"/>
     </outputs>
     <tests>
         <test>
@@ -31,6 +35,13 @@
             <output name="output" file="o.smi"/>
         </test>
         <test>
+            <param name="input" value="i.smi" ftype="smi"/>
+            <param name="phase1" value="1"/>
+            <param name="phase2" value="1"/>
+            <param name="detailed" value="true"/>
+            <output name="output" file="o_detailed.smi"/>
+        </test>
+        <test>
             <param name="input" value="i.sdf" ftype="sdf"/>
             <param name="phase1" value="2"/>
             <param name="phase2" value="0"/>
@@ -44,8 +55,8 @@

 **What this tool does**

-SyGMa (Systematic Generation of potential Metabolites) is a tool to generate
-possible metabolic products of an input parent structure. The tool provides
+SyGMa (Systematic Generation of potential Metabolites) is a tool to generate
+possible metabolic products of an input parent structure. The tool provides
 two rulesets to cover both phase 1 and 2 metabolism.

 -----
@@ -54,11 +65,11 @@

 **Input**

-A file in SMILES or SDF format. Files may contain multiple molecule
-entries; in this case outputs are distinguished by the code included in the
+A file in SMILES or SDF format. Files may contain multiple molecule
+entries; in this case outputs are distinguished by the code included in the
 output file (e.g. SYGMA0MOL0 vs SYGMA1MOL0).

-The number of reaction cycles to be performed for both phase 1 and phase 2
+The number of reaction cycles to be performed for both phase 1 and phase 2
 metabolism should also be specified.

 -----
@@ -67,25 +78,32 @@

  **Output**

-For each molecule in the input file, a SMILES file is produced containing
-SMILES strings of the metabolite outputs, a generated ID code, and an empirical
-probability score (corresponding to an estimated probability that a product is
+A single tab-separated values (tsv) file covering all molecules in the input file.
+Columns contain a generated ID code (compound_id), SMILES strings of the
+metabolite outputs (smiles) and an empirical probability score (sygma_score;
+the calculated score corresponds to an estimated probability that a product is
 actually metabolically produced in humans). The first line is always the parent
-molecule itself::
+molecule itself, following the header row.

-    Oc1ccccc1   SYGMA0MOL0    1.0
-    O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O    SYGMA0MOL1 0.25
-    O=S(=O)(O)Oc1ccccc1 SYGMA0MOL2  0.119
-    Oc1ccc(O)cc1    SYGMA0MOL3 0.056
-    COc1ccccc1  SYGMA0MOL4   0.054
-    Oc1ccccc1O  SYGMA0MOL5   0.032
-    O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O SYGMA0MOL6  0.014
-    O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O   SYGMA0MOL7    0.008
-    O=S(=O)(O)Oc1ccc(O)cc1  SYGMA0MOL8   0.00666
-    O=S(=O)(O)Oc1ccccc1O    SYGMA0MOL9 0.00381
-    COc1ccc(O)cc1   SYGMA0MOL10   0.00302
-    COc1ccccc1O SYGMA0MOL11 0.00173
++----------------------------------+-------------+-------------+
+| smiles                           | compound_id | sygma_score |
++----------------------------------+-------------+-------------+
+| Oc1ccccc1                        | SYGMA0MOL0  | 1.0         |
++----------------------------------+-------------+-------------+
+| O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O | SYGMA0MOL1  | 0.25        |
++----------------------------------+-------------+-------------+

+If the option for more detailed output is selected, additional columns include
+molecular formula (molecular_formula) of the chemical structure, number of
+reactions (sygma_n) and transformation pathway (sygma_pathway) involved.
+
++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+
+| smiles                           | compound_id | sygma_score  | molecular_formula | sygma_n  | sygma_pathway                         |
++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+
+| Oc1ccccc1                        | SYGMA0MOL0  | 1.0          | C6H6O             | 1        | parent                                |
++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+
+| O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O | SYGMA0MOL1  | 0.25         | C12H14O7          | 2        | O-glucuronidation_(aromatic_hydroxyl) |
++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+

     ]]></help>
     <citations>
--- a/sygma_metabolites.py	Mon Sep 30 17:37:58 2019 -0400
+++ b/sygma_metabolites.py	Sun Mar 15 17:18:19 2020 +0000
@@ -15,8 +15,8 @@
     """
     if ext == 'sdf':
         return [n for n in SDMolSupplier(filename)]
-    with open(filename) as f:
-        mols = f.read().split('\n')
+    with open(filename) as f:
+        mols = f.read().split('\n')
     if ext == 'smi' or ext == 'inchi':
         return [Chem.MolFromSmiles(mol, sanitize=True) for mol in mols if mol != '']

@@ -29,30 +29,53 @@
         [sygma.ruleset['phase2'], int(phase2_cycles)]])
     metabolic_tree = scenario.run(parent)
     metabolic_tree.calc_scores()
-    return metabolic_tree.to_smiles()
+    return metabolic_tree.to_list()


 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
-    parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.')
-    parser.add_argument("--iformat", help="Specify the input file format.")
-    parser.add_argument("--phase1", help="Number of phase1 cycles.")
-    parser.add_argument("--phase2", help="Number of phase2 cycles.")
+    parser.add_argument("-i", "--infile", required=True, help="Path to the input file.")
+    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
+    parser.add_argument("--iformat", required=True, help="Specify the input file format.")
+    parser.add_argument("--phase1", required=True, help="Number of phase1 cycles.")
+    parser.add_argument("--phase2", required=True, help="Number of phase2 cycles.")
+    parser.add_argument("--detailed", dest="detailed",
+        action="store_true", help="Returns more detailed output")
     args = parser.parse_args()

     mols = mol_supplier(args.infile, args.iformat)
-    outp = np.zeros((0,3))
+    if args.detailed:
+        outp = np.zeros((0,6))
+    else:
+        outp = np.zeros((0,3))
     for n in range(len(mols)):
-        metabs = np.array(predict_metabolites(mols[n], args.phase1, args.phase2))
-        metabs = np.column_stack((
-            metabs[:,0],  # SMILES
-            ['SYGMA{}MOL{}'.format(n, m) for m in range(metabs.shape[0])],  # SMILES label
-            np.round(np.array(metabs[:,1], dtype=float), decimals=5)  # score rounded to 5 dp
-        ))
-        outp = np.vstack((outp, metabs))
-    np.savetxt(args.outfile, outp, fmt="%s")
-
+        metabs = predict_metabolites(mols[n], args.phase1, args.phase2)
+        for entry in range(len(metabs)):
+            smiles = Chem.MolToSmiles(metabs[entry]['SyGMa_metabolite'])
+            if args.detailed:
+                out = np.column_stack((
+                    smiles, # SMILES
+                    'SYGMA{}MOL{}'.format(n, entry), # SMILES label
+                    np.round(np.array(metabs[entry]['SyGMa_score'], dtype=float),
+                        decimals=5), # score rounded to 5 dp
+                    Chem.rdMolDescriptors.CalcMolFormula(Chem.MolFromSmiles(smiles)), # Molecular formula
+                    len(metabs[entry]["SyGMa_pathway"].split("\n")), # SyGMa_n Sygma pathway length
+                    metabs[entry]["SyGMa_pathway"].replace("\n", "") # SyGMa pathway
+                ))
+            else:
+                out = np.column_stack((
+                    smiles, # SMILES
+                    'SYGMA{}MOL{}'.format(n, entry), # SMILES label
+                    np.round(np.array(metabs[entry]['SyGMa_score'], dtype=float),
+                        decimals=5) # score rounded to 5 dp
+                ))
+            outp = np.vstack((outp, out))
+    if args.detailed:
+        np.savetxt(args.outfile, outp, fmt="%s", delimiter="\t",
+            header="smiles\tcompound_id\tsygma_score\tmolecular_formula\tsygma_n\tsygma_pathway", comments="")
+    else:
+        np.savetxt(args.outfile, outp, fmt="%s", delimiter="\t",
+            header="smiles\tcompound_id\tsygma_score", comments="")

 if __name__ == "__main__":
     main()
--- a/test-data/o.smi	Mon Sep 30 17:37:58 2019 -0400
+++ b/test-data/o.smi	Sun Mar 15 17:18:19 2020 +0000
@@ -1,16 +1,17 @@
-Oc1ccccc1 SYGMA0MOL0 1.0
-O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O SYGMA0MOL1 0.25
-O=S(=O)(O)Oc1ccccc1 SYGMA0MOL2 0.119
-Oc1ccc(O)cc1 SYGMA0MOL3 0.056
-COc1ccccc1 SYGMA0MOL4 0.054
-Oc1ccccc1O SYGMA0MOL5 0.032
-O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O SYGMA0MOL6 0.014
-O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O SYGMA0MOL7 0.008
-O=S(=O)(O)Oc1ccc(O)cc1 SYGMA0MOL8 0.00666
-O=S(=O)(O)Oc1ccccc1O SYGMA0MOL9 0.00381
-COc1ccc(O)cc1 SYGMA0MOL10 0.00302
-COc1ccccc1O SYGMA0MOL11 0.00173
-CCOCC SYGMA1MOL0 1.0
-CCO SYGMA1MOL1 0.087
-CCOC1OC(C(=O)O)C(O)C(O)C1O SYGMA1MOL2 0.00879
-CCOS(=O)(=O)O SYGMA1MOL3 0.00157
+smiles	compound_id	sygma_score
+Oc1ccccc1	SYGMA0MOL0	1.0
+O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O	SYGMA0MOL1	0.25
+O=S(=O)(O)Oc1ccccc1	SYGMA0MOL2	0.119
+Oc1ccc(O)cc1	SYGMA0MOL3	0.056
+COc1ccccc1	SYGMA0MOL4	0.054
+Oc1ccccc1O	SYGMA0MOL5	0.032
+O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O	SYGMA0MOL6	0.014
+O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O	SYGMA0MOL7	0.008
+O=S(=O)(O)Oc1ccc(O)cc1	SYGMA0MOL8	0.00666
+O=S(=O)(O)Oc1ccccc1O	SYGMA0MOL9	0.00381
+COc1ccc(O)cc1	SYGMA0MOL10	0.00302
+COc1ccccc1O	SYGMA0MOL11	0.00173
+CCOCC	SYGMA1MOL0	1.0
+CCO	SYGMA1MOL1	0.087
+CCOC1OC(C(=O)O)C(O)C(O)C1O	SYGMA1MOL2	0.00879
+CCOS(=O)(=O)O	SYGMA1MOL3	0.00157
--- a/test-data/o2.smi	Mon Sep 30 17:37:58 2019 -0400
+++ b/test-data/o2.smi	Sun Mar 15 17:18:19 2020 +0000
@@ -1,32 +1,33 @@
-CC(=O)Oc1ccccc1C(=O)O SYGMA0MOL0 1.0
-O=C(O)c1ccccc1O SYGMA0MOL1 0.529
-CC(=O)Oc1cc(O)ccc1C(=O)O SYGMA0MOL2 0.061
-CC(=O)Oc1ccc(O)cc1C(=O)O SYGMA0MOL3 0.056
-O=C(CO)Oc1ccccc1C(=O)O SYGMA0MOL4 0.049
-O=C(O)c1ccc(O)cc1O SYGMA0MOL5 0.03227
-CC(=O)Oc1c(O)cccc1C(=O)O SYGMA0MOL6 0.032
-O=C(O)c1cc(O)ccc1O SYGMA0MOL7 0.02962
-CC(=O)Oc1ccccc1 SYGMA0MOL8 0.023
-O=C(O)c1cccc(O)c1O SYGMA0MOL9 0.01693
-O=C(O)C(=O)Oc1ccccc1C(=O)O SYGMA0MOL10 0.016
-O=C(O)CO SYGMA0MOL11 0.01333
-Oc1ccccc1 SYGMA0MOL12 0.01217
-O=C(O)C(=O)O SYGMA0MOL13 0.00435
-O=C(O)Oc1ccccc1C(=O)O SYGMA0MOL14 0.00355
-CC(=O)Oc1cc(O)c(O)cc1C(=O)O SYGMA0MOL15 0.00342
-O=C(CO)Oc1cc(O)ccc1C(=O)O SYGMA0MOL16 0.00299
-O=C(CO)Oc1ccc(O)cc1C(=O)O SYGMA0MOL17 0.00274
-CC(=O)Oc1c(C(=O)O)ccc(O)c1O SYGMA0MOL18 0.00195
-CC(=O)Oc1c(O)cc(O)cc1C(=O)O SYGMA0MOL19 0.00179
-CC(=O)Oc1c(O)ccc(O)c1C(=O)O SYGMA0MOL20 0.00179
-O=C(CO)Oc1c(O)cccc1C(=O)O SYGMA0MOL21 0.00157
-CC(=O)Oc1cccc(O)c1 SYGMA0MOL22 0.0014
-CC(=O)Oc1ccc(O)cc1 SYGMA0MOL23 0.00129
-O=C(CO)Oc1ccccc1 SYGMA0MOL24 0.00113
-O=C(O)C(=O)Oc1cc(O)ccc1C(=O)O SYGMA0MOL25 0.00098
-O=C(O)C(=O)Oc1ccc(O)cc1C(=O)O SYGMA0MOL26 0.0009
-CC(=O)Oc1ccccc1O SYGMA0MOL27 0.00074
-CC(=O)Oc1ccc(O)c(O)c1C(=O)O SYGMA0MOL28 0.00073
-O=C(O)C(=O)Oc1c(O)cccc1C(=O)O SYGMA0MOL29 0.00051
-O=COc1ccccc1C(=O)O SYGMA0MOL30 0.00037
-O=C(O)C(=O)Oc1ccccc1 SYGMA0MOL31 0.00037
+smiles	compound_id	sygma_score
+CC(=O)Oc1ccccc1C(=O)O	SYGMA0MOL0	1.0
+O=C(O)c1ccccc1O	SYGMA0MOL1	0.529
+CC(=O)Oc1cc(O)ccc1C(=O)O	SYGMA0MOL2	0.061
+CC(=O)Oc1ccc(O)cc1C(=O)O	SYGMA0MOL3	0.056
+O=C(CO)Oc1ccccc1C(=O)O	SYGMA0MOL4	0.049
+O=C(O)c1ccc(O)cc1O	SYGMA0MOL5	0.03227
+CC(=O)Oc1c(O)cccc1C(=O)O	SYGMA0MOL6	0.032
+O=C(O)c1cc(O)ccc1O	SYGMA0MOL7	0.02962
+CC(=O)Oc1ccccc1	SYGMA0MOL8	0.023
+O=C(O)c1cccc(O)c1O	SYGMA0MOL9	0.01693
+O=C(O)C(=O)Oc1ccccc1C(=O)O	SYGMA0MOL10	0.016
+O=C(O)CO	SYGMA0MOL11	0.01333
+Oc1ccccc1	SYGMA0MOL12	0.01217
+O=C(O)C(=O)O	SYGMA0MOL13	0.00435
+O=C(O)Oc1ccccc1C(=O)O	SYGMA0MOL14	0.00355
+CC(=O)Oc1cc(O)c(O)cc1C(=O)O	SYGMA0MOL15	0.00342
+O=C(CO)Oc1cc(O)ccc1C(=O)O	SYGMA0MOL16	0.00299
+O=C(CO)Oc1ccc(O)cc1C(=O)O	SYGMA0MOL17	0.00274
+CC(=O)Oc1c(C(=O)O)ccc(O)c1O	SYGMA0MOL18	0.00195
+CC(=O)Oc1c(O)cc(O)cc1C(=O)O	SYGMA0MOL19	0.00179
+CC(=O)Oc1c(O)ccc(O)c1C(=O)O	SYGMA0MOL20	0.00179
+O=C(CO)Oc1c(O)cccc1C(=O)O	SYGMA0MOL21	0.00157
+CC(=O)Oc1cccc(O)c1	SYGMA0MOL22	0.0014
+CC(=O)Oc1ccc(O)cc1	SYGMA0MOL23	0.00129
+O=C(CO)Oc1ccccc1	SYGMA0MOL24	0.00113
+O=C(O)C(=O)Oc1cc(O)ccc1C(=O)O	SYGMA0MOL25	0.00098
+O=C(O)C(=O)Oc1ccc(O)cc1C(=O)O	SYGMA0MOL26	0.0009
+CC(=O)Oc1ccccc1O	SYGMA0MOL27	0.00074
+CC(=O)Oc1ccc(O)c(O)c1C(=O)O	SYGMA0MOL28	0.00073
+O=C(O)C(=O)Oc1c(O)cccc1C(=O)O	SYGMA0MOL29	0.00051
+O=COc1ccccc1C(=O)O	SYGMA0MOL30	0.00037
+O=C(O)C(=O)Oc1ccccc1	SYGMA0MOL31	0.00037
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/o_detailed.smi	Sun Mar 15 17:18:19 2020 +0000
@@ -0,0 +1,17 @@
+smiles	compound_id	sygma_score	molecular_formula	sygma_n	sygma_pathway
+Oc1ccccc1	SYGMA0MOL0	1.0	C6H6O	1	parent;
+O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O	SYGMA0MOL1	0.25	C12H14O7	2	O-glucuronidation_(aromatic_hydroxyl);
+O=S(=O)(O)Oc1ccccc1	SYGMA0MOL2	0.119	C6H6O4S	2	sulfation_(aromatic_hydroxyl);
+Oc1ccc(O)cc1	SYGMA0MOL3	0.056	C6H6O2	2	aromatic_hydroxylation_(para_to_oxygen);
+COc1ccccc1	SYGMA0MOL4	0.054	C7H8O	2	methylation_(aromatic_OH);
+Oc1ccccc1O	SYGMA0MOL5	0.032	C6H6O2	2	aromatic_hydroxylation_(ortho_to_oxygen);
+O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O	SYGMA0MOL6	0.014	C12H14O8	3	aromatic_hydroxylation_(para_to_oxygen); O-glucuronidation_(aromatic_hydroxyl);
+O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O	SYGMA0MOL7	0.008	C12H14O8	3	aromatic_hydroxylation_(ortho_to_oxygen); O-glucuronidation_(aromatic_hydroxyl);
+O=S(=O)(O)Oc1ccc(O)cc1	SYGMA0MOL8	0.00666	C6H6O5S	3	aromatic_hydroxylation_(para_to_oxygen); sulfation_(aromatic_hydroxyl);
+O=S(=O)(O)Oc1ccccc1O	SYGMA0MOL9	0.00381	C6H6O5S	3	aromatic_hydroxylation_(ortho_to_oxygen); sulfation_(aromatic_hydroxyl);
+COc1ccc(O)cc1	SYGMA0MOL10	0.00302	C7H8O2	3	aromatic_hydroxylation_(para_to_oxygen); methylation_(aromatic_OH);
+COc1ccccc1O	SYGMA0MOL11	0.00173	C7H8O2	3	aromatic_hydroxylation_(ortho_to_oxygen); methylation_(aromatic_OH);
+CCOCC	SYGMA1MOL0	1.0	C4H10O	1	parent;
+CCO	SYGMA1MOL1	0.087	C2H6O	2	O-dealkylation_(aliphatic);
+CCOC1OC(C(=O)O)C(O)C(O)C1O	SYGMA1MOL2	0.00879	C8H14O7	3	O-dealkylation_(aliphatic); O-glucuronidation_(aliphatic_hydroxyl);
+CCOS(=O)(=O)O	SYGMA1MOL3	0.00157	C2H6O4S	3	O-dealkylation_(aliphatic); sulfation_(aliphatic_hydroxyl);