Mercurial > repos > ggricourt > data_manager_bigg
annotate data_manager/metanetx_chem_prop_fetcher.py @ 13:c1d4f14dc768 draft
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
author | ggricourt |
---|---|
date | Wed, 09 Mar 2022 14:22:54 +0000 |
parents | |
children | 8e8a9e51f1d7 |
rev | line source |
---|---|
13
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
1 import argparse |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
2 import json |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
3 import os |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
4 import shutil |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
5 import sys |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
6 import tempfile |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
7 import pandas as pd |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
8 try: |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
9 # For Python 3.0 and later |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
10 from urllib.request import Request, urlopen |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
11 except ImportError: |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
12 # Fall back to Python 2 imports |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
13 from urllib2 import Request, urlopen |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
14 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
15 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
# Root of the MetaNetX FTP tree; the script appends the requested version
# and the file name to it to build the download URL.
METANETX_URL = "https://www.metanetx.org/ftp/"
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
17 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
18 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
def url_download(url, path):
    """Download ``url`` and store the response body in the file ``path``.

    The payload is streamed in 1 KiB chunks, so it is never held in memory
    as a whole.

    Args:
        url: Address to fetch; it is wrapped in a ``Request`` and passed to
            ``urlopen``.
        path: Destination file. It is opened in binary write mode, so any
            existing content is replaced.

    Raises:
        SystemExit: On any failure while fetching or writing. The exit
            message is the text of the underlying exception.
    """
    # Function-scope import: ``closing`` is only needed here.
    from contextlib import closing

    try:
        # ``closing`` rather than ``with urlopen(...)``: the response object
        # returned by the Python 2 ``urllib2`` fallback is not a context
        # manager, whereas ``closing`` only needs a ``close()`` method.
        with closing(urlopen(Request(url))) as response:
            with open(path, "wb") as dst:
                shutil.copyfileobj(response, dst, 2**10)
    except Exception as e:
        # Script-level boundary: report the error text and stop the program.
        sys.exit(str(e))
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
31 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
32 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
def clean_metanetx_file(path):
    """Rewrite the file ``path`` in place so its header is a plain first row.

    Every line starting with ``#`` is dropped, except the last such line
    seen before the first data line: it is written first, with all ``#``
    characters removed, and serves as the header. All other lines are
    copied unchanged. If no ``#`` line precedes the first data line, the
    data lines are written without a header.

    Args:
        path: File to clean; it is overwritten with the cleaned content.
    """
    last_comment = None
    header_done = False
    # delete=False: the temporary file is closed before it is copied back by
    # name, and is removed explicitly in the ``finally`` clause below.
    tmp = tempfile.NamedTemporaryFile("w", delete=False)
    try:
        with tmp as fod, open(path) as fid:
            for line in fid:
                if line.startswith("#"):
                    # Only remembered; it is written if it turns out to be
                    # the last comment before the data starts.
                    last_comment = line
                    continue
                if not header_done:
                    if last_comment is not None:
                        fod.write(last_comment.replace("#", ""))
                    header_done = True
                fod.write(line)
        shutil.copyfile(tmp.name, path)
    finally:
        os.remove(tmp.name)
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
47 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
48 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
def records_chem_prop(path):
    """Load a tab-separated chem_prop table and turn it into record dicts.

    The file must provide the columns ``ID``, ``name``, ``reference``,
    ``formula``, ``charge``, ``mass``, ``InChI``, ``InChIKey`` and
    ``SMILES``.

    Args:
        path: Tab-separated file whose first row is the header.

    Returns:
        A list with one dict per row and the keys ``value`` (the ``ID``
        column), ``name`` (formatted as ``"<ID>: <name> (<formula>)"``) and
        ``inchi`` (the ``InChI`` column).
    """
    df = pd.read_csv(path, sep="\t")
    # Build the display label column by column instead of using a row-wise
    # ``apply``, which is slow on large tables.
    df["name"] = [
        "%s: %s (%s)" % fields
        for fields in zip(df["ID"], df["name"], df["formula"])
    ]
    df = df.drop(
        columns=["reference", "formula", "charge", "mass", "InChIKey", "SMILES"]
    )
    # ``rename`` returns a new frame; the result has to be kept, otherwise
    # the records still carry the original ``ID``/``InChI`` keys.
    df = df.rename(columns={"ID": "value", "InChI": "inchi"})
    return df.to_dict("records")
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
55 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
56 |
c1d4f14dc768
"planemo upload for repository https://github.com/brsynth/synbiocad-galaxy-wrappers commit 7db54c0555a12ecb8b3f756032228c54fe028e0a-dirty"
ggricourt
parents:
diff
changeset
|
if __name__ == "__main__":
    # Command line: the version to fetch and the JSON file that is first
    # read for the job parameters and finally overwritten with the result.
    parser = argparse.ArgumentParser()
    pinput = parser.add_mutually_exclusive_group(required=True)
    pinput.add_argument("--version", help="Version to download")
    # Required: the file is opened unconditionally below, so a missing value
    # should be reported by argparse rather than fail inside ``open``.
    parser.add_argument("--out-file", required=True, help="JSON output file")
    args = parser.parse_args()

    # Init.
    data_manager_json = {"data_tables": {}}
    with open(args.out_file) as fh:
        params = json.load(fh)

    workdir = params["output_data"][0]["extra_files_path"]
    # Tolerate a directory that already exists (e.g. when the job is rerun).
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    # Strip the trailing slash of the base URL so that the join does not
    # produce an empty path segment ("ftp//<version>/...").
    url = "/".join([METANETX_URL.rstrip("/"), args.version, "chem_prop.tsv"])

    # The temporary file is shared by name between the three steps and is
    # removed when the ``with`` block ends.
    with tempfile.NamedTemporaryFile() as ftmp:
        # Download the table.
        url_download(url, ftmp.name)

        # Clean header
        clean_metanetx_file(ftmp.name)

        # Select records.
        records = records_chem_prop(ftmp.name)

    # Write data.
    data_manager_json["data_tables"]["metanetx_chem_prop"] = records
    with open(args.out_file, "w") as fh:
        json.dump(data_manager_json, fh, sort_keys=True)