comparison data_manager/interproscan.py @ 0:3e344aedb267 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_interproscan commit 2f5d27a375fcc2e8d77914b3d9e402a9e2df2d97"
author iuc
date Mon, 15 Nov 2021 17:20:14 +0000
parents
children d9a238ff2bc3
comparison
equal deleted inserted replaced
-1:000000000000 0:3e344aedb267
1 #!/usr/bin/env python
2
3 import argparse
4 import hashlib
5 import json
6 import operator
7 import os
8 import re
9 import shutil
10 import subprocess
11 import sys
12 import tarfile
13
14 import requests
15
16
# Root of the GitHub REST API for the InterProScan repository (used to list tags).
GH_REPO_API = 'https://api.github.com/repos/ebi-pf-team/interproscan/'

# Full 64-bit release tarball on the EBI server, and its md5sum companion file.
# Both are fetched over HTTPS: a checksum retrieved over plain HTTP offers no
# protection, since whoever can alter the tarball in transit can alter it too.
DATA_URL = 'https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz'
MD5_URL = DATA_URL + '.md5'

# For tests: download a smaller archive containing *some* data
PARTIAL_URL = 'https://github.com/ebi-pf-team/interproscan/archive/{version}.tar.gz'
23
24
def list_tags(url=None):
    """Return the sorted list of InterProScan release tags found on GitHub.

    Every page of the tags listing is fetched by following the ``next``
    pagination link of each response.

    Args:
        url: Tags API page to start from. Defaults to the first page of the
            repository's tag listing (``GH_REPO_API + 'tags'``).

    Returns:
        Sorted list of tag names shaped like ``5.52-86.0``; tags with any
        other shape are ignored.

    Raises:
        requests.HTTPError: if GitHub answers with an error status.
    """
    # The pattern only accepts fixed-width version strings, which is what
    # makes the plain lexicographic sort below order them correctly.
    version_re = re.compile(r"^[0-9]\.[0-9]{2}-[0-9]{2}\.[0-9]$")
    next_url = url or GH_REPO_API + 'tags'

    tags = []
    while next_url:
        resp = requests.get(url=next_url)
        # An error payload (e.g. rate limiting) is a JSON object, not a list
        # of tags: fail here rather than with an obscure TypeError below.
        resp.raise_for_status()
        tags.extend(
            tag['name'] for tag in resp.json() if version_re.match(tag['name'])
        )
        next_url = resp.links.get('next', {}).get('url')

    return sorted(tags)
42
43
def download_file(url, dest):
    """Stream the content served at *url* into the local file *dest*.

    The HTTP status is checked before *dest* is opened, so nothing is written
    when the server answers with an error. The body is consumed in chunks of
    8192 bytes instead of being loaded in memory at once.
    """
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(dest, 'wb') as out:
            out.writelines(response.iter_content(chunk_size=8192))
50
51
def _md5sum(path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in chunks of *chunk_size* bytes so that a large tarball
    is never held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _run_initial_setup(data_dir):
    """Run InterProScan's initial_setup.py and return its exit code.

    A copy of the installed ``interproscan.properties`` is written to the
    current working directory with ``data.directory`` pointing at *data_dir*.

    Raises:
        RuntimeError: if ``interproscan.sh`` cannot be found in the PATH.
    """
    launcher = shutil.which("interproscan.sh")
    if launcher is None:
        raise RuntimeError("Could not find interproscan.sh in the PATH")
    # Resolve symlinks: the properties file and the setup script are looked up
    # next to the real launcher.
    ips_dir = os.path.dirname(os.path.realpath(launcher))

    # Write a temp properties file in work dir
    # NOTE(review): presumably initial_setup.py picks it up from there - confirm.
    with open(os.path.join(ips_dir, 'interproscan.properties'), 'r') as prop:
        prop_content = prop.read()
    # A callable replacement keeps any backslash in the path literal.
    prop_content = re.sub(
        r'^data\.directory=.*$',
        lambda _match: 'data.directory=%s' % data_dir,
        prop_content,
        flags=re.M,
    )
    with open('interproscan.properties', 'w') as prop:
        prop.write(prop_content)

    # Run the index command. The child inherits stdout/stderr, so its output
    # is shown directly; flush first to keep our own messages in order.
    sys.stdout.flush()
    return subprocess.call([os.path.join(ips_dir, 'initial_setup.py')], shell=False)


def main():
    """Download the data of an InterProScan release and register it in a data table.

    Steps: read the Galaxy data manager JSON file, pick the requested (or
    latest) release tag, download and unpack the tarball into the output
    directory, run initial_setup.py, then overwrite the JSON file with the
    updated data table content.

    Raises:
        RuntimeError: on an unknown version, a missing output directory, an
            unparsable or mismatching MD5 checksum, or a missing
            ``interproscan.sh``.
        SystemExit: with the exit code of initial_setup.py when it fails.
    """
    parser = argparse.ArgumentParser(description='Download data for InterProScan')
    parser.add_argument('--partial', dest='partial', action='store_true', help='Only download a small subset of data (for testing)')
    parser.add_argument('-v', '--version', help='Specify an InterProScan version (default: latest)')
    parser.add_argument("datatable_name")
    parser.add_argument("galaxy_datamanager_filename")

    args = parser.parse_args()

    with open(args.galaxy_datamanager_filename) as fh:
        config = json.load(fh)

    output_directory = config.get("output_data", [{}])[0].get("extra_files_path", None)
    if not output_directory:
        raise RuntimeError("No 'extra_files_path' found in '%s'" % args.galaxy_datamanager_filename)

    data_manager_dict = {"data_tables": config.get("data_tables", {})}
    table = data_manager_dict["data_tables"].setdefault(args.datatable_name, [])

    os.mkdir(output_directory)

    all_tags = list_tags()

    if args.version:
        if args.version not in all_tags:
            raise RuntimeError("Version '%s' is not valid" % args.version)
        tag = args.version
    else:
        if not all_tags:
            raise RuntimeError("No InterProScan release tag found")
        # list_tags() returns the tags sorted, the last one is the latest.
        tag = all_tags[-1]

    print("Will download data for InterProScan version: %s" % tag)

    print("Getting MD5 checksum:")
    md5 = requests.get(url=MD5_URL.format(version=tag)).text
    # Expected content is one md5sum-style line: "<32 hex digits> <file name>".
    # Any amount of whitespace is accepted between the two fields.
    md5_match = re.match(r"^([a-fA-F\d]{32})\s+interproscan-[0-9]\.[0-9]{2}-[0-9]{2}\.[0-9]-64-bit\.tar\.gz$", md5)
    if not md5_match:
        raise RuntimeError("Got invalid MD5 from the InterProScan FTP server: '%s'" % md5)
    print("%s" % md5)

    if args.partial:
        print("Downloading partial data tarball...")
        source_url = PARTIAL_URL.format(version=tag)
    else:
        print("Downloading data tarball...")
        source_url = DATA_URL.format(version=tag)
    dest_tar = os.path.join(output_directory, source_url.split('/')[-1])
    download_file(source_url, dest_tar)

    if not args.partial:
        # The published checksum describes the full tarball only; it cannot
        # match the partial archive, which comes from another server.
        print("Finished, now checking md5...")
        md5_computed = _md5sum(dest_tar)
        if md5_computed != md5_match.group(1).lower():
            raise RuntimeError("MD5 check failed: computed '%s', expected '%s'" % (md5_computed, md5))

    print("Ok, now extracting data...")
    with tarfile.open(dest_tar, "r:gz") as tar:
        tar.extractall(output_directory)

    extracted_dir = os.path.join(output_directory, 'interproscan-%s' % tag)
    data_dir = os.path.join(output_directory, 'data')
    if args.partial:
        print("Moving partial data files around...")
        shutil.move(os.path.join(extracted_dir, 'core/jms-implementation/support-mini-x86-32/data/'), data_dir)
    else:
        print("Moving data files around...")
        shutil.move(extracted_dir, data_dir)

    print("Done, removing tarball and unneeded files...")
    os.remove(dest_tar)
    # In full mode the extracted directory has just been moved to "data", so
    # there may be nothing left to delete.
    if os.path.isdir(extracted_dir):
        shutil.rmtree(extracted_dir)

    print("Running initial_setup.py (index hmm models)...")
    return_code = _run_initial_setup(data_dir)
    if return_code:
        print("Error running initial_setup.py.", file=sys.stderr)
        sys.exit(return_code)

    table.append(
        dict(
            value=tag,
            description="InterProScan %s" % tag,
            interproscan_version=tag,
            path=output_directory,
        )
    )

    print("Saving data table content...")

    table.sort(key=operator.itemgetter("value"), reverse=True)
    with open(args.galaxy_datamanager_filename, "w") as fh:
        json.dump(data_manager_dict, fh, indent=2, sort_keys=True)

    print("Finished.")


if __name__ == "__main__":
    main()