Mercurial > repos > pimarin > data_manager_bakta
comparison data_manager/bakta_build_database.py @ 27:2879a0e702d5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database commit 3bfd33ae9741216e50373ddd04914d82f9731883
| author | pimarin |
|---|---|
| date | Wed, 23 Aug 2023 14:38:16 +0000 |
| parents | 0408796bce2a |
| children |
comparison
equal
deleted
inserted
replaced
| 26:0408796bce2a | 27:2879a0e702d5 |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import hashlib | 2 import hashlib |
| 3 import json | 3 import json |
| 4 import os | 4 import os |
| 5 import re | |
| 5 import sys | 6 import sys |
| 6 import tarfile | 7 import tarfile |
| 7 from datetime import datetime | 8 from datetime import datetime |
| 8 from pathlib import Path | 9 from pathlib import Path |
| 9 | 10 |
| 14 class GetBaktaDatabaseInfo: | 15 class GetBaktaDatabaseInfo: |
| 15 """ | 16 """ |
| 16 Extract bakta database information to make a json file for data_manager | 17 Extract bakta database information to make a json file for data_manager |
| 17 """ | 18 """ |
| 18 | 19 |
| 19 def __init__(self, | 20 def __init__( |
| 20 data_table_name="bakta_database", | 21 self, |
| 21 db_name=Path.cwd().joinpath("db"), | 22 data_table_name="bakta_database", |
| 22 db_version="latest", | 23 db_version="latest", |
| 23 test_mode=False): | 24 tarball_name="db.tar.gz", |
| 25 test_mode=False, | |
| 26 ): | |
| 24 self.bakta_table_list = None | 27 self.bakta_table_list = None |
| 25 self.db_url = None | 28 self.db_url = None |
| 29 self.db_name = "bakta-db" | |
| 30 self.db_type = "" | |
| 26 self.data_table_entry = None | 31 self.data_table_entry = None |
| 27 self.data_table_name = data_table_name | 32 self.data_table_name = data_table_name |
| 28 self.db_name = db_name | 33 self.tar_name = tarball_name |
| 29 self.db_version = db_version | 34 self.db_version = db_version |
| 30 self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json' | 35 self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json" |
| 31 self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json' | 36 self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json" |
| 32 self.test_mode = test_mode | 37 self.test_mode = test_mode |
| 38 | |
| 39 def get_database_type(self): | |
| 40 self.light_db = bool(re.search(pattern="light", string=self.db_version)) | |
| 41 self.db_version = self.db_version.split(sep="_")[0] | |
| 42 if self.light_db: | |
| 43 self.db_type = "light" | |
| 44 self.tar_name = "db-light.tar.gz" | |
| 45 self.md5 = self.fetch_db_versions()["md5-light"] | |
| 46 else: | |
| 47 self.md5 = self.fetch_db_versions()["md5"] | |
| 33 | 48 |
| 34 def get_data_table_format(self): | 49 def get_data_table_format(self): |
| 35 """ | 50 """ |
| 36 Skeleton of a data_table format | 51 Skeleton of a data_table format |
| 37 return: a data table formated for json output | 52 return: a data table formated for json output |
| 38 """ | 53 """ |
| 39 self.data_table_entry = { | 54 self.data_table_entry = {"data_tables": {self.data_table_name: {}}} |
| 40 "data_tables": { | |
| 41 self.data_table_name: {} | |
| 42 } | |
| 43 } | |
| 44 return self.data_table_entry | 55 return self.data_table_entry |
| 45 | 56 |
| 46 def fetch_db_versions(self, db_version="latest"): | 57 def fetch_db_versions(self): |
| 47 """ | 58 """ |
| 48 List bakta database info related to the db_version selected | 59 List bakta database info related to the db_version selected |
| 49 """ | 60 """ |
| 50 if self.test_mode is True: | 61 |
| 62 if self.test_mode: | |
| 51 self.DB_VERSIONS_URL = self.DB_TEST_URL | 63 self.DB_VERSIONS_URL = self.DB_TEST_URL |
| 52 try: | 64 try: |
| 53 with requests.get(self.DB_VERSIONS_URL) as resp: | 65 with requests.get(self.DB_VERSIONS_URL) as resp: |
| 54 versions = json.loads(resp.content) | 66 versions = json.loads(resp.content) |
| 55 except IOError as e: | 67 except IOError as e: |
| 56 print(e, file=sys.stderr) | 68 print(e, file=sys.stderr) |
| 57 raise e | 69 raise e |
| 58 else: | 70 |
| 59 if db_version == "latest": | 71 if self.db_version == "latest": |
| 60 db_date_list = [] | 72 db_date_list = [] |
| 61 for db_dic in versions: | 73 for db_dic in versions: |
| 62 db_date_list.append(datetime.strptime(db_dic["date"], | 74 db_date_list.append( |
| 63 '%Y-%m-%d').date()) | 75 datetime.strptime(db_dic["date"], "%Y-%m-%d").date() |
| 64 filtered_version = max(versions, key=lambda x: x['date']) | 76 ) |
| 65 else: | 77 filtered_version = max(versions, key=lambda x: x["date"]) |
| 66 filtered_version = None | 78 else: |
| 67 for item in versions: | 79 filtered_version = None |
| 68 if '{0}.{1}'.format(item["major"], item["minor"]) == db_version: | 80 for item in versions: |
| 69 filtered_version = item | 81 if "{0}.{1}".format(item["major"], item["minor"]) == self.db_version: |
| 70 break | 82 filtered_version = item |
| 71 if filtered_version is None: | 83 break |
| 72 print("No matching version detected in the list") | 84 if filtered_version is None: |
| 73 if filtered_version is not None: | 85 print("No matching version detected in the list") |
| 74 self.db_url = f"https://zenodo.org/record/" \ | 86 else: |
| 75 f"{filtered_version['record']}/files/db.tar.gz" | 87 self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}" |
| 76 self.db_version = db_version | 88 return filtered_version |
| 77 return filtered_version | |
| 78 | 89 |
| 79 def get_data_manager(self, bakta_database_info): | 90 def get_data_manager(self, bakta_database_info): |
| 80 self.bakta_table_list = self.get_data_table_format() | 91 self.bakta_table_list = self.get_data_table_format() |
| 81 bakta_name = f"V{bakta_database_info['major']}." \ | 92 bakta_name = ( |
| 82 f"{bakta_database_info['minor']}_" \ | 93 f"V{bakta_database_info['major']}." |
| 83 f"{bakta_database_info['date']}" | 94 f"{bakta_database_info['minor']}{self.db_type}_" |
| 84 tool_version = str(f"{bakta_database_info['software-min']['major']}." | 95 f"{bakta_database_info['date']}" |
| 85 f"{bakta_database_info['software-min']['minor']}") | 96 ) |
| 86 data_info = dict(value=bakta_name, | 97 tool_version = str( |
| 87 dbkey=bakta_database_info['record'], | 98 f"{bakta_database_info['software-min']['major']}." |
| 88 bakta_version=tool_version, | 99 f"{bakta_database_info['software-min']['minor']}" |
| 89 path="db") | 100 ) |
| 101 data_info = dict( | |
| 102 value=bakta_name, | |
| 103 dbkey=bakta_database_info["record"], | |
| 104 bakta_version=tool_version, | |
| 105 path=self.db_name, | |
| 106 ) | |
| 90 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] | 107 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] |
| 91 return self.bakta_table_list | 108 return self.bakta_table_list |
| 92 | 109 |
| 93 | 110 |
| 94 class InstallBaktaDatabase(GetBaktaDatabaseInfo): | 111 class InstallBaktaDatabase(GetBaktaDatabaseInfo): |
| 96 Download the bakta database, | 113 Download the bakta database, |
| 97 check md5 sum, | 114 check md5 sum, |
| 98 untar the download db and update for the amrfinderplus database | 115 untar the download db and update for the amrfinderplus database |
| 99 """ | 116 """ |
| 100 | 117 |
| 101 def __init__(self, | 118 def __init__( |
| 102 db_dir=Path.cwd(), | 119 self, |
| 103 db_name="bakta", | 120 db_dir=Path.cwd(), |
| 104 tarball_name="db.tar.gz", | 121 db_name="bakta-db", |
| 105 test_mode=False): | 122 db_version="latest", |
| 123 test_mode=False | |
| 124 ): | |
| 106 super().__init__() | 125 super().__init__() |
| 107 self.md5 = None | 126 self.md5 = None |
| 127 self.db_version = db_version | |
| 108 self.db_dir = db_dir | 128 self.db_dir = db_dir |
| 109 self.db_name = db_name | 129 self.db_name = db_name |
| 110 self.tarball_name = tarball_name | 130 self.tarball_path = "" |
| 111 self.tarball_path = None | |
| 112 self.test_mode = test_mode | 131 self.test_mode = test_mode |
| 132 self.get_database_type() | |
| 113 | 133 |
| 114 def download(self): | 134 def download(self): |
| 115 self.db_name = f'{self.db_name}_{self.db_version}' | 135 #self.db_name = f"{self.db_name}_{self.db_version}{self.db_type}" |
| 116 bakta_path = Path(self.db_dir).joinpath(self.tarball_name) | 136 bakta_path = Path(self.db_dir).joinpath(self.tar_name) |
| 117 try: | 137 try: |
| 118 with bakta_path.open('wb') as fh_out, \ | 138 with bakta_path.open("wb") as fh_out, requests.get( |
| 119 requests.get(self.db_url, stream=True) as resp: | 139 self.db_url, stream=True) as resp: |
| 120 total_length = resp.headers.get('content-length') | 140 total_length = resp.headers.get("content-length") |
| 121 if total_length is None: # no content length header | 141 if total_length is None: # no content length header |
| 122 for data in resp.iter_content(chunk_size=1024 * 1024): | 142 for data in resp.iter_content(chunk_size=1024 * 1024): |
| 123 fh_out.write(data) | 143 fh_out.write(data) |
| 124 else: | 144 else: |
| 125 for data in resp.iter_content(chunk_size=1024 * 1024): | 145 for data in resp.iter_content(chunk_size=1024 * 1024): |
| 126 fh_out.write(data) | 146 fh_out.write(data) |
| 127 print(f'Download bakta database {self.db_version}') | 147 print(f"Download bakta database {self.db_version}") |
| 128 self.tarball_path = bakta_path | 148 self.tarball_path = bakta_path |
| 129 except IOError: | 149 except IOError: |
| 130 print(f'ERROR: Could not download file from Zenodo!' | 150 print( |
| 131 f' url={self.db_url}, path={self.tarball_name}') | 151 f"ERROR: Could not download file from Zenodo!" |
| 152 f" url={self.db_url}, to={self.tarball_path}" | |
| 153 ) | |
| 132 | 154 |
| 133 def untar(self): | 155 def untar(self): |
| 134 db_path = Path(self.db_dir).as_posix() | 156 db_path = Path(self.db_dir).joinpath(self.db_name) |
| 135 try: | 157 try: |
| 136 with self.tarball_path.open('rb') as fh_in, \ | 158 with self.tarball_path.open("rb") as fh_in, tarfile.open( |
| 137 tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file: | 159 fileobj=fh_in, mode="r:gz" |
| 160 ) as tar_file: | |
| 138 tar_file.extractall(path=db_path) | 161 tar_file.extractall(path=db_path) |
| 139 print(f'Untar the database in {db_path}') | 162 print(f"Untar the database in {db_path}") |
| 140 return db_path | 163 # return db_path |
| 141 except OSError: | 164 except OSError: |
| 142 sys.exit(f'ERROR: Could not extract {self.tarball_name} ' | 165 sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {db_path}") |
| 143 f'to {self.db_name}') | |
| 144 | 166 |
| 145 def calc_md5_sum(self, buffer_size=1048576): | 167 def calc_md5_sum(self, buffer_size=1048576): |
| 146 tarball_path = Path(self.db_dir).joinpath(self.tarball_name) | 168 tarball_path = Path(self.db_dir).joinpath(self.tar_name) |
| 147 self.md5 = self.fetch_db_versions(db_version=self.db_version)["md5"] | |
| 148 md5 = hashlib.md5() | 169 md5 = hashlib.md5() |
| 149 with tarball_path.open('rb') as fh: | 170 with tarball_path.open("rb") as fh: |
| 150 data = fh.read(buffer_size) | 171 data = fh.read(buffer_size) |
| 151 while data: | 172 while data: |
| 152 md5.update(data) | 173 md5.update(data) |
| 153 data = fh.read(buffer_size) | 174 data = fh.read(buffer_size) |
| 154 if md5.hexdigest() == self.md5: | 175 if md5.hexdigest() == self.md5: |
| 155 print('\t...md5 control database OK') | 176 print("\t...md5 control database OK") |
| 156 else: | 177 else: |
| 157 print(f"Error: corrupt database file! " | 178 print( |
| 158 f"calculated md5 = {md5.hexdigest()}" | 179 f"Error: corrupt database file! " |
| 159 f" different from {self.md5} ") | 180 f"calculated md5 = {md5.hexdigest()}" |
| 160 | 181 f" different from {self.md5} " |
| 161 | 182 ) |
| 162 """ | |
| 163 This is the method to download the amrfinderplus database need by bakta. | |
| 164 Deprecated to use the amrfinderplus data_manager | |
| 165 def update_amrfinderplus_db(self): | |
| 166 amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db" | |
| 167 if self.db_version == "test": | |
| 168 cmd = [ | |
| 169 'amrfinder_update', | |
| 170 '--database', str(amrfinderplus_db_path), | |
| 171 '--force_update', | |
| 172 '--help' | |
| 173 ] | |
| 174 else: | |
| 175 cmd = [ | |
| 176 'amrfinder_update', | |
| 177 '--database', str(amrfinderplus_db_path), | |
| 178 '--force_update' | |
| 179 ] | |
| 180 proc = sp.run( | |
| 181 cmd, | |
| 182 universal_newlines=True | |
| 183 ) | |
| 184 if proc.returncode != 0: | |
| 185 print(f"ERROR: AMRFinderPlus failed! " | |
| 186 f"command: 'amrfinder_update --force_update" | |
| 187 f" --database {amrfinderplus_db_path}'") | |
| 188 else: | |
| 189 print("AMRFinderPlus database download") | |
| 190 """ | |
| 191 | 183 |
| 192 | 184 |
| 193 def parse_arguments(): | 185 def parse_arguments(): |
| 194 # parse options and arguments | 186 # parse options and arguments |
| 195 arg_parser = argparse.ArgumentParser() | 187 arg_parser = argparse.ArgumentParser() |
| 196 arg_parser.add_argument("data_manager_json") | 188 arg_parser.add_argument("data_manager_json") |
| 197 arg_parser.add_argument("-d", "--database_version", | 189 arg_parser.add_argument( |
| 198 help='Select the database version ' | 190 "-d", |
| 199 '(major and minor eg. 4.0),' | 191 "--database_version", |
| 200 'default is the latest version', | 192 help="Select the database version " |
| 201 default="latest", | 193 "(major and minor eg. 4.0)," |
| 202 required=True) | 194 "default is the latest version", |
| 203 arg_parser.add_argument("-t", "--test", action='store_true', | 195 default="latest", |
| 204 help="option to test the script with an empty database") | 196 required=True, |
| 197 ) | |
| 198 arg_parser.add_argument( | |
| 199 "-t", | |
| 200 "--test", | |
| 201 action="store_true", | |
| 202 help="option to test the script with an empty database", | |
| 203 ) | |
| 205 return arg_parser.parse_args() | 204 return arg_parser.parse_args() |
| 206 | 205 |
| 207 | 206 |
| 208 def main(): | 207 def main(): |
| 209 all_args = parse_arguments() | 208 all_args = parse_arguments() |
| 210 with open(all_args.data_manager_json) as fh: | 209 with open(all_args.data_manager_json) as fh: |
| 211 params = json.load(fh) | 210 params = json.load(fh) |
| 212 target_dir = params['output_data'][0]['extra_files_path'] | 211 target_dir = params["output_data"][0]["extra_files_path"] |
| 213 os.makedirs(target_dir) | 212 os.makedirs(target_dir) |
| 214 # init the class to download bakta db | 213 # init the class to download bakta db |
| 215 bakta_upload = InstallBaktaDatabase(test_mode=all_args.test) | 214 bakta_upload = InstallBaktaDatabase( |
| 216 bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version) | 215 test_mode=all_args.test, db_version=all_args.database_version |
| 216 ) | |
| 217 bakta_db = bakta_upload.fetch_db_versions() | |
| 217 # update the path for galaxy | 218 # update the path for galaxy |
| 218 bakta_upload.db_dir = target_dir | 219 bakta_upload.db_dir = target_dir |
| 219 # download the database | 220 # download the database |
| 220 bakta_upload.download() | 221 bakta_upload.download() |
| 221 # check md5 sum | 222 # check md5 sum |
| 222 bakta_upload.calc_md5_sum() | 223 bakta_upload.calc_md5_sum() |
| 223 # untar db | 224 # untar db |
| 224 bakta_upload.untar() | 225 bakta_upload.untar() |
| 225 # make the data_manager metadata | 226 # make the data_manager metadata |
| 226 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) | 227 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) |
| 227 with open(all_args.data_manager_json, 'w') as fh: | 228 with open(all_args.data_manager_json, "w") as fh: |
| 228 json.dump(bakta_data_manager, fh, sort_keys=True) | 229 json.dump(bakta_data_manager, fh, sort_keys=True) |
| 229 | 230 |
| 230 | 231 |
| 231 if __name__ == '__main__': | 232 if __name__ == "__main__": |
| 232 main() | 233 main() |
