annotate data_manager/data_manager_vep_cache_download.py @ 7:7890790d2afd draft

Fully working now and improved several things
author sh477
date Tue, 01 Mar 2022 18:12:26 +0000
parents a3dba0440f08
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
1 #!/usr/bin/env python
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
2
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
3 import datetime
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
4 import json
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
5 import os
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
6 import re
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
7 from urllib.request import urlretrieve
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
8 import sys
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
9 import tarfile
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
10
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
11
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
def main():
    """Download a VEP cache archive and write its data table entry.

    The JSON parameter file named by ``sys.argv[1]`` is read for the download
    location (``param_dict.url`` + ``param_dict.file_name``), the ``dbkey``
    and the target directory (``output_data[0].extra_files_path``).  The
    gzip-compressed tar archive is downloaded into the target directory,
    unpacked there and then deleted.  Finally the parameter file is
    overwritten with a ``vep_versioned_annotation_cache`` data table entry
    describing the new cache.

    Raises:
        ValueError: if ``file_name`` does not match the expected
            ``<species>[_merged|_refseq]_vep_<version>_...`` pattern, so the
            species, cache type and version cannot be derived from it.
    """
    # Read in given out_file
    with open(sys.argv[1]) as fh:
        params = json.load(fh)
    param_dict = params['param_dict']
    file_name = param_dict['file_name']
    dbkey = param_dict['dbkey']

    # Derive the metadata from the file name first, so that a malformed name
    # is rejected before anything is created on disk or downloaded.
    m = re.search(r"(.*?)(merged|refseq)?_vep_(\d+?)_", file_name)
    if m is None:
        raise ValueError(
            "Cannot derive species, cache type and version from file name "
            f"{file_name!r}"
        )
    version = m.group(3)
    cache_type = m.group(2) or "default"
    species = m.group(1).rstrip("_")
    type_suffix = '' if cache_type == 'default' else ', ' + cache_type.capitalize()
    display_name = f"{species.capitalize().replace('_', ' ')} {dbkey} (V{version}{type_suffix})"

    # Remove the archive extension as a suffix.  str.strip(".tar.gz") would
    # instead strip any of the characters ".targz" from BOTH ends and mangle
    # names such as "gallus_gallus_..." or "rattus_norvegicus_...".
    archive_suffix = ".tar.gz"
    if file_name.endswith(archive_suffix):
        cache_name = file_name[:-len(archive_suffix)]
    else:
        cache_name = file_name

    # Create target directory for file download (tolerate an existing one)
    target_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(target_directory, exist_ok=True)

    # Download and extract given cache archive, remove archive afterwards
    url = param_dict['url'].rstrip("/") + "/" + file_name.lstrip("/")
    final_file, _headers = urlretrieve(url, os.path.join(target_directory, file_name))
    try:
        with tarfile.open(final_file, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Interpreters with extraction filters: refuse members that
                # would escape the target directory.
                tar.extractall(target_directory, filter="data")
            else:
                tar.extractall(target_directory)
    finally:
        # The archive is removed even when extraction fails.
        os.remove(final_file)

    # Construct metadata for the new data table entry
    data_manager_dict = {
        'data_tables': {
            'vep_versioned_annotation_cache': [
                {
                    'value': cache_name,
                    'dbkey': dbkey,
                    'version': version,
                    'cachetype': cache_type,
                    'name': display_name,
                    'species': species,
                    'path': './%s' % cache_name,
                }
            ]
        }
    }

    # Save metadata to out_file
    with open(sys.argv[1], 'w') as fh:
        json.dump(data_manager_dict, fh, sort_keys=True)
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
54
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
55
a3dba0440f08 Reformatting
sh477
parents: 4
diff changeset
# Run the download only when this file is executed as a script.
if __name__ == "__main__":
    main()