annotate data_manager/fetch_vep_cache_data.py @ 2:17c98d091710 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:19:31 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
1 import argparse
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
2 import os
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
3 import json
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
4 import re
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
5 import pprint
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
6 import subprocess
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
7 import sys
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
8
17c98d091710 Uploaded
dvanzessen
parents:
diff changeset
def resolve_species(species, species_type):
    """Translate the requested species into INSTALL.pl naming.

    Args:
        species: Value of ``--species``. Names starting with ``homo_sapiens``
            must end in ``37`` or ``38`` to select the human assembly.
        species_type: One of ``"ensembl"``, ``"refseq"`` or ``"merged"``.

    Returns:
        A ``(species_name, assembly_args)`` tuple. ``species_name`` is the
        value to pass to ``--SPECIES`` (with a ``_refseq`` / ``_merged``
        suffix when applicable) and ``assembly_args`` is a list holding the
        extra ``--ASSEMBLY`` arguments, empty for non-human species.

    Raises:
        ValueError: If a human species does not end in ``37`` or ``38``.
    """
    assembly_args = []
    if species.startswith("homo_sapiens"):
        if species.endswith("37"):
            assembly_args = ["--ASSEMBLY", "GRCh37"]
        elif species.endswith("38"):
            assembly_args = ["--ASSEMBLY", "GRCh38"]
        else:
            raise ValueError("Unknown human assembly")
        # The assembly is passed separately, so the suffix is dropped here.
        species = "homo_sapiens"

    if species_type in ("refseq", "merged"):
        species = "{0}_{1}".format(species, species_type)

    return species, assembly_args


def build_install_command(output_dir, species, assembly_args):
    """Return the ``vep_install`` invocation as an argument list.

    A list (rather than a shell string) keeps paths or species names that
    contain spaces or shell metacharacters intact as single arguments.

    Args:
        output_dir: Directory passed to ``--CACHEDIR``.
        species: Value passed to ``--SPECIES``.
        assembly_args: Extra arguments appended verbatim (may be empty).

    Returns:
        The command as a list of strings.
    """
    command = [
        "vep_install",
        "--NO_HTSLIB",
        "-a", "alcf",
        "--CACHEDIR", output_dir,
        "--SPECIES", species,
    ]
    command.extend(assembly_args)
    return command


def main(argv=None):
    """Download a VEP cache and write the data table entry describing it.

    Reads the JSON found in ``--output-file`` (printed for debugging only),
    runs ``vep_install`` inside ``--output-dir`` and then overwrites
    ``--output-file`` with a ``data_tables`` / ``vep_cache_data`` JSON
    document. NOTE(review): this looks like a Galaxy data manager script;
    confirm the expected JSON schema against the data manager definition.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The exit code of ``vep_install``, or 1 for an unknown human assembly.
    """
    parser = argparse.ArgumentParser(description="Use VEP INSTALL.pl to download/process the cache for an assembly")
    # All three are used unconditionally below, so fail early with a usage
    # message instead of an AttributeError/TypeError on None.
    parser.add_argument("--output-file", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--species", required=True)
    parser.add_argument("--species-type", choices=["ensembl", "refseq", "merged"], default="ensembl")
    args = parser.parse_args(argv)

    output_file = args.output_file
    output_dir = args.output_dir
    species_type = args.species_type

    try:
        species, assembly_args = resolve_species(args.species, species_type)
    except ValueError as error:
        print(error)
        return 1

    with open(output_file) as output_file_handle:
        params = json.load(output_file_handle)

    print(output_file)
    print(output_dir)
    print(species)
    print(species_type)
    pprint.pprint(params)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # The child process runs with cwd set to the cache directory, so a
    # relative --CACHEDIR would be resolved inside it (out/out). Use the
    # absolute path so the cache lands where the data table entry points.
    cache_dir = os.path.abspath(output_dir)
    vep_install_cmd = build_install_command(cache_dir, species, assembly_args)

    print("Running INSTALL.pl")
    print(" ".join(vep_install_cmd))
    exit_code = subprocess.call(vep_install_cmd, cwd=cache_dir)

    print(exit_code)

    output_dict = dict(
        data_tables=dict(
            vep_cache_data=[{
                "value": species,
                "path": output_dir,
                "dbkey": args.species,
                "type": species_type,
                "name": "{0} ({1})".format(args.species, species_type)
            }]
        )
    )
    with open(output_file, 'w') as output_file_handle:
        output_file_handle.write(json.dumps(output_dict))
    return exit_code


if __name__ == "__main__":
    sys.exit(main())