diff data_manager/fetch_vep_cache_data.py @ 2:17c98d091710 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:19:31 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_vep_cache_data.py	Mon Jul 15 05:19:31 2019 -0400
@@ -0,0 +1,70 @@
+import argparse
+import os
+import json
+import re
+import pprint
+import subprocess
+import sys
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Use VEP INSTALL.pl to download/process the cache for an assembly")
+    parser.add_argument("--output-file")
+    parser.add_argument("--output-dir")
+    parser.add_argument("--species")
+    parser.add_argument("--species-type", choices=["ensembl", "refseq", "merged"], default="ensembl")
+    args = parser.parse_args()
+
+    output_file = args.output_file
+    output_dir = args.output_dir
+
+    species = args.species
+    assembly = ""
+    
+    if species.startswith("homo_sapiens"):
+        if species.endswith("37"):
+            assembly = " --ASSEMBLY GRCh37"
+        elif species.endswith("38"):
+            assembly = " --ASSEMBLY GRCh38"
+        else:
+            print("Unknown human assembly")
+            sys.exit(1)
+        species = "homo_sapiens"
+
+    species_type = args.species_type
+    if species_type in ["refseq", "merged"]:
+        species = "{0}_{1}".format(species, species_type)
+    
+    with open(output_file) as output_file_handle:
+        params = json.loads(output_file_handle.read())
+
+    print(output_file)
+    print(output_dir)
+    print(species)
+    print(species_type)
+    pprint.pprint(params)
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    
+    vep_install_cmd = "vep_install --NO_HTSLIB -a alcf --CACHEDIR {0} --SPECIES {1}{2}".format(output_dir, species, assembly)
+
+    print("Running INSTALL.pl")
+    print(vep_install_cmd)
+    exit_code = subprocess.call(vep_install_cmd, cwd=output_dir, shell=True)
+
+    print(exit_code)
+
+    output_dict = dict(
+        data_tables=dict(
+            vep_cache_data=[{
+                "value": species,
+                "path": output_dir,
+                "dbkey": args.species,
+                "type": species_type,
+                "name": "{0} ({1})".format(args.species, species_type)
+            }]
+        )
+    )
+    with open(output_file, 'w') as output_file_handle:
+        output_file_handle.write(json.dumps(output_dict))
+    sys.exit(exit_code)
\ No newline at end of file