Mercurial > repos > artbio > ngsplot
diff bin/ngsplotdb.py @ 0:3ca58369469c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
| author | artbio |
|---|---|
| date | Wed, 06 Dec 2017 19:01:53 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/ngsplotdb.py Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,670 @@ +#!/usr/bin/env python +# +# ngsplotdb - PYTHON script to manipulate the ngs.plot database installation. +# Author: Li Shen, Mount Sinai School of Medicine +# Date created: Dec 28, 2012 +# Last updated: May 28, 2013 +# +# Functions implemented: +# +# list - List locally installed genomes. +# check - Check the integrity of local database. +# install - Install a genome from ngsplotdb.XXX.tar.gz. +# remove - Remove a locally installed genome. + + +# def dir_to_idspec(dirn): +# """Convert an FTP directory name to species ID and Name. +# This works with both UCSC and Ensembl FTP. +# """ +# parts = dirn.split('_') +# sid = parts[0][:3].lower() + parts[1][:3].lower() +# spec = str.title(parts[0] + ' ' + parts[1]) +# return sid, spec + +# def listremote(database="refseq"): +# """List available genomes on a remote FTP site.""" + +# import urllib2 + +# url_ensembl = "ftp://ftp.ensembl.org/pub/current_gtf/" +# url_ucsc = "ftp://hgdownload.cse.ucsc.edu/goldenPath/currentGenomes/" + +# if database == "refseq": +# ftp = urllib2.urlopen(url_ucsc) +# elif database == "ensembl": +# ftp = urllib2.urlopen(url_ensembl) +# else: +# pass + +# # Read directory names from the remote FTP site. +# dir_names = ftp.read().splitlines() +# # Parse lines to obtain clean directory names only. +# if database == "refseq": +# dir_names = map(lambda x: x.split()[-3], dir_names) +# elif database == "ensembl": +# dir_names = map(lambda x: x.split()[-1], dir_names) +# else: +# pass +# # Filter directory names that do not correspond to a species. +# # On UCSC FTP, both "." and ".." will be retrieved from above and can +# # cause troubles. + +# id_spec_list = map(dir_to_idspec, dir_names) +# print "ID\tName" +# for r in id_spec_list: +# print r[0], "\t", r[1] + +def read_gnlist(root_path, mode): + """Read installed genomes list. + + Args: + root_path: ngs.plot installation directory. + mode: row reading mode - "vector" or "hash", correspond to each record + read as a vector of hash table. + Returns: (header split vector, hash table of installed genomes, + vector of column widths) + """ + from collections import defaultdict + + gn_list = root_path + "/database/gn_list.txt" + try: + db_f = open(gn_list, "r") + except IOError: + print "Open {0} error: your ngs.plot database may be corrupted.".\ + format(gn_list) + sys.exit() + + default_list = root_path + "/database/default.tbl" + try: + default_f = open(default_list, "r") + except IOError: + print "Open {0} error: your ngs.plot database may be corrupted.".\ + format(default_list) + sys.exit() + default_l = [] + header = default_f.readline() # skip the header + for rec in default_f: + r_sp = rec.rstrip().split("\t") + default_l.append((r_sp[0], r_sp[2])) # tuple of genome and region, like ("dm3", "genebody") + genome_region_dict = defaultdict(list) + for genome,region in default_l: + genome_region_dict[genome].append(region) + + header = db_f.readline() + h_sp = header.rstrip().split("\t") + h_sp.append("InstalledFeatures") + v_cw = map(len, h_sp) # column widths initialize to header widths. + + g_tbl = {} + for rec in db_f: + r_sp = rec.rstrip().split("\t") + r_sp.append(",".join(genome_region_dict[r_sp[0]])) + r_cw = map(len, r_sp) + v_cw = map(max, v_cw, r_cw) + + r_tbl = dict(zip(h_sp, r_sp)) + if(mode == "vector"): + g_tbl[r_tbl["ID"]] = r_sp + elif(mode == "hash"): + g_tbl[r_tbl["ID"]] = r_tbl + else: + pass + + db_f.close() + + return (h_sp, g_tbl, v_cw) + + +def update_gnlist(root_path, g_tbl, gn, assembly, species, ens_v, np_v): + """Update/Add genomes in ngs.plot database. + + Args: + root_path: ngs.plot installation directory. + g_tbl: genome hash table. Each value is also a hash table of + genome info. + gn: genome name. + assembly: genome assembly name. + species: species name. + ens_v: new Ensembl version. + np_v: new ngs.plot version. + Returns: None + """ + + if gn in g_tbl: + g_tbl[gn]["EnsVer"] = ens_v + g_tbl[gn]["NPVer"] = np_v + else: + g_tbl[gn] = {"ID": gn, "Assembly": assembly, "Species": species, \ + "EnsVer": ens_v, "NPVer": np_v} + + write_gnlist(root_path, g_tbl) + + +def write_gnlist(root_path, g_tbl): + """Write genome hash table to database file: gn_list.txt. + + Args: + root_path: ngs.plot installation directory. + g_tbl: genome hash table. Each value is also a hash table of + genome info. + Returns: None + """ + + output_order = ["ID", "Assembly", "Species", "EnsVer", "NPVer"] + gn_list = root_path + "/database/gn_list.txt" + + try: + db_f = open(gn_list, "w") + except IOError: + print "Open {0} error: your ngs.plot database may be corrupted.".\ + format(gn_list) + sys.exit() + + db_f.write("\t".join(output_order) + "\n") + + for k in g_tbl.viewkeys(): + rec = [] + for col in output_order: + rec.append(str(g_tbl[k][col])) + db_f.write("\t".join(rec) + "\n") + + db_f.close() + + +def listgn(root_path, args): + """List installed genomes in local database on screen. + + Args: + root_path: ngs.plot installation directory. + args: currently not used. + Returns: None. + """ + + import math + + (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "vector") + + # Format column widths to beautify output. + tab_u = 4 + v_cw = map(lambda x: int(math.ceil(float(x) / tab_u) * tab_u + 1), v_cw) + + # List genomes to screen. + print "".join(map(lambda x, y: x.ljust(y), h_sp, v_cw)) # header. + for k in sorted(g_tbl.viewkeys(), key=str.lower): + print "".join(map(lambda x, y: x.ljust(y), g_tbl[k], v_cw)) + +def isExAnno(pkg_files): + """If the package is an extension package based on cell lines for ngs.plot, + like enhancer or dhs. + + Args: + pkg_files: list of files in the package. + Returns: + isEx: Bool, if the package is an extension package. + feature: string, feature name contained in the extension package. None + if not an extension package. + RDcount: number of RData tables in the package. + """ + + import os.path + + exclusive_files = [".chrnames.refseq", ".chrnames.ensembl", ".metainfo"] + isEx = False + RDcount = 0 + feature = None + for file_name in pkg_files: + if os.path.basename(file_name) in exclusive_files: + continue + if (not file_name.endswith("RData")) and (file_name.count("/")) > 0: + isEx = True + feature = file_name.split("/")[1] + elif file_name.endswith("RData"): + RDcount += 1 + return (isEx, feature, RDcount) + +def install(root_path, args): + """Interactive session for installing genome from package file. + + Args: + root_path: ngs.plot installation directory. + args.yes: say yes to all questions. + args.pkg: path of package file. + Returns: + None. + """ + + import tarfile + import sys + + yestoall = args.yes + pkg_file = args.pkg + # Package name: e.g. ngsplotdb_mm10_71_1.0.tar.gz. + # Information about the genome is in mm10/.metainfo. + try: + pkg_f = tarfile.open(pkg_file, "r:gz") + except tarfile.ReadError: + print "Read package file {0} error.".format(pkg_file), + print "The downloaded file may be corrupted." + sys.exit() + + print "Extracting information from package...", + sys.stdout.flush() + pkg_files = pkg_f.getnames() + g_folder = pkg_files[0] + # Minus folder name and .metainfo file name. + (isEx, feature, RDcount) = isExAnno(pkg_files) + if isEx: + print feature + " extension package, ", + print "contains " + str(RDcount + 2) + " tables." + else: + print "contains " + str(RDcount + 2) + " tables." + # .metainfo file: + # "ID"<TAB>mm10 + # "Assembly"<TAB>GRCm38 + # "Species"<TAB>mus_musculus + # "EnsVer"<TAB>71 + # "NPVer"<TAB>1.0 + meta_f = pkg_f.extractfile(g_folder + "/.metainfo") + meta_tbl = {} + if meta_f: + for rec in meta_f: + rec_sp = rec.strip().split("\t") + meta_tbl[rec_sp[0]] = rec_sp[1] # format: Variable<TAB>Value + else: + print "No meta information found in the package file.", + print "The downloaded file may be corrupted." + sys.exit() + meta_f.close() + pkg_f.close() + + gn_inst = meta_tbl["ID"] + assembly = meta_tbl["Assembly"] + species = meta_tbl["Species"] + ens_ver = float(meta_tbl["EnsVer"]) + np_ver = float(meta_tbl["NPVer"]) + + # Database file. + (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "hash") + + if gn_inst in g_tbl: + installed_ens = float(g_tbl[gn_inst]["EnsVer"]) + installed_np = float(g_tbl[gn_inst]["NPVer"]) + + # For extension package. + # Only the same version with basic package could be installed. + if isEx: + if ens_ver == installed_ens and np_ver == installed_np: + print "Will upgrade the genome {0} with {1} annotation:".format(gn_inst, \ + feature), + print "Ensembl: {0}; ngs.plot: {1}.".\ + format(installed_ens, np_ver) + if yestoall: + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N: ") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + return + else: + print "This is an extension package of " + feature + \ + ", ENS version " + str(ens_ver) + ", NPVer" + str(np_ver) + \ + ", please install the same version basic genome annotation first!" + return + + # For basic package. + # Install a newer version. + if ens_ver > installed_ens or \ + ens_ver == installed_ens and np_ver > installed_np: + print "Will upgrade the genome {0}:".format(gn_inst) + print "Ensembl: {0}==>{1}; ngs.plot: {2}==>{3}.".\ + format(installed_ens, ens_ver, installed_np, np_ver), + if yestoall: + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N: ") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + # Install an older version. + else: + print "Will install the same or older version", + print "of the genome {0}:".format(gn_inst) + print "Ensembl: {0}==>{1}; ngs.plot: {2}==>{3}.".\ + format(installed_ens, ens_ver, installed_np, np_ver), + if yestoall: + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N: ") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + # Totally new installation, only basic package could be installed. + else: + print "Will install new genome {0}:".format(gn_inst), + print "Ensembl=> v{0}; ngs.plot=> v{1}.".format(ens_ver, np_ver), + if yestoall: + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N:") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + + +def install_pkg(root_path, pkg_file, new_gn, old_gn=None): + """Install genome from package file. + + Args: + root_path: ngs.plot installation directory. + pkg_file: package file name. + old_gn: old genome name to be removed first. + Returns: + None. + """ + + import shutil + import tarfile + import sys + + if old_gn: + print "Removing installed genome:{0}...".format(old_gn), + sys.stdout.flush() + shutil.rmtree(root_path + '/database/' + old_gn) + rm_dbtbl(root_path, old_gn) + print "Done" + + print "Installing new genome...", + sys.stdout.flush() + try: + tar_f = tarfile.open(pkg_file, "r:gz") + except tarfile.ReadError: + print "Read package file {0} error: ".format(pkg_file), + print "downloaded file may be corrupted." + sys.exit() + + try: + tar_f.extractall(root_path + "/database") + except tarfile.ExtractError: + print "Extract files from package error.", + print "The downloaded file may be corrupted." + + add_dbtbl(root_path, new_gn) + + print "Done" + + +def add_dbtbl(root_path, gn): + """Add a new genome into database meta tables. + + Args: + root_path: ngs.plot installation directory. + gn: genome name to add. + Returns: None. + """ + + import os.path + import os + import glob + import tempfile + import subprocess + + (dblist_f, dblist_n) = tempfile.mkstemp(text=True) + (gnlist_f, gnlist_n) = tempfile.mkstemp(text=True) + + # Obtain a list of .RData file names to extract info from. + all_rdata = root_path + '/database/{0}/{1}*.RData'.format(gn, gn) + all_set = set(map(os.path.basename, glob.glob(all_rdata))) + all_rdata = root_path + '/database/{0}/*/{1}*.RData'.format(gn, gn) + all_set = all_set | set(map(os.path.basename, glob.glob(all_rdata))) + exm_rdata = root_path + '/database/{0}/{1}*exonmodel*.RData'.format(gn, gn) + exm_set = set(map(os.path.basename, glob.glob(exm_rdata))) + nex_list = sorted(list(all_set - exm_set)) + + # DB file list. + gn_ens_list = {} # Ensembl genome name unique list. + desired_ncols = 6 + for fname in nex_list: + tokens = fname.split('.') + tokens = tokens[:6] + if tokens[-1] == 'RData': + tokens[-1] = 'NA' + tokens.extend(['NA'] * (desired_ncols - len(tokens))) + os.write(dblist_f, fname + "\t" + "\t".join(tokens) + "\n") + if tokens[1] == 'ensembl': + gn_ens_list[".".join(tokens[:3])] = 1 + os.close(dblist_f) + + # Ensembl genome name list. + for gn in sorted(gn_ens_list.viewkeys()): + os.write(gnlist_f, gn.replace(".", "\t") + "\n") + os.close(gnlist_f) + + # Call external program to create table default file. + (default_f, default_n) = tempfile.mkstemp(text=True) + subprocess.call(["setTableDefaults.py", gnlist_n, default_n]) + os.remove(gnlist_n) + + # Call external program to calcualte DB score and install the new entries + # into ngs.plot database. + subprocess.call(["install.db.tables.r", default_n, dblist_n]) + os.remove(default_n) + os.remove(dblist_n) + + +def remove(root_path, args): + """Remove genome from ngs.plot database. + + Args: + root_path: ngs.plot installation directory. + args.yes: say yes to all questions. + args.gn: genome name. + Returns: + None. + """ + + import shutil + import sys + + yestoall = args.yes + sub_ftr = args.ftr + + (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "hash") + + gn = args.gn + + if gn in g_tbl and sub_ftr is None: + print "Will remove genome {0} from database.".format(gn), + do_rm = False + if yestoall: + do_rm = True + else: + ans = raw_input("Continue?(y/n): ") + while True: + if ans == 'y' or ans == 'Y' or ans == 'n' or ans == 'N': + break + else: + ans = raw_input("The answer must be y/Y or n/N: ") + if ans == 'y' or ans == 'Y': + do_rm = True + if do_rm: + folder_to_rm = root_path + "/database/" + gn + print "Removing genome...", + sys.stdout.flush() + shutil.rmtree(folder_to_rm) + del g_tbl[gn] + write_gnlist(root_path, g_tbl) + rm_dbtbl(root_path, gn) + print "Done" + elif gn in g_tbl and sub_ftr is not None: + print "Will remove genomic feature {0} of genome {1} from database."\ + .format(sub_ftr, gn) + do_rm = False + if yestoall: + do_rm = True + else: + ans = raw_input("Continue?(y/n): ") + while True: + if ans == 'y' or ans == 'Y' or ans == 'n' or ans == 'N': + break + else: + ans = raw_input("The answer must be y/Y or n/N: ") + if ans == 'y' or ans == 'Y': + do_rm = True + if do_rm: + folder_to_rm = root_path + "/database/" + gn + "/" + sub_ftr + print "Removing genome...", + sys.stdout.flush() + shutil.rmtree(folder_to_rm) + # g_tbl does't need to be updated + rm_dbtbl(root_path, gn, sub_ftr) + print "Done" + else: + print "Cannot find the genome in database. Nothing was done." + + +def rm_dbtbl(root_path, gn, sub_ftr=None): + """Remove a genome from database meta tables. + + Args: + root_path: ngs.plot installation directory. + gn: genome name to remove. + Returns: None. + """ + + import subprocess + + if sub_ftr is None: + subprocess.call(["remove.db.tables.r", gn]) + else: + subprocess.call(["remove.db.tables.r", gn, sub_ftr]) + +def chrnames(root_path, args): + """List chromosome names for a given genome. + + Args: + root_path: ngs.plot installation directory. + args.gn: genome name to remove. + args.db: database(ensembl or refseq). + Returns: None. + """ + + import sys + + db = args.db + gn = args.gn + + if db != "ensembl" and db != "refseq": + print "Unrecognized database name: database must be ensembl or refseq." + sys.exit() + + chrnames_file = root_path + "/database/" + gn + "/.chrnames." + db + + try: + chrf = open(chrnames_file) + except IOError: + print "Open file: {0} error.".format(chrnames_file), + print "Your database may be corrupted or you have an older version." + sys.exit() + + chr_list = chrf.read(1000000) # set 1MB limit to avoid nasty input. + chrf.close() + + chr_list = chr_list.strip() + print chr_list + + + +############################################### +######## Main procedure. ###################### +if __name__ == "__main__": + + import argparse + import os + import sys + + try: + root_path = os.environ["NGSPLOT"] + except KeyError: + print "Environmental variable NGSPLOT is not set! Exit.\n" + sys.exit() + + # Main parser. + parser = argparse.ArgumentParser(description="Manipulate ngs.plot's \ + annotation database", + prog="ngsplotdb.py") + parser.add_argument("-y", "--yes", help="Say yes to all prompted questions", + action="store_true") + + subparsers = parser.add_subparsers(title="Subcommands", + help="additional help") + + # ngsplotdb.py list parser. + parser_list = subparsers.add_parser("list", help="List genomes installed \ + in database") + parser_list.set_defaults(func=listgn) + + # ngsplotdb.py install parser. + parser_install = subparsers.add_parser("install", + help="Install genome from package \ + file") + parser_install.add_argument("pkg", help="Package file(.tar.gz) to install", + type=str) + parser_install.set_defaults(func=install) + + # ngsplotdb.py remove parser. + parser_remove = subparsers.add_parser("remove", + help="Remove genome from database") + parser_remove.add_argument("gn", help="Name of genome to be \ + removed(e.g. hg18)", type=str) + parser_remove.add_argument("--ftr", help="Remove cellline specific features \ + (enhancer, dhs, etc)", type=str,\ + default=None) + parser_remove.set_defaults(func=remove) + + # ngsplotdb.py chromosome names parser. + parser_chrnames = subparsers.add_parser("chrnames", + help="List chromosome names") + parser_chrnames.add_argument("gn", help="Genome name(e.g. mm9)", type=str) + parser_chrnames.add_argument("db", help="Database(ensembl or refseq)", + type=str) + parser_chrnames.set_defaults(func=chrnames) + + + args = parser.parse_args() + args.func(root_path, args) +
