# HG changeset patch # User artbio # Date 1512604913 18000 # Node ID 3ca58369469c210453c7356a0b111b735ca4aa21 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n' diff -r 000000000000 -r 3ca58369469c bin/install.db.tables.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/install.db.tables.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,73 @@ +#!/usr/bin/env Rscript +# Install the newly obtained DB tables into two ngs.plot system files: +# default.tbl and dbfile.tbl. + +args <- commandArgs(T) +# args <- c('/tmp/tmpLXAGXl', '/tmp/tmpUJduMe') +default.tbl <- args[1] +db.list <- args[2] + +# Genome-region default values. +anno.tbl <- read.table(default.tbl, header=TRUE, sep="\t", + blank.lines.skip=TRUE, stringsAsFactors=FALSE) +row.names(anno.tbl) <- paste(anno.tbl$Genome, anno.tbl$Region, sep=".") + +# Database table list. +anno.db.tbl <- read.table(db.list, + col.names=c("db.file", "Genome", "DB", "Region", + "FI.1", "FI.2", "FI.3"), + sep="\t", blank.lines.skip=TRUE, + stringsAsFactors=FALSE) +tss.tbl <- anno.db.tbl[anno.db.tbl$Region=="genebody", ] +tss.tbl$Region <- "tss" +tes.tbl <- anno.db.tbl[anno.db.tbl$Region=="genebody", ] +tes.tbl$Region <- "tes" +anno.db.tbl <- rbind(anno.db.tbl, tss.tbl, tes.tbl) +anno.db.tbl$URL <- NA # reserve for future use. + +# Extract further info columns. +default.fi <- anno.tbl[, c("DefaultFI1", "DefaultFI2", "DefaultFI3")] +db.fi <- anno.db.tbl[, c("FI.1", "FI.2", "FI.3")] +db.fi$tag <- paste(anno.db.tbl$Genome, anno.db.tbl$Region, sep=".") + +# DB scores. +getScore <- function(v.fi) { +# Calculate DB score based on intersection with default values. +# Args: +# v.fi: character vector of further infos. +# Returns: integer of DB score. + + default.vals <- default.fi[v.fi["tag"], ] + fi.vals <- v.fi[c("FI.1", "FI.2", "FI.3")] + 3 - length(intersect(default.vals, fi.vals)) +} +db.score <- apply(db.fi, 1, getScore) +anno.db.tbl$dbScore <- db.score + +# Save to text output. +progpath <- Sys.getenv('NGSPLOT') +if(progpath == "") { + stop("Set environment variable NGSPLOT before run the program. See README for details.\n") +} +def.f <- file.path(progpath, 'database', 'default.tbl') +db.f <- file.path(progpath, 'database', 'dbfile.tbl') +org.anno.tbl <- read.delim(def.f) +org.anno.db.tbl <- read.delim(db.f) +anno.tbl <- rbind(org.anno.tbl, anno.tbl) +anno.tbl <- unique(anno.tbl) +anno.db.tbl <- rbind(org.anno.db.tbl, anno.db.tbl) +anno.db.tbl <- unique(anno.db.tbl) +write.table(anno.tbl, file=def.f, row.names=F, sep="\t", quote=F) +write.table(anno.db.tbl, file=db.f, row.names=F, sep="\t", quote=F) + +# getScore <- function(i, db.fi, default.fi){ +# score <- 3 - length(intersect(as.matrix(default.fi[db.fi[i, "tag"], ]), as.matrix(db.fi[i, ]))) +# return(score) +# } + +# for (i in (1:dim(db.fi)[1])){ +# anno.db.tbl[i, "dbScore"] <- getScore(i, db.fi, default.fi) +# } + +# anno.version="0.01" +# save(anno.tbl, anno.db.tbl, anno.version, file="database.RData") diff -r 000000000000 -r 3ca58369469c bin/ngsplotdb.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/ngsplotdb.py Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,670 @@ +#!/usr/bin/env python +# +# ngsplotdb - PYTHON script to manipulate the ngs.plot database installation. +# Author: Li Shen, Mount Sinai School of Medicine +# Date created: Dec 28, 2012 +# Last updated: May 28, 2013 +# +# Functions implemented: +# +# list - List locally installed genomes. +# check - Check the integrity of local database. 
+# install - Install a genome from ngsplotdb.XXX.tar.gz. +# remove - Remove a locally installed genome. + + +# def dir_to_idspec(dirn): +# """Convert an FTP directory name to species ID and Name. +# This works with both UCSC and Ensembl FTP. +# """ +# parts = dirn.split('_') +# sid = parts[0][:3].lower() + parts[1][:3].lower() +# spec = str.title(parts[0] + ' ' + parts[1]) +# return sid, spec + +# def listremote(database="refseq"): +# """List available genomes on a remote FTP site.""" + +# import urllib2 + +# url_ensembl = "ftp://ftp.ensembl.org/pub/current_gtf/" +# url_ucsc = "ftp://hgdownload.cse.ucsc.edu/goldenPath/currentGenomes/" + +# if database == "refseq": +# ftp = urllib2.urlopen(url_ucsc) +# elif database == "ensembl": +# ftp = urllib2.urlopen(url_ensembl) +# else: +# pass + +# # Read directory names from the remote FTP site. +# dir_names = ftp.read().splitlines() +# # Parse lines to obtain clean directory names only. +# if database == "refseq": +# dir_names = map(lambda x: x.split()[-3], dir_names) +# elif database == "ensembl": +# dir_names = map(lambda x: x.split()[-1], dir_names) +# else: +# pass +# # Filter directory names that do not correspond to a species. +# # On UCSC FTP, both "." and ".." will be retrieved from above and can +# # cause troubles. + +# id_spec_list = map(dir_to_idspec, dir_names) +# print "ID\tName" +# for r in id_spec_list: +# print r[0], "\t", r[1] + +def read_gnlist(root_path, mode): + """Read installed genomes list. + + Args: + root_path: ngs.plot installation directory. + mode: row reading mode - "vector" or "hash", correspond to each record + read as a vector of hash table. + Returns: (header split vector, hash table of installed genomes, + vector of column widths) + """ + from collections import defaultdict + + gn_list = root_path + "/database/gn_list.txt" + try: + db_f = open(gn_list, "r") + except IOError: + print "Open {0} error: your ngs.plot database may be corrupted.".\ + format(gn_list) + sys.exit() + + default_list = root_path + "/database/default.tbl" + try: + default_f = open(default_list, "r") + except IOError: + print "Open {0} error: your ngs.plot database may be corrupted.".\ + format(default_list) + sys.exit() + default_l = [] + header = default_f.readline() # skip the header + for rec in default_f: + r_sp = rec.rstrip().split("\t") + default_l.append((r_sp[0], r_sp[2])) # tuple of genome and region, like ("dm3", "genebody") + genome_region_dict = defaultdict(list) + for genome,region in default_l: + genome_region_dict[genome].append(region) + + header = db_f.readline() + h_sp = header.rstrip().split("\t") + h_sp.append("InstalledFeatures") + v_cw = map(len, h_sp) # column widths initialize to header widths. + + g_tbl = {} + for rec in db_f: + r_sp = rec.rstrip().split("\t") + r_sp.append(",".join(genome_region_dict[r_sp[0]])) + r_cw = map(len, r_sp) + v_cw = map(max, v_cw, r_cw) + + r_tbl = dict(zip(h_sp, r_sp)) + if(mode == "vector"): + g_tbl[r_tbl["ID"]] = r_sp + elif(mode == "hash"): + g_tbl[r_tbl["ID"]] = r_tbl + else: + pass + + db_f.close() + + return (h_sp, g_tbl, v_cw) + + +def update_gnlist(root_path, g_tbl, gn, assembly, species, ens_v, np_v): + """Update/Add genomes in ngs.plot database. + + Args: + root_path: ngs.plot installation directory. + g_tbl: genome hash table. Each value is also a hash table of + genome info. + gn: genome name. + assembly: genome assembly name. + species: species name. + ens_v: new Ensembl version. + np_v: new ngs.plot version. 
+ Returns: None + """ + + if gn in g_tbl: + g_tbl[gn]["EnsVer"] = ens_v + g_tbl[gn]["NPVer"] = np_v + else: + g_tbl[gn] = {"ID": gn, "Assembly": assembly, "Species": species, \ + "EnsVer": ens_v, "NPVer": np_v} + + write_gnlist(root_path, g_tbl) + + +def write_gnlist(root_path, g_tbl): + """Write genome hash table to database file: gn_list.txt. + + Args: + root_path: ngs.plot installation directory. + g_tbl: genome hash table. Each value is also a hash table of + genome info. + Returns: None + """ + + output_order = ["ID", "Assembly", "Species", "EnsVer", "NPVer"] + gn_list = root_path + "/database/gn_list.txt" + + try: + db_f = open(gn_list, "w") + except IOError: + print "Open {0} error: your ngs.plot database may be corrupted.".\ + format(gn_list) + sys.exit() + + db_f.write("\t".join(output_order) + "\n") + + for k in g_tbl.viewkeys(): + rec = [] + for col in output_order: + rec.append(str(g_tbl[k][col])) + db_f.write("\t".join(rec) + "\n") + + db_f.close() + + +def listgn(root_path, args): + """List installed genomes in local database on screen. + + Args: + root_path: ngs.plot installation directory. + args: currently not used. + Returns: None. + """ + + import math + + (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "vector") + + # Format column widths to beautify output. + tab_u = 4 + v_cw = map(lambda x: int(math.ceil(float(x) / tab_u) * tab_u + 1), v_cw) + + # List genomes to screen. + print "".join(map(lambda x, y: x.ljust(y), h_sp, v_cw)) # header. + for k in sorted(g_tbl.viewkeys(), key=str.lower): + print "".join(map(lambda x, y: x.ljust(y), g_tbl[k], v_cw)) + +def isExAnno(pkg_files): + """If the package is an extension package based on cell lines for ngs.plot, + like enhancer or dhs. + + Args: + pkg_files: list of files in the package. + Returns: + isEx: Bool, if the package is an extension package. + feature: string, feature name contained in the extension package. None + if not an extension package. + RDcount: number of RData tables in the package. + """ + + import os.path + + exclusive_files = [".chrnames.refseq", ".chrnames.ensembl", ".metainfo"] + isEx = False + RDcount = 0 + feature = None + for file_name in pkg_files: + if os.path.basename(file_name) in exclusive_files: + continue + if (not file_name.endswith("RData")) and (file_name.count("/")) > 0: + isEx = True + feature = file_name.split("/")[1] + elif file_name.endswith("RData"): + RDcount += 1 + return (isEx, feature, RDcount) + +def install(root_path, args): + """Interactive session for installing genome from package file. + + Args: + root_path: ngs.plot installation directory. + args.yes: say yes to all questions. + args.pkg: path of package file. + Returns: + None. + """ + + import tarfile + import sys + + yestoall = args.yes + pkg_file = args.pkg + # Package name: e.g. ngsplotdb_mm10_71_1.0.tar.gz. + # Information about the genome is in mm10/.metainfo. + try: + pkg_f = tarfile.open(pkg_file, "r:gz") + except tarfile.ReadError: + print "Read package file {0} error.".format(pkg_file), + print "The downloaded file may be corrupted." + sys.exit() + + print "Extracting information from package...", + sys.stdout.flush() + pkg_files = pkg_f.getnames() + g_folder = pkg_files[0] + # Minus folder name and .metainfo file name. + (isEx, feature, RDcount) = isExAnno(pkg_files) + if isEx: + print feature + " extension package, ", + print "contains " + str(RDcount + 2) + " tables." + else: + print "contains " + str(RDcount + 2) + " tables." 
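+    # Illustrative package layouts (paths are hypothetical, not read from a
+    # real archive): a basic package holds RData tables directly under the
+    # genome folder, e.g.
+    #   mm10/.metainfo, mm10/.chrnames.ensembl,
+    #   mm10/mm10.ensembl.genebody.protein_coding.RData, ...
+    # while an extension package nests them one level deeper, e.g.
+    #   hg19/enhancer, hg19/enhancer/hg19.enhancer.H1hesc.RData, ...
+    # isExAnno() above flags the latter because a non-RData entry sits below
+    # the genome folder, and takes that directory name as the feature.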
+ # .metainfo file: + # "ID"mm10 + # "Assembly"GRCm38 + # "Species"mus_musculus + # "EnsVer"71 + # "NPVer"1.0 + meta_f = pkg_f.extractfile(g_folder + "/.metainfo") + meta_tbl = {} + if meta_f: + for rec in meta_f: + rec_sp = rec.strip().split("\t") + meta_tbl[rec_sp[0]] = rec_sp[1] # format: VariableValue + else: + print "No meta information found in the package file.", + print "The downloaded file may be corrupted." + sys.exit() + meta_f.close() + pkg_f.close() + + gn_inst = meta_tbl["ID"] + assembly = meta_tbl["Assembly"] + species = meta_tbl["Species"] + ens_ver = float(meta_tbl["EnsVer"]) + np_ver = float(meta_tbl["NPVer"]) + + # Database file. + (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "hash") + + if gn_inst in g_tbl: + installed_ens = float(g_tbl[gn_inst]["EnsVer"]) + installed_np = float(g_tbl[gn_inst]["NPVer"]) + + # For extension package. + # Only the same version with basic package could be installed. + if isEx: + if ens_ver == installed_ens and np_ver == installed_np: + print "Will upgrade the genome {0} with {1} annotation:".format(gn_inst, \ + feature), + print "Ensembl: {0}; ngs.plot: {1}.".\ + format(installed_ens, np_ver) + if yestoall: + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N: ") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + return + else: + print "This is an extension package of " + feature + \ + ", ENS version " + str(ens_ver) + ", NPVer" + str(np_ver) + \ + ", please install the same version basic genome annotation first!" + return + + # For basic package. + # Install a newer version. + if ens_ver > installed_ens or \ + ens_ver == installed_ens and np_ver > installed_np: + print "Will upgrade the genome {0}:".format(gn_inst) + print "Ensembl: {0}==>{1}; ngs.plot: {2}==>{3}.".\ + format(installed_ens, ens_ver, installed_np, np_ver), + if yestoall: + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N: ") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + # Install an older version. + else: + print "Will install the same or older version", + print "of the genome {0}:".format(gn_inst) + print "Ensembl: {0}==>{1}; ngs.plot: {2}==>{3}.".\ + format(installed_ens, ens_ver, installed_np, np_ver), + if yestoall: + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N: ") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + # Totally new installation, only basic package could be installed. 
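+    # Worked example of the version checks above, using the sample .metainfo
+    # values (Ensembl 71, ngs.plot 1.0): with that genome already installed,
+    # a package at Ensembl 72, or at 71 with a higher NPVer, takes the
+    # upgrade branch; an equal or older package takes the re-install branch.
+    # Either way the user is prompted y/n unless -y/--yes was given.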
+ else: + print "Will install new genome {0}:".format(gn_inst), + print "Ensembl=> v{0}; ngs.plot=> v{1}.".format(ens_ver, np_ver), + if yestoall: + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + else: + ans = raw_input("Continue?(y/n): ") + + while True: + if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"): + break + else: + ans = raw_input("Answer must be y/Y or n/N:") + if(ans == "y" or ans == "Y"): + install_pkg(root_path, pkg_file, gn_inst) + update_gnlist(root_path, g_tbl, gn_inst, assembly, \ + species, ens_ver, np_ver) + + +def install_pkg(root_path, pkg_file, new_gn, old_gn=None): + """Install genome from package file. + + Args: + root_path: ngs.plot installation directory. + pkg_file: package file name. + old_gn: old genome name to be removed first. + Returns: + None. + """ + + import shutil + import tarfile + import sys + + if old_gn: + print "Removing installed genome:{0}...".format(old_gn), + sys.stdout.flush() + shutil.rmtree(root_path + '/database/' + old_gn) + rm_dbtbl(root_path, old_gn) + print "Done" + + print "Installing new genome...", + sys.stdout.flush() + try: + tar_f = tarfile.open(pkg_file, "r:gz") + except tarfile.ReadError: + print "Read package file {0} error: ".format(pkg_file), + print "downloaded file may be corrupted." + sys.exit() + + try: + tar_f.extractall(root_path + "/database") + except tarfile.ExtractError: + print "Extract files from package error.", + print "The downloaded file may be corrupted." + + add_dbtbl(root_path, new_gn) + + print "Done" + + +def add_dbtbl(root_path, gn): + """Add a new genome into database meta tables. + + Args: + root_path: ngs.plot installation directory. + gn: genome name to add. + Returns: None. + """ + + import os.path + import os + import glob + import tempfile + import subprocess + + (dblist_f, dblist_n) = tempfile.mkstemp(text=True) + (gnlist_f, gnlist_n) = tempfile.mkstemp(text=True) + + # Obtain a list of .RData file names to extract info from. + all_rdata = root_path + '/database/{0}/{1}*.RData'.format(gn, gn) + all_set = set(map(os.path.basename, glob.glob(all_rdata))) + all_rdata = root_path + '/database/{0}/*/{1}*.RData'.format(gn, gn) + all_set = all_set | set(map(os.path.basename, glob.glob(all_rdata))) + exm_rdata = root_path + '/database/{0}/{1}*exonmodel*.RData'.format(gn, gn) + exm_set = set(map(os.path.basename, glob.glob(exm_rdata))) + nex_list = sorted(list(all_set - exm_set)) + + # DB file list. + gn_ens_list = {} # Ensembl genome name unique list. + desired_ncols = 6 + for fname in nex_list: + tokens = fname.split('.') + tokens = tokens[:6] + if tokens[-1] == 'RData': + tokens[-1] = 'NA' + tokens.extend(['NA'] * (desired_ncols - len(tokens))) + os.write(dblist_f, fname + "\t" + "\t".join(tokens) + "\n") + if tokens[1] == 'ensembl': + gn_ens_list[".".join(tokens[:3])] = 1 + os.close(dblist_f) + + # Ensembl genome name list. + for gn in sorted(gn_ens_list.viewkeys()): + os.write(gnlist_f, gn.replace(".", "\t") + "\n") + os.close(gnlist_f) + + # Call external program to create table default file. + (default_f, default_n) = tempfile.mkstemp(text=True) + subprocess.call(["setTableDefaults.py", gnlist_n, default_n]) + os.remove(gnlist_n) + + # Call external program to calcualte DB score and install the new entries + # into ngs.plot database. 
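+    # install.db.tables.r assigns each table a dbScore of
+    #   3 - length(intersect(<default FI values>, <table FI values>))
+    # so a table whose FI columns match the genome/region defaults scores 0
+    # and is preferred when a database file is selected later. Illustrative
+    # case for a genebody table (defaults chipseq/NA/protein_coding):
+    #   FI = chipseq/NA/protein_coding -> score 0
+    #   FI = rnaseq/NA/protein_coding  -> score 1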
+ subprocess.call(["install.db.tables.r", default_n, dblist_n]) + os.remove(default_n) + os.remove(dblist_n) + + +def remove(root_path, args): + """Remove genome from ngs.plot database. + + Args: + root_path: ngs.plot installation directory. + args.yes: say yes to all questions. + args.gn: genome name. + Returns: + None. + """ + + import shutil + import sys + + yestoall = args.yes + sub_ftr = args.ftr + + (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "hash") + + gn = args.gn + + if gn in g_tbl and sub_ftr is None: + print "Will remove genome {0} from database.".format(gn), + do_rm = False + if yestoall: + do_rm = True + else: + ans = raw_input("Continue?(y/n): ") + while True: + if ans == 'y' or ans == 'Y' or ans == 'n' or ans == 'N': + break + else: + ans = raw_input("The answer must be y/Y or n/N: ") + if ans == 'y' or ans == 'Y': + do_rm = True + if do_rm: + folder_to_rm = root_path + "/database/" + gn + print "Removing genome...", + sys.stdout.flush() + shutil.rmtree(folder_to_rm) + del g_tbl[gn] + write_gnlist(root_path, g_tbl) + rm_dbtbl(root_path, gn) + print "Done" + elif gn in g_tbl and sub_ftr is not None: + print "Will remove genomic feature {0} of genome {1} from database."\ + .format(sub_ftr, gn) + do_rm = False + if yestoall: + do_rm = True + else: + ans = raw_input("Continue?(y/n): ") + while True: + if ans == 'y' or ans == 'Y' or ans == 'n' or ans == 'N': + break + else: + ans = raw_input("The answer must be y/Y or n/N: ") + if ans == 'y' or ans == 'Y': + do_rm = True + if do_rm: + folder_to_rm = root_path + "/database/" + gn + "/" + sub_ftr + print "Removing genome...", + sys.stdout.flush() + shutil.rmtree(folder_to_rm) + # g_tbl does't need to be updated + rm_dbtbl(root_path, gn, sub_ftr) + print "Done" + else: + print "Cannot find the genome in database. Nothing was done." + + +def rm_dbtbl(root_path, gn, sub_ftr=None): + """Remove a genome from database meta tables. + + Args: + root_path: ngs.plot installation directory. + gn: genome name to remove. + Returns: None. + """ + + import subprocess + + if sub_ftr is None: + subprocess.call(["remove.db.tables.r", gn]) + else: + subprocess.call(["remove.db.tables.r", gn, sub_ftr]) + +def chrnames(root_path, args): + """List chromosome names for a given genome. + + Args: + root_path: ngs.plot installation directory. + args.gn: genome name to remove. + args.db: database(ensembl or refseq). + Returns: None. + """ + + import sys + + db = args.db + gn = args.gn + + if db != "ensembl" and db != "refseq": + print "Unrecognized database name: database must be ensembl or refseq." + sys.exit() + + chrnames_file = root_path + "/database/" + gn + "/.chrnames." + db + + try: + chrf = open(chrnames_file) + except IOError: + print "Open file: {0} error.".format(chrnames_file), + print "Your database may be corrupted or you have an older version." + sys.exit() + + chr_list = chrf.read(1000000) # set 1MB limit to avoid nasty input. + chrf.close() + + chr_list = chr_list.strip() + print chr_list + + + +############################################### +######## Main procedure. ###################### +if __name__ == "__main__": + + import argparse + import os + import sys + + try: + root_path = os.environ["NGSPLOT"] + except KeyError: + print "Environmental variable NGSPLOT is not set! Exit.\n" + sys.exit() + + # Main parser. 
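+    # Typical invocations handled by the parsers defined below (package and
+    # genome names are examples only):
+    #   ngsplotdb.py list
+    #   ngsplotdb.py install ngsplotdb_mm10_71_1.0.tar.gz
+    #   ngsplotdb.py -y remove mm10
+    #   ngsplotdb.py remove hg19 --ftr enhancer
+    #   ngsplotdb.py chrnames mm9 ensembl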
+ parser = argparse.ArgumentParser(description="Manipulate ngs.plot's \ + annotation database", + prog="ngsplotdb.py") + parser.add_argument("-y", "--yes", help="Say yes to all prompted questions", + action="store_true") + + subparsers = parser.add_subparsers(title="Subcommands", + help="additional help") + + # ngsplotdb.py list parser. + parser_list = subparsers.add_parser("list", help="List genomes installed \ + in database") + parser_list.set_defaults(func=listgn) + + # ngsplotdb.py install parser. + parser_install = subparsers.add_parser("install", + help="Install genome from package \ + file") + parser_install.add_argument("pkg", help="Package file(.tar.gz) to install", + type=str) + parser_install.set_defaults(func=install) + + # ngsplotdb.py remove parser. + parser_remove = subparsers.add_parser("remove", + help="Remove genome from database") + parser_remove.add_argument("gn", help="Name of genome to be \ + removed(e.g. hg18)", type=str) + parser_remove.add_argument("--ftr", help="Remove cellline specific features \ + (enhancer, dhs, etc)", type=str,\ + default=None) + parser_remove.set_defaults(func=remove) + + # ngsplotdb.py chromosome names parser. + parser_chrnames = subparsers.add_parser("chrnames", + help="List chromosome names") + parser_chrnames.add_argument("gn", help="Genome name(e.g. mm9)", type=str) + parser_chrnames.add_argument("db", help="Database(ensembl or refseq)", + type=str) + parser_chrnames.set_defaults(func=chrnames) + + + args = parser.parse_args() + args.func(root_path, args) + diff -r 000000000000 -r 3ca58369469c bin/remove.db.tables.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/remove.db.tables.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,46 @@ +#!/usr/bin/env Rscript +# Remove the genome entries from two ngs.plot system files: +# default.tbl and dbfile.tbl. + +args <- commandArgs(T) +if(length(args)>1){ + gname.to.rm <- args[1] + feature.to.rm <- args[2] +}else{ + gname.to.rm <- args[1] +} + +# Save to text output. +progpath <- Sys.getenv('NGSPLOT') +if(progpath == "") { + stop("Set environment variable NGSPLOT before run the program. 
See README for details.\n") +} +default.file <- file.path(progpath, 'database', 'default.tbl') +dbfile.file <- file.path(progpath, 'database', 'dbfile.tbl') +default.tbl <- read.delim(default.file) +dbfile.tbl <- read.delim(dbfile.file) + +if(exists("feature.to.rm")){ + default.tbl <- subset(default.tbl, !(Genome==gname.to.rm & Region==feature.to.rm)) + dbfile.tbl <- subset(dbfile.tbl, !(Genome==gname.to.rm & Region==feature.to.rm)) +}else{ + default.tbl <- default.tbl[default.tbl$Genome != gname.to.rm, ] + dbfile.tbl <- dbfile.tbl[dbfile.tbl$Genome != gname.to.rm, ] +} + + +write.table(default.tbl, file=default.file, row.names=F, sep="\t", quote=F) +write.table(dbfile.tbl, file=dbfile.file, row.names=F, sep="\t", quote=F) + + +# getScore <- function(i, db.fi, default.fi){ +# score <- 3 - length(intersect(as.matrix(default.fi[db.fi[i, "tag"], ]), as.matrix(db.fi[i, ]))) +# return(score) +# } + +# for (i in (1:dim(db.fi)[1])){ +# anno.db.tbl[i, "dbScore"] <- getScore(i, db.fi, default.fi) +# } + +# anno.version="0.01" +# save(anno.tbl, anno.db.tbl, anno.version, file="database.RData") diff -r 000000000000 -r 3ca58369469c bin/setTableDefaults.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/setTableDefaults.py Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +import sys +import string + +flankDict = { + "tss":2000, + "tes":2000, + "genebody":2000, + "exon":500, + "cgi":500, + "enhancer":1500, + "dhs":1000, +} + +pointLabDict = { + "tss":"TSS", + "tes":"TES", + "genebody":"TSS,TES", + "exon":"Acceptor,Donor", + "cgi":"Left,Right", + "enhancer":"Enhancer", + "dhs":"Left,Right", +} + +# FItype = { +# "tss":"endsite", +# "tes":"endsite", +# "genebody":"datatype", +# "exon":"subregion", +# "cgi":"subregion", +# "enhancer":"subregion", +# "dhs":"subregion" +# } + +in_f = file(sys.argv[1]) +out_f = file(sys.argv[2], "w") +out_f.write("\t".join(["Genome", "DefaultDB", "Region", "DefaultFI1", \ + "DefaultFI2", "DefaultFI3", "PointLab", "Flank"]) + \ + "\n") + +# while(True): +for line in in_f: + # line = in_f.readline() + # if len(line) == 0: + # break; + # line = string.strip(line) + + lineL = line.rstrip().split("\t") + genome = lineL[0] + defaultDB = lineL[1] + region = lineL[2] + + if region == "cgi": + fi_1 = "NA" + fi_2 = "ProximalPromoter" + fi_3 = "protein_coding" + elif region == "dhs": + fi_1 = "H1hesc" + fi_2 = "ProximalPromoter" + fi_3 = "protein_coding" + elif region == "exon": + fi_1 = "chipseq" + fi_2 = "canonical" + fi_3 = "protein_coding" + elif region == "genebody": + fi_1 = "chipseq" + fi_2 = "NA" + fi_3 = "protein_coding" + elif (region == "enhancer") and (genome == "hg19"): + fi_1 = "H1hesc" + fi_2 = "genebody" + fi_3 = "protein_coding" + elif (region == "enhancer") and (genome == "mm9"): + fi_1 = "mESC" + fi_2 = "genebody" + fi_3 = "protein_coding" + out_f.write("\t".join([genome, defaultDB, region, fi_1, fi_2, fi_3, \ + pointLabDict[region], str(flankDict[region])]) + "\n") + + # Extra: TSS and TES if region = genebody. 
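+    # For example, an input line "mm10<TAB>ensembl<TAB>genebody" yields three
+    # tab-separated output rows (genome/DB names are illustrative):
+    #   mm10  ensembl  genebody  chipseq  NA  protein_coding  TSS,TES  2000
+    #   mm10  ensembl  tss       chipseq  NA  protein_coding  TSS      2000
+    #   mm10  ensembl  tes       chipseq  NA  protein_coding  TES      2000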
+ if region == "genebody": + region = "tss" + out_f.write("\t".join([genome, defaultDB, region, fi_1, fi_2, fi_3, \ + pointLabDict[region], str(flankDict[region])]) + "\n") + region = "tes" + out_f.write("\t".join([genome, defaultDB, region, fi_1, fi_2, fi_3, \ + pointLabDict[region], str(flankDict[region])]) + "\n") + +in_f.close() +out_f.close() diff -r 000000000000 -r 3ca58369469c database/bootstrap/dbfile.tbl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/database/bootstrap/dbfile.tbl Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,1 @@ +db.file Genome DB Region FI.1 FI.2 FI.3 URL dbScore diff -r 000000000000 -r 3ca58369469c database/bootstrap/default.tbl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/database/bootstrap/default.tbl Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,1 @@ +Genome DefaultDB Region DefaultFI1 DefaultFI2 DefaultFI3 PointLab Flank diff -r 000000000000 -r 3ca58369469c database/bootstrap/gn_list.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/database/bootstrap/gn_list.txt Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,1 @@ +ID Assembly Species EnsVer NPVer diff -r 000000000000 -r 3ca58369469c database/dbfile.tbl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/database/dbfile.tbl Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,1 @@ +db.file Genome DB Region FI.1 FI.2 FI.3 URL dbScore diff -r 000000000000 -r 3ca58369469c database/default.tbl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/database/default.tbl Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,1 @@ +Genome DefaultDB Region DefaultFI1 DefaultFI2 DefaultFI3 PointLab Flank diff -r 000000000000 -r 3ca58369469c database/gn_list.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/database/gn_list.txt Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,1 @@ +ID Assembly Species EnsVer NPVer diff -r 000000000000 -r 3ca58369469c lib/coverage.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/coverage.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,701 @@ +# Function to check if the range exceeds coverage vector boundaries. +checkBound <- function(start, end, range, chrlen){ + if(end + range > chrlen || start - range < 1) + return(FALSE) # out of boundary. + else + return(TRUE) +} + +Bin <- function(v, n) { +# Function to bin a coverage vector. +# Args: +# v: coverage vector. +# n: number of bins. +# Return: vector of binned values. + + bin.bound <- seq(1, length(v), length.out=n + 1) + sapply(1:n, function(i) { + mean(v[bin.bound[i]:bin.bound[i + 1]]) + }) +} + +SplineRev3Sec <- function(cov.list, v.fls, pts.list, v.strand, algo="spline") { +# For 3 sections of continuous coverage, spline each according to specified +# data points and return concatenated, interpolated curves. +# Args: +# cov.list: a list of coverage vectors. Each vector represents a gene. +# v.fls: vector of flanking region size. +# pts.list: a list of three integers for data points. +# v.strand: factor vector of strands. +# algo: algorithm used to normalize coverage vectors (spline, bin). +# Return: matrix of interpolated coverage: each row represents a gene; each +# column represents a data point. Coverage from exons are concatenated. +# Notes: the names of cov.list and pts.list must be the same for index purpose. +# Suggested: use 'l', 'm', and 'r' for the names. + + # Create an empty coveage matrix first. 
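+    # The two points at the section boundaries are fused below (f1, f2), so
+    # the matrix has two fewer columns than the raw sum of data points,
+    # e.g. pts.list = list(l=100, m=300, r=100) gives 498 columns.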
+ tot.pts <- sum(unlist(pts.list)) - 2 + cov.mat <- matrix(nrow=length(cov.list), ncol=tot.pts) + + for(i in 1:length(cov.list)) { + left.cov <- head(cov.list[[i]], n=v.fls[i]) + right.cov <- tail(cov.list[[i]], n=v.fls[i]) + mid.cov <- window(cov.list[[i]], start=v.fls[i] + 1, + width=length(cov.list[[i]]) - 2*v.fls[i]) + if(algo == "spline") { + left.cov <- spline(1:length(left.cov), left.cov, n=pts.list$l)$y + right.cov <- spline(1:length(right.cov), right.cov, n=pts.list$r)$y + mid.cov <- spline(1:length(mid.cov), mid.cov, n=pts.list$m)$y + } else if(algo == "bin") { + left.cov <- Bin(left.cov, n=pts.list$l) + right.cov <- Bin(right.cov, n=pts.list$r) + mid.cov <- Bin(mid.cov, n=pts.list$m) + } else { + # pass. + } + # Fuse the two points at the boundary and concatenate. + f1 <- (tail(left.cov, n=1) + head(mid.cov, n=1)) / 2 + f2 <- (tail(mid.cov, n=1) + head(right.cov, n=1)) / 2 + con.cov <- c(left.cov[-length(left.cov)], f1, + mid.cov[-c(1, length(mid.cov))], + f2, right.cov[-1]) + # browser() + if(v.strand[i] == '+') { + cov.mat[i, ] <- con.cov + } else { + cov.mat[i, ] <- rev(con.cov) + } + } + + cov.mat +} + +extrCov3Sec <- function(v.chrom, v.start, v.end, v.fls, v.strand, m.pts, f.pts, + bufsize, algo, ...) { +# Extract and interpolate coverage vectors from genomic regions with 3 sections. +# Args: +# v.chrom: factor vector of chromosome names. +# v.start: integer vector of region start. +# v.end: integer vector of region end. +# v.fls: integer vector of flanking region size in bps. +# v.strand: factor vector of gene strands. +# m.pts: data points for middle interval. +# f.pts: data points for flanking region. +# bufsize: integer; buffers are added to both ends of each region. +# algo: algorithm used to normalize coverage vectors. +# Return: matrix of interpolated coverage: each row represents a gene; each +# column represents a data point. + + interflank.gr <- GRanges(seqnames=v.chrom, ranges=IRanges( + start=v.start - v.fls - bufsize, + end=v.end + v.fls + bufsize)) + interflank.cov <- covBamExons(interflank.gr, v.strand, ...) + + # Trim buffers from coverage vectors. + interflank.cov <- lapply(interflank.cov, function(v) { + window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) + }) + + # Interpolate and reverse coverage vectors. + SplineRev3Sec(interflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts), + v.strand, algo) +} + +sub3CovList <- function(all.cov, v.left, v.right) { +# For a list of coverage vectors, separate them into three lists. +# Args: +# all.cov: list of coverage vectors. +# v.left: integer vector of left flanking size. +# v.right: integer vector of right flanking size. +# Return: list of lists of coverage vectors. The outer list is named as: +# "l", "m", "r". + + left.cov <- foreach(i=icount(length(all.cov))) %do% { + head(all.cov[[i]], n=v.left[i]) + } + mid.cov <- foreach(i=icount(length(all.cov))) %do% { + len <- length(all.cov[[i]]) + all.cov[[i]][ (v.left[i] + 1) : (len - v.right[i]) ] + } + right.cov <- foreach(i=icount(length(all.cov))) %do% { + tail(all.cov[[i]], n=v.right[i]) + } + list(l=left.cov, m=mid.cov, r=right.cov) +} + +extrCovExons <- function(v.chrom, exonranges.list, v.fls, v.strand, + m.pts, f.pts, bufsize, algo, ...) { +# Extract coverage vectors for transcripts with exon models. +# Args: +# v.chrom: factor vector of chromosome names. +# exonranges.list: list of IRanges objects for exon coordinates. +# v.fls: integer vector of flanking region size in bps. +# v.strand: factor vector of gene strands. 
+# m.pts: data points for middle interval. +# f.pts: data points for flanking region. +# bufsize: integer; buffers are added to both ends of each region. +# algo: algorithm used to normalize coverage vectors. +# Return: matrix of interpolated coverage: each row represents a gene; each +# column represents a data point. Coverage from exons are concatenated. + + # Construct ranges including exon and flanking regions. + exonflank.list <- vector('list', length=length(exonranges.list)) + for(i in 1:length(exonranges.list)) { + r.mod <- exonranges.list[[i]] # IRanges object to be modified. + n <- length(r.mod) + # Add flanking and buffer regions. + start(r.mod)[1] <- start(r.mod)[1] - v.fls[i] - bufsize + end(r.mod)[n] <- end(r.mod)[n] + v.fls[i] + bufsize + exonflank.list[[i]] <- GRanges(seqnames=v.chrom[i], ranges=r.mod) + } + + exonflank.cov <- covBamExons(exonflank.list, v.strand, ...) + + # Trim buffers from coverage vectors. + exonflank.cov <- lapply(exonflank.cov, function(v) { + window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) + }) + + SplineRev3Sec(exonflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts), + v.strand, algo) +} + + +extrCovMidp <- function(v.chrom, v.midp, flanksize, v.strand, pts, bufsize, + algo, ...) { +# Extract coverage vectors with a middle point and symmetric flanking regions. +# Args: +# v.chrom: factor vector of chromosome names. +# v.midp: integer vector of middle points. +# flanksize: integer of flanking region size in bps. +# v.strand: factor vector of gene strands. +# pts: data points to spline into. +# bufsize: integer; buffers are added to both ends of each region. +# algo: algorithm used to normalize coverage vectors. +# Return: matrix of interpolated coverage: each row represents a gene; each +# column represents a data point. + + granges <- GRanges(seqnames=v.chrom, ranges=IRanges( + start=v.midp - flanksize - bufsize, + end=v.midp + flanksize + bufsize)) + cov.list <- covBamExons(granges, v.strand, ...) + + # Trim buffers from coverage vectors. + cov.list <- lapply(cov.list, function(v) { + window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) + }) + + # Interpolate and reverse coverage vectors and assemble into a matrix. + cov.mat <- matrix(nrow=length(cov.list), ncol=pts) + for(i in 1:length(cov.list)) { + if(is.null(cov.list[[i]])) { + cov.mat[i, ] <- vector('integer', length=pts) + } else { + if(algo == "spline") { + cov.mat[i, ] <- spline(1:length(cov.list[[i]]), cov.list[[i]], + n=pts)$y + } else if(algo == "bin") { + cov.mat[i, ] <- Bin(cov.list[[i]], n=pts) + } else { + # pass. + } + if(v.strand[i] == '-') { + cov.mat[i, ] <- rev(cov.mat[i, ]) + } + } + } + cov.mat +} + +scanBamRevOrder <- function(org.gr, sbp) { +# ScanBamParam re-arranges the input genomic ranges. Use range info to +# construct a string vector to find the order to reverse it. + org.grnames <- with(org.gr, paste(seqnames, start, end, sep=':')) + sbw.gr <- as.data.frame(bamWhich(sbp)) # scan-bam-ed + if('space' %in% names(sbw.gr)) { + sbw.grnames <- with(sbw.gr, paste(space, start, end, sep=':')) + } else if('group_name' %in% names(sbw.gr)) { + sbw.grnames <- with(sbw.gr, paste(group_name, start, end, sep=':')) + } else { + stop("Cannot locate chromosome names in extracted short reads. Report +this problem using issue tracking or discussion forum.\n") + } + + match(org.grnames, sbw.grnames) +} + +genZeroList <- function(llen, v.vlen) { +# Generate a list of specific length with each element being an Rle vector of +# zeros with specific length. 
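+# For illustration, genZeroList(2, c(500, 300)) returns two Rle vectors of
+# 500 and 300 zeros; covBamExons() below falls back on such a list when no
+# reads can be retrieved for the requested ranges.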
+# Args: +# llen: list length +# v.vlen: vector of vector lengths. + + llen <- as.integer(llen) + stopifnot(llen > 0) + res <- vector('list', length=llen) + for(i in 1:llen) { + res[[i]] <- Rle(0, v.vlen[i]) + } + res +} + +covBamExons <- function(granges.dat, v.strand, bam.file, sn.inbam, fraglen, + map.qual=20, bowtie=F, + strand.spec=c('both', 'same', 'opposite')) { +# Extract coverage vectors from bam file for a list of transcripts of multiple +# exons. +# Args: +# granges.dat: a GRanges object representing a set of genomic ranges or a +# list of GRanges objects each representing a set of exonic +# ranges. +# v.strand: vector of strand info. +# bam.file: character string refers to the path of a bam file. +# sn.inbam: vector of chromosome names in the bam file. +# fraglen: fragment length. +# map.qual: mapping quality to filter reads. +# bowtie: boolean to indicate whether the aligner was Bowtie-like or not. +# strand.spec: string desc. for strand-specific coverage calculation. +# Return: list of coverage vectors, each vector represents a transcript. + + strand.spec <- match.arg(strand.spec) + + if(class(granges.dat) == 'list') { + # Construct a GRanges object representing DNA sequences. + v.seqnames <- sapply(granges.dat, + function(x) as.character(seqnames(x)[1])) + v.start <- sapply(granges.dat, function(x) start(ranges(x))[1]) + v.end <- sapply(granges.dat, function(x) tail(end(ranges(x)), n=1)) + granges.dna <- GRanges(seqnames=v.seqnames, + ranges=IRanges(start=v.start, end=v.end)) + # Obtain mRNA(including flanking) length for each gene. + repr.lens <- sapply(granges.dat, function(g) { + sum(end(g) - start(g) + 1) + }) + } else { + v.seqnames <- as.character(seqnames(granges.dat)) + v.start <- start(granges.dat) + v.end <- end(granges.dat) + granges.dna <- granges.dat + repr.lens <- v.end - v.start + 1 + granges.dat <- vector('list', length(granges.dna)) # set null tags. + } + + # Filter transcripts whose chromosomes do not match bam file. + inbam.mask <- as.character(seqnames(granges.dna)) %in% sn.inbam + if(!any(inbam.mask)) { # none matches. + return(genZeroList(length(granges.dna), repr.lens)) + } + + # scanBamWhat: the info that need to be extracted from a bam file. + sbw <- c('pos', 'qwidth', 'mapq', 'strand', 'rname', + 'mrnm', 'mpos', 'isize') + sbp <- ScanBamParam(what=sbw, which=granges.dna[inbam.mask], + flag=scanBamFlag(isUnmappedQuery=F, + isNotPassingQualityControls=F, + isDuplicate=F)) + + # Scan bam file to retrieve short reads. + sr.in.ranges <- tryCatch(scanBam(bam.file, param=sbp), error=function(e) e) + if(class(sr.in.ranges)[1] == 'simpleError') { + # This is not supposed to happen after those unmatched seqnames are + # removed. I keep it for safty. + return(genZeroList(length(granges.dna), repr.lens)) + } + + # Restore the original order. + sr.in.ranges <- sr.in.ranges[scanBamRevOrder( + as.data.frame(granges.dna[inbam.mask]), sbp)] + + CalcReadsCov <- function(srg, start, end, gr.rna, repr.len, strand) { + # Calculate short read coverage for each gene/region. + # Args: + # srg: extracted short reads in gene. + # start: start position of the DNA sequence. + # end: end position of the DNA sequence. + # gr.rna: GRanges object (multiple ranges) representing exon sequences. + # This can be NULL indicating the input ranges are DNAs. + # repr.len: DNA or mRNA sequence length. + # strand: transcript strand (+/-). + # Returns: a coverage vector for the gene. + + # browser() + # Special handling for bowtie mapping. 
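+        # Bowtie-like aligners report 255 ("unavailable") as the top mapping
+        # quality, which shows up here as NA; resetting NA to 254 keeps those
+        # reads past the map.qual filter below.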
+ if(bowtie) { + srg <- within(srg, mapq[is.na(mapq)] <- 254) # within! + } + # Filter short reads by mapping quality. + all.mask <- srg$mapq >= map.qual + + # Subset by strand info. + if(strand.spec != 'both') { + if(strand.spec == 'same') { + s.mask <- srg$strand == as.character(strand) + } else { + s.mask <- srg$strand != as.character(strand) + } + all.mask <- all.mask & s.mask + } + + # If paired, filter reads that are not properly paired. + paired <- all(with(srg, is.na(isize) | isize != 0)) + if(paired) { + p.mask <- with(srg, rname == mrnm & xor(strand == '+', isize < 0)) + all.mask <- all.mask & p.mask + } + + # Apply all the filters on short reads. + srg <- lapply(srg, `[`, which(all.mask)) + + # Calculate coverage. + if(length(srg[[1]]) > 0) { + if(paired) { + cov.pos <- with(srg, ifelse(isize < 0, mpos, pos)) + cov.wd <- abs(srg$isize) + } else { + # Adjust negative read positions for physical coverage. + cov.pos <- with(srg, ifelse(strand == '-', + pos - fraglen + qwidth, pos)) + cov.wd <- fraglen + } + # Shift reads by subtracting start positions. + cov.pos <- cov.pos - start + 1 + # Calculate physical coverage on the whole genebody. + covg <- coverage(IRanges(start=cov.pos, width=cov.wd), + width=end - start + 1, method='sort') + + if(!is.null(gr.rna)) { # RNA-seq. + # Shift exonic ranges by subtracting start positions. + # BE careful with negative start positions! Need to adjust end + # positions first(or the GRanges lib will emit errors if + # start > end). + # Negative start positions happen when flanking region exceeds + # the chromosomal start. + if(start > 0) { + start(gr.rna) <- start(gr.rna) - start + 1 + end(gr.rna) <- end(gr.rna) - start + 1 + } else { + end(gr.rna) <- end(gr.rna) - start + 1 + start(gr.rna) <- start(gr.rna) - start + 1 + } + # Concatenate all exon coverages. + covg[ranges(gr.rna)] + } else { # ChIP-seq. + covg + } + } else { + Rle(0, repr.len) + } + } + + covg.allgenes <- mapply(CalcReadsCov, srg=sr.in.ranges, + start=v.start, end=v.end, gr.rna=granges.dat, + repr.len=repr.lens, strand=v.strand, SIMPLIFY=F) +} + +bamFileList <- function(ctg.tbl) { +# Determine the bam files involved in the configuration and whether it is a +# bam file pair setup. +# Args: +# ctg.tbl: coverage-genelist-title table. + + cov.uniq <- unique(ctg.tbl$cov) + cov.list <- strsplit(cov.uniq, ':') + v.nbam <- sapply(cov.list, length) + v.bbp <- v.nbam == 2 + if(all(v.bbp)) { + bbp <- T + } else if(all(!v.bbp)) { + bbp <- F + } else { + stop("No mix of bam and bam-pair allowed in configuration.\n") + } + + list(bbp=bbp, bam.list=unique(unlist(cov.list))) +} + +estiMapqStyle <- function(bam.file){ +# Estimate the mapping quality style. Return TRUE if it is SAM standard. +# Sample 1000 mapped reads from bam file, and if the mapq of reads +# over half are NA, then return FALSE, because it is quite possible that +# the aligner using coding style as bowtie, 255 as highest score. +# Args: +# bam.file: bam file to be sampled. + + sbw <- c('pos', 'qwidth', 'mapq', 'strand') + sbp <- ScanBamParam(what=sbw, flag=scanBamFlag( + isUnmappedQuery=F, isDuplicate=F)) + samp <- BamSampler(bam.file, yieldSize=500) + samp.reads <- scanBam(samp, param=sbp)[[1]] + samp.len <- length(samp.reads[["mapq"]]) + mapq.255 <- sum(is.na(samp.reads[["mapq"]])) + if(mapq.255/samp.len >= 0.5){ + return(FALSE) + }else{ + return(TRUE) + } +} + +headerIndexBam <- function(bam.list) { +# Read bam header to determine mapping method. +# Index bam files if not done yet. 
+# Args: +# ctg.tbl: coverage-genelist-title table. + + v.map.bowtie <- vector('logical', length=length(bam.list)) + for(i in 1:length(bam.list)) { + bam.file <- bam.list[i] + + # Index bam file. + if(!file.exists(paste(bam.file, ".bai", sep=""))) { + indexBam(bam.file) + } + + # Derive mapping program. + header <- scanBamHeader(bam.file) + map.prog <- try(strsplit(header[[1]]$text$'@PG'[[1]], ':')[[1]][2], + silent=T) + if(class(map.prog) != "try-error") { + map.style <- grepl('tophat|bowtie|bedtools|star', map.prog, + ignore.case=T) + if(map.style){ + v.map.bowtie[i] <- TRUE + next + } + map.style <- grepl('bwa|casava|gem', map.prog, ignore.case=T) + if(map.style) { + v.map.bowtie[i] <- FALSE + next + } +# if(estiMapqStyle(bam.file)){ +# warning(sprintf("Aligner for: %s cannot be determined. Style of +# standard SAM mapping score will be used.", bam.file)) +# v.map.bowtie[i] <- FALSE +# }else{ +# warning(sprintf("Aligner for: %s cannot be determined. Style of +# Bowtie-like SAM mapping score will be used. Would you mind to tell us what +# aligner you are using?", bam.file)) +# v.map.bowtie[i] <- TRUE +# } + } +# else { +# cat("\n") +# if(estiMapqStyle(bam.file)){ +# warning(sprintf("Aligner for: %s cannot be determined. Style of +# standard SAM mapping score will be used.", bam.file)) +# v.map.bowtie[i] <- FALSE +# }else{ +# warning(sprintf("Aligner for: %s cannot be determined. Style of +# Bowtie-like SAM mapping score will be used.", bam.file)) +# v.map.bowtie[i] <- TRUE +# } +# } + warning(sprintf("Aligner for: %s cannot be determined. Style of +standard SAM mapping score will be used. Would you mind submitting an issue +report to us on Github? This will benefit people using the same aligner.", + bam.file)) + v.map.bowtie[i] <- FALSE + } + names(v.map.bowtie) <- bam.list + + v.map.bowtie +} + +libSizeBam <- function(bam.list) { +# Obtain library sizes by counting qualified bam records. +# Args: +# ctg.tbl: coverage-genelist-title table. + + # Count only reads that are mapped, primary, passed quality control and + # un-duplicated. + sbp <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=F, isSecondaryAlignment=F, + isNotPassingQualityControls=F, isDuplicate=F)) + v.lib.size <- vector('integer', length=length(bam.list)) + for(i in 1:length(bam.list)) { + bfn <- bam.list[i] # bam file name. + cfn <- paste(basename(bfn), '.cnt', sep='') # count file name. + if(file.exists(cfn)) { + v.lib.size[i] <- as.integer(readLines(cfn, n=1)) + } else { + cnt.bam <- countBam(bfn, param=sbp) + v.lib.size[i] <- cnt.bam$records + writeLines(as.character(v.lib.size[i]), cfn) + } + names(v.lib.size)[i] <- bfn + } + + v.lib.size +} + +seqnamesBam <- function(bam.list) { +# Obtain chromosome names for each bam file. This list must be used to filter +# genomic regions before scanBam or it terminates immaturely. +# ctg.tbl: coverage-genelist-title table. + + # browser() + sn.list <- lapply(scanBamHeader(bam.list), function(h) { + names(h$targets) + }) + names(sn.list) <- bam.list + + sn.list +} + +chrTag <- function(sn.inbam) { +# Check whether the chromosome name format in the bam file contains 'chr' or not. +# Args: +# sn.inbam: seqnames in the bam file. + + n.chr <- length(grep('^chr', sn.inbam)) + if(n.chr == 0) { + chr.tag <- F + } else if(n.chr == length(sn.inbam)) { + chr.tag <- T + } else { + return("Inconsistent chromosome names in bam file. Check bam header.") + } + + chr.tag +} + +chunkIndex <- function(tot.gene, gcs) { +# Create chunk indices according to total number of genes and chunk size. 
+# Args: +# tot.gene: total number of genes. +# gcs: gene chunk size. + + nchk <- ceiling(tot.gene / gcs) # number of chunks. + chkidx.list <- vector('list', length=nchk) # chunk indices list. + chk.start <- 1 # chunk start. + i.chk <- idiv(tot.gene, chunkSize=gcs) + for(i in 1:nchk) { + chk.size <- nextElem(i.chk) + chkidx.list[[i]] <- c(chk.start, chk.start + chk.size - 1) + chk.start <- chk.start + chk.size + } + + chkidx.list +} + +covMatrix <- function(debug, chkidx.list, coord, rnaseq.gb, exonmodel, libsize, + spit.dot=T, ...) { +# Function to generate a coverage matrix for all genes. +# Args: +# debug: boolean tag for debugging. +# chkidx.list: list of (start, end) indices for each chunk. +# coord: dataframe of gene coordinates. +# rnaseq.gb: boolean for RNA-seq genebody plot. +# exonmodel: exon model data object. +# libsize: total read count for this bam file. +# spit.dot: boolean to control sptting '.' to consoles. +# Return: normalized coverage matrix for all genes, each row represents a gene. + + + if(!debug) { + # Extract coverage and combine into a matrix. + result.matrix <- foreach(chk=chkidx.list, .combine='rbind', + .multicombine=T) %dopar% { + if(spit.dot) { + cat(".") + } + i <- chk[1]:chk[2] # chunk: start -> end + # If RNA-seq, retrieve exon ranges. + if(rnaseq.gb) { + exonranges.list <- unlist(exonmodel[coord[i, ]$tid]) + } else { + exonranges.list <- NULL + } + doCov(coord[i, ], exonranges.list, ...) + } + + # Floor negative values which are caused by spline. + result.matrix[result.matrix < 0] <- 0 + result.matrix / libsize * 1e6 # normalize to RPM. + + } else { + for(c in 1:length(chkidx.list)) { + chk <- chkidx.list[[c]] + i <- chk[1]:chk[2] # chunk: start -> end + cat(".") + # If RNA-seq, retrieve exon ranges. + if(rnaseq.gb) { + exonranges.list <- unlist(exonmodel[coord[i, ]$tid]) + } else { + exonranges.list <- NULL + } + cov <- doCov(coord[i, ], exonranges.list, ...) + if(c == 1) { + result.matrix <- matrix(0, nrow=nrow(coord), ncol=ncol(cov)) + } + result.matrix[i, ] <- cov + } + # Floor negative values which are caused by spline. + # browser() + result.matrix[result.matrix < 0] <- 0 + result.matrix / libsize * 1e6 # normalize to RPM. + + } +} + + +doCov <- function(coord.mat, exonranges.list, chr.tag, pint, reg2plot, + flanksize, flankfactor, m.pts, f.pts, ...) { +# Extract coverage from bam file into a matrix. According to the parameter +# values, call corresponding functions. +# Args: +# coord.mat: matrix of genomic coordinates to extract coverage. +# exonranges.list: list of IRanges objects, each represents a group of exons. +# pint: boolean of point interval. +# reg2plot: string of region to plot. +# flanksize: flanking region size. +# flankfactor: flanking region factor. +# m.pts: data points for middle interval. +# f.pts: data points for flanking region. +# Return: matrix of interpolated coverage: each row represents a gene; each +# column represents a data point. Coverage from exons are concatenated. + + v.chrom <- coord.mat$chrom + # if(!chr.tag) { + # v.chrom <- sub('chr', '', v.chrom) + # } + v.chrom <- as.factor(v.chrom) + v.strand <- as.factor(coord.mat$strand) + + # Figure out interval region sizes and calculate flanking region sizes. + if(!pint) { # interval regions. 
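+        # Flanking size is either proportional to each region's length when
+        # flankfactor > 0 (e.g. a factor of 0.5 on a 2 kb gene gives 1 kb
+        # flanks) or the fixed flanksize otherwise.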
+ if(!is.null(exonranges.list)) { # RNA-seq + if(flankfactor > 0) { + v.fls <- sapply(exonranges.list, function(t) { + sum(end(t) - start(t) + 1) * flankfactor + }) + } else { + v.fls <- rep(flanksize, length=nrow(coord.mat)) + } + extrCovExons(v.chrom, exonranges.list, v.fls, v.strand, + m.pts, f.pts, ...) + } else { # ChIP-seq with intervals. + v.start <- coord.mat$start + v.end <- coord.mat$end + if(flankfactor > 0) { + v.fls <- round((v.end - v.start + 1) * flankfactor) + } else { + v.fls <- rep(flanksize, length=nrow(coord.mat)) + } + extrCov3Sec(v.chrom, v.start, v.end, v.fls, v.strand, + m.pts, f.pts, ...) + } + } else { # point center with flanking regions. + v.midp <- vector('integer', length=nrow(coord.mat)) + for(r in 1:nrow(coord.mat)) { + if(reg2plot == 'tss' && v.strand[r] == '+' || + reg2plot == 'tes' && v.strand[r] == '-') { + # the left site is center. + v.midp[r] <- coord.mat$start[r] + } else { # this also includes BED supplied point center. + v.midp[r] <- coord.mat$end[r] + } + } + # browser() + extrCovMidp(v.chrom, v.midp, flanksize, v.strand, m.pts + f.pts*2, ...) + } +} diff -r 000000000000 -r 3ca58369469c lib/genedb.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/genedb.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,258 @@ +chromFormat <- function(crn, ...){ +# Format chromosome names to our standard. +# Args: +# crn: character vector of chromosome names. +# Returns: character vector of formatted chromosome names. + + crn <- sub('.fa$', '', crn) + nochr.i <- grep('^chr', crn, invert=T) + crn[nochr.i] <- paste('chr', crn[nochr.i], sep='') + crn +} + +FIScoreIntersect <- function(db.info, v.finfo){ +# Further info score based on intersection with database FI columns. +# Used to select database files. +# Args: +# db.info: one record of annotation database table. +# v.finfo: further info that is split into vector. +# Returns: a score for the intersection between database and further info. + + length(intersect(as.vector(db.info[c("FI.1", "FI.2", "FI.3")]), v.finfo)) +} + +SetupPlotCoord <- function(args.tbl, ctg.tbl, default.tbl, dbfile.tbl, + progpath, genome, reg2plot, lgint, flanksize, + samprate, galaxy) { +# Load genomic coordinates for plot based on the input arguments. +# Args: +# args.tbl: input argument table +# ctg.tbl: coverage-genelist-title table +# default.tbl: default settings of databases and plot setting +# dbfile.tbl: details of database files(.RData). +# progpath: program path derived from NGSPLOT +# genome: genome name, such as mm9, hg19. +# reg2plot: tss, tes, genebody, etc. +# lgint: boolean tag for large interval. +# flanksize: flanking region size. +# samprate: sampling rate +# galaxy: boolean tag for galaxy installation. + + if(reg2plot!='bed'){ + # Subset using genome-region combination. + key <- default.tbl$Genome == genome & default.tbl$Region == reg2plot + if(sum(key) == 0) { + stop("The combination of genome and region does not exist. You + may need to install the genome or the region does not exist yet.\n") + } + anno.parameters <- default.tbl[key, ] + db.match.mask <- dbfile.tbl$Genome == genome & + dbfile.tbl$Region == reg2plot + anno.db.candidates <- dbfile.tbl[db.match.mask, ] + + # Database flavor. 
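+        # -D selects the annotation flavor among the candidate tables for
+        # this genome/region (e.g. ensembl vs. refseq); without -D the
+        # DefaultDB column of default.tbl is used.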
+ if('-D' %in% names(args.tbl)){ + database <- as.character(args.tbl['-D']) + database.allowed <- unique(anno.db.candidates$DB) + stopifnot(database %in% database.allowed) + }else{ + database <- as.character(anno.parameters$DefaultDB) + } + + anno.db.candidates <- anno.db.candidates[anno.db.candidates$DB == database, ] + + if (file.exists(file.path(progpath, 'database', genome, reg2plot))){ + prefix <- file.path(progpath, 'database', genome, reg2plot) + }else{ + prefix <- file.path(progpath, 'database', genome) + } + + # Further info to subset genomic regions. + if('-F' %in% names(args.tbl)){ + finfo <- as.character(args.tbl['-F']) + } else { + finfo <- NULL + } + + Labs <- unlist(strsplit(as.character(anno.parameters$PointLab[1]), ",")) + if(length(Labs)==1){ + pint <- TRUE + }else{ + pint <- FALSE + } + + if(!is.null(finfo)){ # use finfo to subset DB tables. + v.finfo <- unlist(strsplit(finfo, ',')) + # Get the intersect of the v.finfo and FI columns of databases, + # and choose the best fit. + fi.score <- apply(anno.db.candidates, 1, FIScoreIntersect, + v.finfo=v.finfo) + anno.db.candidates <- anno.db.candidates[fi.score == max(fi.score), ] + # RNA-seq tag. + if(reg2plot == 'genebody' && 'rnaseq' %in% v.finfo) { + rnaseq.gb <- TRUE + }else{ + rnaseq.gb <- FALSE + } + } else { + rnaseq.gb <- FALSE + } + anno.db.candidates <- anno.db.candidates[order(anno.db.candidates$dbScore), ] + f.load <- file.path(prefix, anno.db.candidates$"db.file"[1]) + + if (!file.exists(f.load)){ + stop("The requested database file does not exist. You may have a +corrupted database. Consider reinstalling the genome.\n") + # cat("\nDownloading database:\n") + # download.file(anno.db.candidates$"URL"[1], destfile=f.load, + # method="curl") + # stopifnot(file.exists(f.load)) + } + # Load genomic coordinates. + cat("\nUsing database:\n") + cat(paste(f.load, "\n", sep="")) + load(f.load) # load coordinates into variable: genome.coord + + } else if(reg2plot == 'bed') { + rnaseq.gb <- FALSE + } + + # Identify unique regions. + reg.list <- as.factor(ctg.tbl$glist) + uniq.reg <- levels(reg.list) + + # Determine plot coordinates for each unique region. + coord.list <- vector('list', length(uniq.reg)) + names(coord.list) <- uniq.reg + + ReadBedCoord <- function(bed.file) { + # Read a bed into memory as a dataframe. + # Args: + # bed.file: path of the bed file to be read. + # Returns: genome coordinates as a dataframe. + + if(!galaxy && + length(grep("\\.bed([0-9]+)?$", bed.file, ignore.case=T)) == 0) { + warning(sprintf("File name: '%s' does not seem to a correct name +for bed file.\n", bed.file)) + } + bed.coord <- read.table(bed.file, sep="\t", quote="\"") + if(ncol(bed.coord) <3){ + stop("A bed file must contain at least 3 columns! The format is: + chrom, start, end, gname, tid, strand. Columns 4-6 are optional\n") + } + genome.coord <- data.frame(chrom=chromFormat(bed.coord[, 1]), + start=bed.coord[, 2] + 1, + end=bed.coord[, 3], + gid=NA, gname='N', tid='N', strand='+', + byname.uniq=T, bygid.uniq=NA) + # Perform sanity check for bed file. + if(!all(genome.coord$start <= genome.coord$end)) { + stop(sprintf("Sanity check for bed file: %s failed. Bed files are + 0-based and right-side open.\n", bed.file)) + } + + # Deal with columns 4-6 (Optional). 
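+        # e.g. a 6-column record "chr1  999  2000  geneA  tx1  -" fills
+        # gname="geneA", tid="tx1" and strand="-" below; the start was
+        # already shifted to 1-based (1000) when genome.coord was built
+        # above. (geneA/tx1 are made-up identifiers for illustration.)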
+ if(ncol(bed.coord) >=4){ + genome.coord$gname <- bed.coord[, 4] + } + if(ncol(bed.coord) >=5){ + genome.coord$tid <- bed.coord[, 5] + } + if(ncol(bed.coord) >=6){ + genome.coord$strand <- bed.coord[, 6] + } + + genome.coord + } + + # Sample a vector but preserve its original sequential order. + sampleInSequence <- function(x, samprate) { + x[ifelse(runif(length(x)) <= samprate, T, F)] + } + + ValidateGeneSubset <- function(gid.match, tid.match, gname.match) { + # To validate that there is no ambiguous hits in gene match. + # Args: + # XXX.match: positional hits with respect to the gene database. 0 means + # no hit. + + subset.matrix <- rbind(gid.match, tid.match, gname.match) + g.valid <- sapply(subset.matrix, function(col.hits) { + length(unique(col.hits[col.hits > 0])) <= 1 + }) + + all(g.valid) + } + + for(i in 1:length(uniq.reg)) { + ur <- uniq.reg[i] + + if(reg2plot == "bed") { + coord.list[[i]] <- ReadBedCoord(ur) + } else if(ur == '-1') { # use whole genome. + coord.list[[i]] <- genome.coord[genome.coord$byname.uniq, ] + } else { # read gene list from text file. + gene.list <- read.table(ur, as.is=T, comment.char='#')$V1 + gid.match <- match(gene.list, genome.coord$gid, nomatch=0) + gname.match <- match(gene.list, genome.coord$gname, nomatch=0) + tid.match <- match(gene.list, genome.coord$tid, nomatch=0) + if(!ValidateGeneSubset(gid.match, tid.match, gname.match)) { + stop("\nAmbiguous hits in gene database. This shall never + happen! Contact ngs.plot maintainers.\n") + } + subset.idx <- pmax(gid.match, tid.match, gname.match) + # subset.idx <- gid.match + tid.match + gname.match + subset.idx <- subset.idx[subset.idx != 0] + if(length(subset.idx) == 0) { + stop("\nGene subset size becomes zero. Are you using the + correct database?\n") + } + coord.list[[i]] <- genome.coord[subset.idx, ] + } + + # Do sampling. + if(samprate < 1) { + samp.idx <- sampleInSequence(1:nrow(coord.list[[i]]), samprate) + coord.list[[i]] <- coord.list[[i]][samp.idx, ] + } + } + + # If bed file, set boolean tag for point interval, i.e. interval=1bp. + if(reg2plot == "bed") { + pint <- all(sapply(coord.list, function(cd) all(cd$start == cd$end))) + if(pint) { + Labs <- c("Center") + } else { + Labs <- c("5'End", "3'End") + } + } + + # Set boolean tag for large interval display. + if(!pint && is.na(lgint)) { + if(median(sapply(coord.list, function(cd) { + median(cd$end - cd$start + 1) + })) > flanksize) { + lgint <- 1 + } else { + lgint <- 0 + } + } + + res <- list(coord.list=coord.list, rnaseq.gb=rnaseq.gb, lgint=lgint, + reg.list=reg.list, uniq.reg=uniq.reg, pint=pint, Labs=Labs) + + # Add exon models to the result if RNA-seq. + if(rnaseq.gb) { + em.load <- file.path(prefix, paste(genome, database, 'exonmodel.RData', + sep='.')) + load(em.load) # load exon models into variable: exonmodel + res$exonmodel <- exonmodel + } + + res +} + + + + diff -r 000000000000 -r 3ca58369469c lib/parse.args.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/parse.args.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,521 @@ +# Parse command line arguments and extract them into an associate array. +# Check if the required arguments are all satisfied. +parseArgs <- function(args, manditories) { + if(length(args) %% 2 == 1 || length(args) == 0) { + cat('Unpaired argument and value.\n') + return(NULL) + } + n.i <- seq(1, length(args), by=2) + v.i <- seq(2, length(args), by=2) + args.name <- args[n.i] + args.value <- args[v.i] + + # Check if required argument values are supplied. 
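+    # e.g. parseArgs(c("-C", "config.txt", "-L", "2000"), c("-C")) returns
+    # the named vector c("-C"="config.txt", "-L"="2000"); a missing
+    # mandatory flag prints a message and yields NULL ("config.txt" is an
+    # illustrative file name).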
+ miss_tag <- F + man.bool <- manditories %in% args.name + if(!all(man.bool)){ + cat(paste('Missing argument: ', paste(manditories[!man.bool], + collapse=','), '.', sep='') + ) + miss_tag <- T + } + if(miss_tag){ + res <- NULL + }else{ + res <- args.value + names(res) <- args.name + } + res +} + +ConfigTbl <- function(args.tbl, fraglen) { +# Create configuration table from "-C" argument. +# Args: +# args.tbl: named vector of program arguments. +# fraglen: fragment length. +# Returns: dataframe of configuration. + + covfile <- args.tbl['-C'] + + suppressWarnings( + ctg.tbl <- tryCatch( + read.table(covfile, colClasses='character', comment.char='#'), + error=function(e) { + if('-E' %in% names(args.tbl)) { + glist <- args.tbl['-E'] + } else { + glist <- '-1' + } + if('-T' %in% names(args.tbl)) { + title <- args.tbl['-T'] + } else { + title <- 'Noname' + } + data.frame(cov=covfile, glist=glist, title=title, + fraglen=as.character(fraglen), + color=NA, stringsAsFactors=F) + } + ) + ) + + # Read a config file. + if(ncol(ctg.tbl) < 3) { + stop("Configuration file must contain at least 3 columns! +Insufficient information provided.\n") + } + colnames(ctg.tbl)[1:3] <- c('cov', 'glist', 'title') + if(ncol(ctg.tbl) >= 4) { + colnames(ctg.tbl)[4] <- 'fraglen' + fraglen.sp <- strsplit(ctg.tbl$fraglen, ":") + if(!all(sapply(fraglen.sp, function(x) { + length(x) == 1 || length(x) == 2}))) { + stop("Fragment length format must be X or X:Y; X and Y are +integers.\n") + } + if(!all(as.integer(unlist(fraglen.sp)) > 0)) { + stop("Fragment length must be positive integers! Check your +configuration file.\n") + } + } else { + ctg.tbl <- data.frame(ctg.tbl, fraglen=as.character(fraglen), + stringsAsFactors=F) + } + if(ncol(ctg.tbl) >= 5) { + colnames(ctg.tbl)[5] <- 'color' + # Validate color specifications. + col.validated <- col2rgb(ctg.tbl$color) + } else { + ctg.tbl <- data.frame(ctg.tbl, color=NA) + } + ctg.tbl +} + +CheckRegionAllowed <- function(reg2plot, anno.tbl) { +# Check if region to plot is an allowed value. + + region.allowed <- c(as.vector(unique(anno.tbl$Region)), "bed") + if(!reg2plot %in% region.allowed) { + stop(paste(c("Unknown region specified. Must be one of:", + region.allowed, "\n"), collapse=" ")) + } +} + +CoverageVars <- function(args.tbl, reg2plot) { +# Setup variables from program arguments. +# Args: +# args.tbl: named vector of program arguments. +# reg2plot: string describing region to plot. +# Returns: list of variables. + + vl <- list() # vl: configured variable list + + #### Switch for debug #### + if('-Debug' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-Debug']) >= 0) + vl$debug <- as.integer(args.tbl['-Debug']) + } else { + vl$debug <- as.integer(0) + } + + #### Switch for Galaxy usage #### + if('-Galaxy' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-Galaxy']) >= 0) + vl$galaxy <- as.integer(args.tbl['-Galaxy']) + vl$avgname <- args.tbl['-O2'] + vl$heatmapname <- args.tbl['-O3'] + } else { + vl$galaxy <- as.integer(0) + } + + ######## Coverage-generation parameters ######## + #### Flanking region size. #### + if('-L' %in% names(args.tbl)){ + vl$flanksize <- as.integer(args.tbl['-L']) + stopifnot(vl$flanksize >= 0) + } else { + flank.tbl <- setNames( + c(2000, 2000, 2000, 500, 500, 1500, 1000, 1000), + c('tss','tes','genebody','exon','cgi', 'enhancer', 'dhs','bed')) + vl$flanksize <- as.integer(flank.tbl[reg2plot]) + } + + #### Flanking size factor. 
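+    # For illustration, a configuration file passed via -C (parsed by
+    # ConfigTbl above) is tab-separated with one row per curve/heatmap;
+    # columns 4-5 are optional:
+    #   chip.bam:input.bam   genes.txt   H3K4me3    150:100   blue
+    #   chip2.bam            -1          H3K27me3   150       red
+    # Column 1 is a bam file (or a bam-pair joined by ':'), column 2 a gene
+    # list ('-1' = whole genome), column 3 the curve title, column 4 the
+    # fragment length (X or X:Y for a bam-pair) and column 5 an R color name.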
#### + if('-N' %in% names(args.tbl) && !('-L' %in% names(args.tbl))) { + stopifnot(as.numeric(args.tbl['-N']) >= 0) + vl$flankfactor <- as.numeric(args.tbl['-N']) + } else { + vl$flankfactor <- 0.0 + } + + #### Robust statistics. #### + if('-RB' %in% names(args.tbl)){ + stopifnot(as.numeric(args.tbl['-RB']) >= 0) + vl$robust <- as.numeric(args.tbl['-RB']) + }else{ + vl$robust <- .0 # percentage to be trimmed on both ends. + } + + #### Random sampling rate. #### + if('-S' %in% names(args.tbl)){ + vl$samprate <- as.numeric(args.tbl['-S']) + stopifnot(vl$samprate > 0 && vl$samprate <= 1) + }else{ + vl$samprate <- 1.0 + } + + ##### Set cores number. #### + if('-P' %in% names(args.tbl)){ + stopifnot(as.integer(args.tbl['-P']) >= 0) + vl$cores.number <- as.integer(args.tbl['-P']) + }else{ + vl$cores.number <- as.integer(0) + } + + #### Algorithm for coverage vector normalization #### + if('-AL' %in% names(args.tbl)) { + vl$cov.algo <- args.tbl['-AL'] + al.allowed <- c('spline', 'bin') + stopifnot(vl$cov.algo %in% al.allowed) + } else { + vl$cov.algo <- 'spline' + } + + #### Gene chunk size #### + if('-CS' %in% names(args.tbl)) { + vl$gcs <- as.integer(args.tbl['-CS']) + stopifnot(vl$gcs > 0) + } else { + vl$gcs <- 100 + } + + #### Mapping quality cutoff #### + if('-MQ' %in% names(args.tbl)) { + vl$map.qual <- as.integer(args.tbl['-MQ']) + stopifnot(vl$map.qual >= 0) + } else { + vl$map.qual <- 20 + } + + #### Fragment length #### + if('-FL' %in% names(args.tbl)) { + vl$fraglen <- as.integer(args.tbl['-FL']) + stopifnot(vl$fraglen > 0) + } else { + vl$fraglen <- 150 + } + vl$bufsize <- vl$fraglen # buffer added to both ends of the coverage vec. + + #### Strand-specific coverage #### + if('-SS' %in% names(args.tbl)) { + spec.allowed <- c('both', 'same', 'opposite') + stopifnot(args.tbl['-SS'] %in% spec.allowed) + vl$strand.spec <- args.tbl['-SS'] + } else { + vl$strand.spec <- 'both' + } + + #### Shall interval size be larger than flanking size? #### + if('-IN' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-IN']) >= 0) + vl$inttag <- as.integer(args.tbl['-IN']) + } else { + vl$inttag <- NA + } + + #### Image output forbidden tag. #### + if('-FI' %in% names(args.tbl)){ + stopifnot(as.integer(args.tbl['-FI']) >= 0) + vl$fi_tag <- as.integer(args.tbl['-FI']) + }else{ + vl$fi_tag <- as.integer(0) + } + + vl +} + +PlotVars <- function(args.tbl, existing.vl=vector('character'), + prof.misc=list(), low.count=NULL, go.paras=list()) { +# Setup replot variables. +# Args: +# args.tbl: argument table. +# existing.vl: existing variable name character list. +# prof.misc: misc. avg prof variable list. +# go.paras: gene ordering parameters. +# Returns: list of updated variables. + + ## Plotting-related parameters: + updated.vl <- list() + ### Misc. parameters: + #### Font size. #### + if('-FS' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-FS']) > 0) + updated.vl$font.size <- as.integer(args.tbl['-FS']) + } else if(!'font.size' %in% existing.vl) { + updated.vl$font.size <- 12 + } + + ### Avg. profiles parameters: + #### Plot width. #### + if('-WD' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-WD']) > 0) + updated.vl$plot.width <- as.integer(args.tbl['-WD']) + } else if(!'plot.width' %in% existing.vl) { + updated.vl$plot.width <- 8 + } + + #### Plot height. 
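+    # Note, for illustration: PlotVars() only assigns a default when the
+    # option is neither on the command line nor already in existing.vl, so
+    # a replot keeps previously saved settings unless overridden, e.g.
+    #   PlotVars(c('-WD'='10'), existing.vl=c('plot.height'))
+    # sets plot.width to 10 and leaves plot.height untouched.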
#### + if('-HG' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-HG']) > 0) + updated.vl$plot.height <- as.integer(args.tbl['-HG']) + } else if(!'plot.height' %in% existing.vl) { + updated.vl$plot.height <- 7 + } + + #### Shall standard errors be plotted around average profiles? #### + if('-SE' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-SE']) >= 0) + updated.vl$se <- as.integer(args.tbl['-SE']) + } else if(!'se' %in% existing.vl) { + updated.vl$se <- 1 + } + + #### Smooth function radius. #### + if('-MW' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-MW']) >= 1) + updated.vl$mw <- as.integer(args.tbl['-MW']) + } else if(!'mw' %in% existing.vl) { + updated.vl$mw <- 1 + } + + #### Shaded area alpha. #### + if('-H' %in% names(args.tbl)) { + stopifnot(as.numeric(args.tbl['-H']) >= 0 && + as.numeric(args.tbl['-H']) < 1) + updated.vl$shade.alp <- as.numeric(args.tbl['-H']) + } else if(!'shade.alp' %in% existing.vl) { + updated.vl$shade.alp <- 0 + } + + #### Misc. options for avg. profiles. #### + updated.vl$prof.misc <- prof.misc + if('-YAS' %in% names(args.tbl)) { + ystr <- args.tbl['-YAS'] + if(ystr != 'auto') { + yp <- unlist(strsplit(ystr, ',')) + if(length(yp) == 2) { + y.min <- as.numeric(yp[1]) + y.max <- as.numeric(yp[2]) + stopifnot(y.min < y.max) + } else { + stop("-YAS must be 'auto' or a pair of numerics separated +by ','\n") + } + updated.vl$prof.misc$yscale <- c(y.min, y.max) + } else { + updated.vl$prof.misc$yscale <- 'auto' + } + } else if(!'yscale' %in% names(prof.misc)) { + updated.vl$prof.misc$yscale <- 'auto' + } + if('-LEG' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-LEG']) >= 0) + updated.vl$prof.misc$legend <- as.integer(args.tbl['-LEG']) + } else if(!'legend' %in% names(prof.misc)) { + updated.vl$prof.misc$legend <- T + } + if('-BOX' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-BOX']) >= 0) + updated.vl$prof.misc$box <- as.integer(args.tbl['-BOX']) + } else if(!'box' %in% names(prof.misc)) { + updated.vl$prof.misc$box <- T + } + if('-VLN' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-VLN']) >= 0) + updated.vl$prof.misc$vline <- as.integer(args.tbl['-VLN']) + } else if(!'vline' %in% names(prof.misc)) { + updated.vl$prof.misc$vline <- T + } + if('-XYL' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-XYL']) >= 0) + updated.vl$prof.misc$xylab <- as.integer(args.tbl['-XYL']) + } else if(!'xylab' %in% names(prof.misc)) { + updated.vl$prof.misc$xylab <- T + } + if('-LWD' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-LWD']) > 0) + updated.vl$prof.misc$line.wd <- as.integer(args.tbl['-LWD']) + } else if(!'line.wd' %in% names(prof.misc)) { + updated.vl$prof.misc$line.wd <- 3 + } + + ### Heatmap parameters: + #### Gene order algorithm #### + if('-GO' %in% names(args.tbl)){ + go.allowed <- c('total', 'max', 'prod', 'diff', 'hc', 'none', 'km') + stopifnot(args.tbl['-GO'] %in% go.allowed) + updated.vl$go.algo <- args.tbl['-GO'] + } else if(!'go.algo' %in% existing.vl){ + updated.vl$go.algo <- 'total' # hierarchical clustering. + } + + #### Reduce ratio to control a heatmap height #### + if('-RR' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-RR']) > 0) + updated.vl$rr <- as.integer(args.tbl['-RR']) + } else if(!'rr' %in% existing.vl) { + updated.vl$rr <- 30 + } + + #### Color scale string. 
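+    # For illustration, -SC accepts 'local' (default: scale each heatmap on
+    # its own), 'region' (one scale per genomic region), 'global' (one scale
+    # for all heatmaps), or an explicit pair such as '-SC -2,2', which is
+    # split on ',' into numeric limits below.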
#### + if('-SC' %in% names(args.tbl)) { + if(!args.tbl['-SC'] %in% c('local', 'region', 'global')) { + scale.pair <- unlist(strsplit(args.tbl['-SC'], ",")) + if(length(scale.pair) != 2 || is.na(as.numeric(scale.pair[1])) || + is.na(as.numeric(scale.pair[2]))) { + stop("Color scale format error: must be local, region, global +or a pair of numerics separated by ','\n") + } + } + updated.vl$color.scale <- args.tbl['-SC'] + } else if(!'color.scale' %in% existing.vl) { + updated.vl$color.scale <- 'local' + } + + #### Flooding fraction. #### + if('-FC' %in% names(args.tbl)) { + stopifnot(as.numeric(args.tbl['-FC']) >= 0 && + as.numeric(args.tbl['-FC']) < 1) + updated.vl$flood.frac <- as.numeric(args.tbl['-FC']) + } else if(!'flood.frac' %in% existing.vl) { + updated.vl$flood.frac <- .02 + } + + #### Heatmap color. #### + if('-CO' %in% names(args.tbl)) { + updated.vl$hm.color <- as.character(args.tbl['-CO']) + } else if(!'hm.color' %in% existing.vl){ + updated.vl$hm.color <- "default" + } + + #### Color distribution. #### + if('-CD' %in% names(args.tbl)) { + stopifnot(as.numeric(args.tbl['-CD']) > 0) + updated.vl$color.distr <- as.numeric(args.tbl['-CD']) + } else if(!'color.distr' %in% existing.vl) { + updated.vl$color.distr <- .6 + } + + #### Low count cutoff for rank-based normalization #### + if('-LOW' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-LOW']) >= 0) + updated.vl$low.count <- as.integer(args.tbl['-LOW']) + } else if(!'low.count' %in% existing.vl) { + updated.vl$low.count <- 10 + } else { # ensure low.count is not empty. + updated.vl$low.count <- low.count + } + if(!is.null(low.count)) { + updated.vl$low.count.ratio <- updated.vl$low.count / low.count + } + + #### Misc. options for heatmap. #### + updated.vl$go.paras <- go.paras + if('-KNC' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-KNC']) > 0) + updated.vl$go.paras$knc <- as.integer(args.tbl['-KNC']) + } else if(!'knc' %in% names(go.paras)) { + updated.vl$go.paras$knc <- 5 + } + if('-MIT' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-MIT']) > 0) + updated.vl$go.paras$max.iter <- as.integer(args.tbl['-MIT']) + } else if(!'max.iter' %in% names(go.paras)) { + updated.vl$go.paras$max.iter <- 20 + } + if('-NRS' %in% names(args.tbl)) { + stopifnot(as.integer(args.tbl['-NRS']) > 0) + updated.vl$go.paras$nrs <- as.integer(args.tbl['-NRS']) + } else if(!'nrs' %in% names(go.paras)) { + updated.vl$go.paras$nrs <- 30 + } + + + updated.vl +} + +CheckHMColorConfig <- function(hm.color, bam.pair) { + if(hm.color != 'default') { + v.colors <- unlist(strsplit(hm.color, ":")) + if(bam.pair && length(v.colors) != 2 && length(v.colors) != 3 || + !bam.pair && length(v.colors) != 1) { + stop("Heatmap color specifications must correspond to bam-pair!\n") + } + } +} + +EchoPlotArgs <- function() { + cat("## Plotting-related parameters:\n") + cat("### Misc. parameters:\n") + cat(" -FS Font size(default=12)\n") + cat("### Avg. profiles parameters:\n") + cat(" -WD Image width(default=8)\n") + cat(" -HG Image height(default=7)\n") + cat(" -SE Shall standard errors be plotted?(0 or 1)\n") + cat(" -MW Moving window width to smooth avg. profiles, must be integer\n") + cat(" 1=no(default); 3=slightly; 5=somewhat; 9=quite; 13=super.\n") + cat(" -H Opacity of shaded area, suggested value:[0, 0.5]\n") + cat(" default=0, i.e. no shading, just lines\n") + cat(" -YAS Y-axis scale: auto(default) or min_val,max_val(custom scale)\n") + cat(" -LEG Draw legend? 1(default) or 0\n") + cat(" -BOX Draw box around plot? 
1(default) or 0\n") + cat(" -VLN Draw vertical lines? 1(default) or 0\n") + cat(" -XYL Draw X- and Y-axis labels? 1(default) or 0\n") + cat(" -LWD Line width(default=3)\n") + cat("### Heatmap parameters:\n") + cat(" -GO Gene order algorithm used in heatmaps: total(default), hc, max,\n") + cat(" prod, diff, km and none(according to gene list supplied)\n") + cat(" -LOW Low count cutoff(default=10) in rank-based normalization\n") + cat(" -KNC K-means or HC number of clusters(default=5)\n") + cat(" -MIT Maximum number of iterations(default=20) for K-means\n") + cat(" -NRS Number of random starts(default=30) in K-means\n") + cat(" -RR Reduce ratio(default=30). The parameter controls the heatmap height\n") + cat(" The smaller the value, the taller the heatmap\n") + cat(" -SC Color scale used to map values to colors in a heatmap\n") + cat(" local(default): base on each individual heatmap\n") + cat(" region: base on all heatmaps belong to the same region\n") + cat(" global: base on all heatmaps together\n") + cat(" min_val,max_val: custom scale using a pair of numerics\n") + cat(" -FC Flooding fraction:[0, 1), default=0.02\n") + cat(" -CO Color for heatmap. For bam-pair, use color-tri(neg_color:[neu_color]:pos_color)\n") + cat(" Hint: must use R colors, such as darkgreen, yellow and blue2\n") + cat(" The neutral color is optional(default=black)\n") + cat(" -CD Color distribution for heatmap(default=0.6). Must be a positive number\n") + cat(" Hint: lower values give more widely spaced colors at the negative end\n") + cat(" In other words, they shift the neutral color to positive values\n") + cat(" If set to 1, the neutral color represents 0(i.e. no bias)\n") + +} + +EchoCoverageArgs <- function() { + cat("## Coverage-generation parameters:\n") + cat(" -F Further information provided to select database table or plottype:\n") + cat(" This is a string of description separated by comma.\n") + cat(" E.g. protein_coding,K562,rnaseq(order of descriptors does not matter)\n") + cat(" means coding genes in K562 cell line drawn in rnaseq mode.\n") + cat(" -D Gene database: ensembl(default), refseq\n") + cat(" -L Flanking region size(will override flanking factor)\n") + cat(" -N Flanking region factor\n") + cat(" -RB The fraction of extreme values to be trimmed on both ends\n") + cat(" default=0, 0.05 means 5% of extreme values will be trimmed\n") + cat(" -S Randomly sample the regions for plot, must be:(0, 1]\n") + cat(" -P #CPUs to use. Set 0(default) for auto detection\n") + cat(" -AL Algorithm used to normalize coverage vectors: spline(default), bin\n") + cat(" -CS Chunk size for loading genes in batch(default=100)\n") + cat(" -MQ Mapping quality cutoff to filter reads(default=20)\n") + cat(" -FL Fragment length used to calculate physical coverage(default=150)\n") + cat(" -SS Strand-specific coverage calculation: both(default), same, opposite\n") + cat(" -IN Shall interval be larger than flanking in plot?(0 or 1, default=automatic)\n") + cat(" -FI Forbid image output if set to 1(default=0)\n") +} +############### End arguments configuration ##################### +################################################################# diff -r 000000000000 -r 3ca58369469c lib/plotlib.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/plotlib.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,649 @@ +#### plotlib.r #### +# This contains the library for plotting related functions. 
+# +# Authors: Li Shen, Ningyi Shao +# +# Created: Feb 19, 2013 +# Last updated: May 21, 2013 +# + +SetupHeatmapDevice <- function(reg.list, uniq.reg, ng.list, pts, font.size=12, + unit.width=4, reduce.ratio=30) { +# Configure parameters for heatmap output device. The output is used by +# external procedures to setup pdf device ready for heatmap plotting. +# Args: +# reg.list: region list as in config file. +# uniq.reg: unique region list. +# ng.list: number of genes per heatmap in the order as config file. +# pts: data points (number of columns of heatmaps). +# font.size: font size. +# unit.width: image width per heatmap. +# reduce.ratio: how compressed are genes in comparison to data points? This +# controls image height. + + # Number of plots per region. + reg.np <- sapply(uniq.reg, function(r) sum(reg.list==r)) + + # Number of genes per region. + reg.ng <- sapply(uniq.reg, function(r) { + ri <- which(reg.list==r)[1] + ng.list[ri] + }) + + # Adjustment ratio. + origin.fs <- 12 # default font size. + fs.adj.ratio <- font.size / origin.fs + # Margin size (in lines) adjusted by ratio. + m.bot <- fs.adj.ratio * 2 + m.lef <- fs.adj.ratio * 1.5 + m.top <- fs.adj.ratio * 2 + m.rig <- fs.adj.ratio * 1.5 + key.in <- fs.adj.ratio * 1.0 # colorkey in inches. + m.lef.diff <- (fs.adj.ratio - 1) * 1.5 + m.rig.diff <- (fs.adj.ratio - 1) * 1.5 + # Setup image size. + hm.width <- (unit.width + m.lef.diff + m.rig.diff) * max(reg.np) + ipl <- .2 # inches per line. Obtained from par->'mai', 'mar'. + # Convert #gene to image height. + reg.hei <- sapply(reg.ng, function(r) { + c(key.in, # colorkey + margin. + r * unit.width / pts / reduce.ratio + + m.bot * ipl + m.top * ipl) # heatmap + margin. + }) + reg.hei <- c(reg.hei) + hm.height <- sum(reg.hei) + + # Setup layout of the heatmaps. + lay.mat <- matrix(0, ncol=max(reg.np), nrow=length(reg.np) * 2) + fig.n <- 1 # figure plotting number. + for(i in 1:length(reg.np)) { + row.upper <- i * 2 - 1 + row.lower <- i * 2 + for(j in 1:reg.np[i]) { + lay.mat[row.upper, j] <- fig.n; + fig.n <- fig.n + 1 + lay.mat[row.lower, j] <- fig.n; + fig.n <- fig.n + 1 + } + } + + list(reg.hei=reg.hei, hm.width=hm.width, hm.height=hm.height, + lay.mat=lay.mat, heatmap.mar=c(m.bot, m.lef, m.top, m.rig) * ipl) +} + +SetPtsSpline <- function(pint, lgint) { +# Set data points for spline function. +# Args: +# pint: tag for point interval. +# Return: list of data points, middle data points, flanking data points. + + pts <- 100 # data points to plot: 0...pts + if(pint){ # point interval. + m.pts <- 1 # middle part points. + f.pts <- pts / 2 # flanking part points. + } else { + if(lgint) { + m.pts <- pts / 5 * 3 + 1 + f.pts <- pts / 5 + 1 + } else { + m.pts <- pts / 5 + 1 + f.pts <- pts / 5 * 2 + 1 + } + } + list(pts=pts, m.pts=m.pts, f.pts=f.pts) +} + +CreatePlotMat <- function(pts, ctg.tbl) { +# Create matrix for avg. profiles. +# Args: +# pts: data points. +# ctg.tbl: configuration table. +# Return: avg. profile matrix initialized to zero. + + regcovMat <- matrix(0, nrow=pts + 1, ncol=nrow(ctg.tbl)) + colnames(regcovMat) <- ctg.tbl$title + regcovMat +} + +CreateConfiMat <- function(se, pts, ctg.tbl){ +# Create matrix for standard errors. +# Args: +# se: tag for standard error plotting. +# pts: data points. +# ctg.tbl: configuration table. +# Return: standard error matrix initialized to zero or null. 
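+# For illustration, SetPtsSpline() above distributes the pts = 100 spline
+# data points as follows: a point feature (pint) uses m.pts = 1 and
+# f.pts = 50 per flank; a large interval (lgint) uses m.pts = 100/5*3 + 1 = 61
+# and f.pts = 100/5 + 1 = 21; a small interval uses m.pts = 21 and
+# f.pts = 100/5*2 + 1 = 41. E.g.:
+#   SetPtsSpline(pint=FALSE, lgint=TRUE)   # list(pts=100, m.pts=61, f.pts=21)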
+ + if(se){ + confiMat <- matrix(0, nrow=pts + 1, ncol=nrow(ctg.tbl)) + colnames(confiMat) <- ctg.tbl$title + } else { + confiMat <- NULL + } + confiMat +} + +col2alpha <- function(col2use, alpha){ +# Convert a vector of solid colors to semi-transparent colors. +# Args: +# col2use: vector of colors. +# alpha: represents degree of opacity - [0,1] +# Return: vector of transformed colors. + + apply(col2rgb(col2use), 2, function(x){ + rgb(x[1], x[2], x[3], alpha=alpha*255, maxColorValue=255) + }) +} + +smoothvec <- function(v, radius, method=c('mean', 'median')){ +# Given a vector of coverage, return smoothed version of coverage. +# Args: +# v: vector of coverage +# radius: fraction of org. vector size. +# method: smooth method +# Return: vector of smoothed coverage. + + stopifnot(is.vector(v)) + stopifnot(length(v) > 0) + stopifnot(radius > 0 && radius < 1) + + halfwin <- ceiling(length(v) * radius) + s <- rep(NA, length(v)) + + for(i in 1:length(v)){ + winpos <- (i - halfwin) : (i + halfwin) + winpos <- winpos[winpos > 0 & winpos <= length(v)] + if(method == 'mean'){ + s[i] <- mean(v[winpos]) + }else if(method == 'median'){ + s[i] <- median(v[winpos]) + } + } + s +} + +smoothplot <- function(m, radius, method=c('mean', 'median')){ +# Smooth the entire avg. profile matrix using smoothvec. +# Args: +# m: avg. profile matrix +# radius: fraction of org. vector size. +# method: smooth method. +# Return: smoothed matrix. + + stopifnot(is.matrix(m) || is.vector(m)) + + if(is.matrix(m)) { + for(i in 1:ncol(m)) { + m[, i] <- smoothvec(m[, i], radius, method) + } + } else { + m <- smoothvec(m, radius, method) + } + m +} + +genXticks <- function(reg2plot, pint, lgint, pts, flanksize, flankfactor, + Labs) { +# Generate X-ticks for plotting. +# Args: +# reg2plot: string representation of region. +# pint: point interval. +# lgint: tag for large interval. +# pts: data points. +# flanksize: flanking region size in bps. +# flankfactor: flanking region factor. +# Labs: character vector of labels of the genomic region. +# Return: list of x-tick position and label. + + if(pint){ # point interval. + mid.lab <- Labs[1] + tick.pos <- c(0, pts / 4, pts / 2, pts / 4 * 3, pts) + tick.lab <- as.character(c(-flanksize, -flanksize/2, mid.lab, + flanksize/2, flanksize)) + }else{ + left.lab <- Labs[1] + right.lab <- Labs[2] + tick.pos <- c(0, pts / 5, pts / 5 * 2, pts / 5 * 3, pts / 5 * 4, pts) + if(lgint){ # large interval: fla int int int fla + if(flankfactor > 0){ # show percentage at x-tick. + tick.lab <- c(sprintf("%d%%", -flankfactor*100), + left.lab, '33%', '66%', right.lab, + sprintf("%d%%", flankfactor*100)) + } else{ # show bps at x-tick. + tick.lab <- c(as.character(-flanksize), + left.lab, '33%', '66%', right.lab, + as.character(flanksize)) + } + } else { # small interval: fla fla int fla fla. + if(flankfactor > 0){ + tick.lab <- c(sprintf("%d%%", -flankfactor*100), + sprintf("%d%%", -flankfactor*50), + left.lab, right.lab, + sprintf("%d%%", flankfactor*50), + sprintf("%d%%", flankfactor*100)) + } else { + tick.lab <- c(as.character(-flanksize), + as.character(-flanksize/2), + left.lab, right.lab, + as.character(flanksize/2), + as.character(flanksize)) + } + } + } + list(pos=tick.pos, lab=tick.lab) +} + +plotmat <- function(regcovMat, title2plot, plot.colors, bam.pair, xticks, + pts, m.pts, f.pts, pint, shade.alp=0, confiMat=NULL, mw=1, + misc.options=list(yscale='auto', legend=T, box=T, vline=T, + xylab=T, line.wd=3)) { +# Plot avg. profiles and standard errors around them. 
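+# For illustration, smoothvec() above takes the window half-width as a
+# fraction of the vector length: for a 101-point profile and radius = 0.05,
+# halfwin = ceiling(101 * 0.05) = 6, so each point becomes the mean (or
+# median) of up to 13 neighbours, e.g.
+#   smoothvec(sin(seq(0, pi, length.out=101)), 0.05, 'mean')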
+# Args: +# regcovMat: matrix for avg. profiles. +# title2plot: profile names, will be shown in figure legend. +# plot.colors: vector of color specifications for all curves. +# bam.pair: boolean for bam-pair data. +# xticks: X-axis ticks. +# pts: data points +# m.pts: middle part data points +# f.pts: flanking part data points +# pint: tag for point interval +# shade.alp: shading area alpha +# confiMat: matrix for standard errors. +# mw: moving window size for smoothing function. +# misc.options: list of misc. options - y-axis scale, legend, box around plot, +# verticle lines, X- and Y-axis labels, line width. + + # Smooth avg. profiles if specified. + if(mw > 1){ + regcovMat <- as.matrix(runmean(regcovMat, k=mw, alg='C', + endrule='mean')) + } + + # Choose colors. + if(any(is.na(plot.colors))) { + ncurve <- ncol(regcovMat) + if(ncurve <= 8) { + suppressMessages(require(RColorBrewer, warn.conflicts=F)) + col2use <- brewer.pal(ifelse(ncurve >= 3, ncurve, 3), 'Dark2') + col2use <- col2use[1:ncurve] + } else { + col2use <- rainbow(ncurve) + } + } else { + col2use <- plot.colors + } + col2use <- col2alpha(col2use, 0.8) + + # Plot profiles. + ytext <- ifelse(bam.pair, "log2(Fold change vs. control)", + "Read count Per Million mapped reads") + xrange <- 0:pts + y.lim <- NULL + if(length(misc.options$yscale) == 2) { + y.lim <- misc.options$yscale + } + matplot(xrange, regcovMat, + xaxt='n', type="l", col=col2use, ylim=y.lim, + lty="solid", lwd=misc.options$line.wd, frame.plot=F, ann=F) + if(misc.options$xylab) { + title(xlab="Genomic Region (5' -> 3')", ylab=ytext) + } + axis(1, at=xticks$pos, labels=xticks$lab, lwd=3, lwd.ticks=3) + if(misc.options$box) { + # box around plot. + box() + } + + # Add shade area. + if(shade.alp > 0){ + for(i in 1:ncol(regcovMat)){ + v.x <- c(xrange[1], xrange, xrange[length(xrange)]) + v.y <- regcovMat[, i] + v.y <- c(0, v.y, 0) + col.rgb <- col2rgb(col2use[i]) + p.col <- rgb(col.rgb[1, 1], col.rgb[2, 1], col.rgb[3, 1], + alpha=shade.alp * 255, maxColorValue=255) + polygon(v.x, v.y, density=-1, border=NA, col=p.col) + } + } + + # Add standard errors. + if(!is.null(confiMat)){ + v.x <- c(xrange, rev(xrange)) + for(i in 1:ncol(confiMat)){ + v.y <- c(regcovMat[, i] + confiMat[, i], + rev(regcovMat[, i] - confiMat[, i])) + col.rgb <- col2rgb(col2use[i]) + p.col <- rgb(col.rgb[1, 1], col.rgb[2, 1], col.rgb[3, 1], + alpha=0.2 * 255, maxColorValue=255) + polygon(v.x, v.y, density=-1, border=NA, col=p.col) + } + } + + if(misc.options$vline) { + # Add gray lines indicating feature boundaries. + if(pint) { + abline(v=f.pts, col="gray", lwd=2) + } else { + abline(v=f.pts - 1, col="gray", lwd=2) + abline(v=f.pts + m.pts - 2, col="gray", lwd=2) + } + } + + if(misc.options$legend) { + # Legend. + legend("topright", title2plot, text.col=col2use) + } +} + +spline_mat <- function(mat, n=100){ +# Calculate splined coverage for a matrix. +# Args: +# mat: each column represents a profile to be interpolated. +# n: number of data points to be interpolated. + + foreach(r=iter(mat, by='row'), + .combine='rbind', .multicombine=T) %dopar% { + spline(1:length(r), r, n)$y + } +} + +RankNormalizeMatrix <- function(mat, low.cutoff) { +# Rank-based normalization for a data matrix. +# Args: +# mat: data matrix. +# low.cutoff: low value cutoff. +# Return: rank normalized matrix. 
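+# Example, for illustration: values below low.cutoff are forced to rank 0 so
+# that background noise does not affect gene ordering, e.g.
+#   RankNormalizeMatrix(matrix(c(0.1, 2, 5, 3), nrow=2), low.cutoff=0.5)
+# ranks c(0.1, 2, 5, 3) as c(1, 2, 4, 3), zeroes the first value (below the
+# cutoff) and returns matrix(c(0, 2, 4, 3), nrow=2).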
+ + stopifnot(is.matrix(mat)) + + concat.dat <- c(mat) + low.mask <- concat.dat < low.cutoff + concat.r <- rank(concat.dat) + concat.r[low.mask] <- 0 + + matrix(concat.r, nrow=nrow(mat)) +} + +OrderGenesHeatmap <- function(enrichList, lowCutoffs, + method=c('total', 'max', 'prod', 'diff', 'hc', + 'none', 'km'), + go.paras=list(knc=5, max.iter=20, nrs=30)) { +# Order genes with a list of heatmap data. +# Args: +# enrichList: heatmap data in a list. +# lowCutoffs: low count cutoff for normalized count data. +# method: algorithm used to order genes. +# go.paras: gene ordering parameters. +# Returns: a vector of REVERSED gene orders. +# NOTE: due to the design of image function, the order needs to be reversed +# so that the genes will be shown correctly. + + rankList <- mapply(RankNormalizeMatrix, + mat=enrichList, low.cutoff=lowCutoffs, SIMPLIFY=F) + np <- length(enrichList) + + if(method == 'hc') { + rankCombined <- do.call('cbind', rankList) + # Clustering and order genes. + hc <- hclust(dist(rankCombined, method='euclidean'), + method='complete') + memb <- cutree(hc, k = go.paras$knc) + list(rev(hc$order), memb) # reversed! + } else if(method == 'km') { + rankCombined <- do.call('cbind', rankList) + km <- kmeans(rankCombined, centers=go.paras$knc, + iter.max=go.paras$max.iter, nstart=go.paras$nrs) + list(rev(order(km$cluster)), km$cluster) # reversed! + } else if(method == 'total' || method == 'diff' && np == 1) { + list(order(rowSums(rankList[[1]])), NULL) + } else if(method == 'max') { # peak enrichment value of the 1st profile. + list(order(apply(rankList[[1]], 1, max)), NULL) + } else if(method == 'prod') { # product of all profiles. + rs.mat <- sapply(rankList, rowSums) + g.prod <- apply(rs.mat, 1, prod) + list(order(g.prod), NULL) + } else if(method == 'diff' && np > 1) { # difference between 2 profiles. + list(order(rowSums(rankList[[1]]) - rowSums(rankList[[2]])), NULL) + } else if(method == 'none') { # according to the order of input gene list. + # Because the image function draws from bottom to top, the rows are + # reversed to give a more natural look. + list(rev(1:nrow(enrichList[[1]])),NULL) + } else { + # pass. + } +} + + +plotheat <- function(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, + go.paras, title2plot, bam.pair, xticks, flood.q=.02, + do.plot=T, hm.color="default", color.distr=.6, + color.scale='local') { +# Plot heatmaps with genes ordered according to some algorithm. +# Args: +# reg.list: factor vector of regions as in configuration. +# uniq.reg: character vector of unique regions. +# enrichList: list of heatmap data. +# v.low.cutoff: low count cutoff for normalized count data. +# go.algo: gene order algorithm. +# go.paras: gene ordering parameters. +# title2plot: title for each heatmap. Same as the legends in avgprof. +# bam.pair: boolean tag for bam-pair. +# xticks: info for X-axis ticks. +# flood.q: flooding percentage. +# do.plot: boolean tag for plotting heatmaps. +# hm.color: string for heatmap colors. +# color.distr: positive number for color distribution. +# color.scale: string for the method to adjust color scale. +# Returns: ordered gene names for each unique region as a list. + + # Setup basic parameters. 
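+    # For illustration, OrderGenesHeatmap() above returns list(order, cluster):
+    # with method='total' genes are ordered by the summed rank-normalized
+    # signal of the first profile and cluster is NULL; 'hc' and 'km' also
+    # return a per-gene cluster assignment, and their orders (like 'none')
+    # come back reversed because image() draws rows bottom-up.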
+ ncolor <- 256 + if(bam.pair) { + if(hm.color != "default") { + tri.colors <- unlist(strsplit(hm.color, ':')) + neg.color <- tri.colors[1] + if(length(tri.colors) == 2) { + neu.color <- 'black' + pos.color <- tri.colors[2] + } else { + neu.color <- tri.colors[2] + pos.color <- tri.colors[3] + } + enrich.palette <- colorRampPalette(c(neg.color, neu.color, + pos.color), + bias=color.distr, + interpolate='spline') + } else { + enrich.palette <- colorRampPalette(c('blue', 'black', 'yellow'), + bias=color.distr, + interpolate='spline') + } + } else { + if(hm.color != "default") { + enrich.palette <- colorRampPalette(c('snow', hm.color)) + } else { + enrich.palette <- colorRampPalette(c('snow', 'red2')) + } + } + + hm_cols <- ncol(enrichList[[1]]) + + # Adjust X-axis tick position. In a heatmap, X-axis is [0, 1]. + # Assume xticks$pos is from 0 to N(>0). + xticks$pos <- xticks$pos / tail(xticks$pos, n=1) # scale to the same size. + + # Define a function to calculate color breaks. + ColorBreaks <- function(max.e, min.e, bam.pair, ncolor) { + # Args: + # max.e: maximum enrichment value to be mapped to color. + # min.e: minimum enrichment value to be mapped to color. + # bam.pair: boolean tag for bam-pair. + # ncolor: number of colors to use. + # Returns: vector of color breaks. + + # If bam-pair is used, create breaks for positives and negatives + # separately. If log2 ratios are all positive or negative, use only + # half of the color space. + if(bam.pair) { + max.e <- ifelse(max.e > 0, max.e, 1) + min.e <- ifelse(min.e < 0, min.e, -1) + c(seq(min.e, 0, length.out=ncolor / 2 + 1), + seq(0, max.e, length.out=ncolor / 2 + 1)[-1]) + } else { + seq(min.e, max.e, length.out=ncolor + 1) + } + } + + if(grepl(",", color.scale)) { + scale.pair <- unlist(strsplit(color.scale, ",")) + scale.min <- as.numeric(scale.pair[1]) + scale.max <- as.numeric(scale.pair[2]) + if(scale.min >= scale.max) { + warning("Color scale min value is >= max value.\n") + } + flood.pts <- c(scale.min, scale.max) + brk.use <- ColorBreaks(scale.max, scale.min, bam.pair, ncolor) + } + + # If color scale is global, calculate breaks and quantile here. + if(color.scale == 'global') { + flood.pts <- quantile(c(enrichList, recursive=T), c(flood.q, 1-flood.q)) + brk.use <- ColorBreaks(flood.pts[2], flood.pts[1], bam.pair, ncolor) + } + + # Go through each unique region. + # Do NOT use "dopar" in the "foreach" loops here because this will disturb + # the image order. + go.list <- vector('list', length(uniq.reg)) + go.cluster <- vector('list', length(uniq.reg)) + + names(go.list) <- uniq.reg + names(go.cluster) <- uniq.reg + + for(ur in uniq.reg) { + # ur <- uniq.reg[i] + plist <- which(reg.list==ur) # get indices in the config file. + + # Combine all profiles into one. + # enrichCombined <- do.call('cbind', enrichList[plist]) + enrichSelected <- enrichList[plist] + + # If color scale is region, calculate breaks and quantile here. + if(color.scale == 'region') { + flood.pts <- quantile(c(enrichSelected, recursive=T), + c(flood.q, 1-flood.q)) + brk.use <- ColorBreaks(flood.pts[2], flood.pts[1], bam.pair, ncolor) + } + + # Order genes. 
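+        # For illustration, ColorBreaks() above anchors zero at the middle of
+        # the palette for bam-pairs, e.g.
+        #   ColorBreaks(max.e=3, min.e=-2, bam.pair=TRUE, ncolor=256)
+        # returns 257 breaks: 129 spanning [-2, 0] and another 128 up to 3;
+        # for a single bam it is simply seq(min.e, max.e, length.out=257).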
+ if(is.matrix(enrichSelected[[1]]) && nrow(enrichSelected[[1]]) > 1) { + if(bam.pair) { + lowCutoffs <- 0 + } else { + lowCutoffs <- v.low.cutoff[plist] + } + g.order.list <- OrderGenesHeatmap(enrichSelected, lowCutoffs, + go.algo, go.paras) + g.order <- g.order.list[[1]] + g.cluster <- g.order.list[[2]] + if(is.null(g.cluster)) { + go.cluster[[ur]] <- NA + } else{ + go.cluster[[ur]] <- rev(g.cluster[g.order]) + } + go.list[[ur]] <- rev(rownames(enrichSelected[[1]][g.order, ])) + } else { + go.cluster[[ur]] <- NULL + go.list[[ur]] <- NULL + } + + if(!do.plot) { + next + } + + # Go through each sample and do plot. + for(pj in plist) { + if(!is.null(g.order)) { + enrichList[[pj]] <- enrichList[[pj]][g.order, ] + } + + # If color scale is local, calculate breaks and quantiles here. + if(color.scale == 'local') { + flood.pts <- quantile(c(enrichList[[pj]], recursive=T), + c(flood.q, 1-flood.q)) + brk.use <- ColorBreaks(flood.pts[2], flood.pts[1], bam.pair, + ncolor) + } + + # Flooding extreme values. + enrichList[[pj]][ enrichList[[pj]] < flood.pts[1] ] <- flood.pts[1] + enrichList[[pj]][ enrichList[[pj]] > flood.pts[2] ] <- flood.pts[2] + + # Draw colorkey. + image(z=matrix(brk.use, ncol=1), col=enrich.palette(ncolor), + breaks=brk.use, axes=F, useRaster=T, main='Colorkey') + axis(1, at=seq(0, 1, length.out=5), + labels=format(brk.use[seq(1, ncolor + 1, length.out=5)], + digits=1), + lwd=1, lwd.ticks=1) + + # Draw heatmap. + image(z=t(enrichList[[pj]]), col=enrich.palette(ncolor), + breaks=brk.use, axes=F, useRaster=T, main=title2plot[pj]) + + axis(1, at=xticks$pos, labels=xticks$lab, lwd=1, lwd.ticks=1) + } + } + list(go.list,go.cluster) +} + +trim <- function(x, p){ +# Trim a numeric vector on both ends. +# Args: +# x: numeric vector. +# p: percentage of data to trim. +# Return: trimmed vector. + + low <- quantile(x, p) + hig <- quantile(x, 1 - p) + x[x > low & x < hig] +} + +CalcSem <- function(x, rb=.05){ +# Calculate standard error of mean for a numeric vector. +# Args: +# x: numeric vector +# rb: fraction of data to trim before calculating sem. +# Return: a scalar of the standard error of mean + + if(rb > 0){ + x <- trim(x, rb) + } + sem <- sd(x) / sqrt(length(x)) + ifelse(is.na(sem), 0, sem) + # NOTE: this should be improved to handle exception that "sd" calculation + # emits errors. +} + + + + +## Leave for future reference. +# +# Set the antialiasing. +# type <- NULL +# if (capabilities()["aqua"]) { +# type <- "quartz" +# } else if (capabilities()["cairo"]) { +# type <- "cairo" +# } else if (capabilities()["X11"]) { +# type <- "Xlib" +# } +# Set the output type based on capabilities. +# if (is.null(type)){ +# png(plot.name, width, height, pointsize=pointsize) + +# } else { +# png(plot.name, width, height, pointsize=pointsize, type=type) +# } diff -r 000000000000 -r 3ca58369469c ngs.plot.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngs.plot.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,341 @@ +#!/usr/bin/env Rscript +# +# Program: ngs.plot.r +# Purpose: Plot sequencing coverages at different genomic regions. +# Allow overlaying various coverages with gene lists. +# +# -- by Li Shen, MSSM +# Created: Nov 2011. +# -- by Christophe Antoniewski +# recoded for Galaxy: Dec 2017 + +ngsplot.version <- '2.63_artbio' +# Program environment variable. +progpath <- Sys.getenv('NGSPLOT') +if(progpath == "") { + stop("Set environment variable NGSPLOT before run the program. 
See README + for details.\n") +} + +source(file.path(progpath, 'lib', 'parse.args.r')) +source(file.path(progpath, 'lib', 'genedb.r')) +source(file.path(progpath, 'lib', 'plotlib.r')) +source(file.path(progpath, 'lib', 'coverage.r')) + +# Deal with command line arguments. +cmd.help <- function(){ + cat("\nVisit https://github.com/shenlab-sinai/ngsplot/wiki/ProgramArguments101 for details\n") + cat(paste("Version:", ngsplot.version, sep=" ")) + cat("\nUsage: ngs.plot.r -G genome -R region -C [cov|config]file\n") + cat(" -O name [Options]\n") + cat("\n## Mandatory parameters:\n") + cat(" -G Genome name. Use ngsplotdb.py list to show available genomes.\n") + cat(" -R Genomic regions to plot: tss, tes, genebody, exon, cgi, enhancer, dhs or bed\n") + cat(" -C Indexed bam file or a configuration file for multiplot\n") + cat(" -O Name for output: multiple files will be generated\n") + cat("## Optional parameters related to configuration file:\n") + cat(" -E Gene list to subset regions OR bed file for custom region\n") + cat(" -T Image title\n") + EchoCoverageArgs() + EchoPlotArgs() + cat("\n") +} + + +########################################################################### +#################### Deal with program input arguments #################### +args <- commandArgs(T) +# args <- unlist(strsplit('-G mm10 -R tss -C K27M_no_stim_3_GATCAG_L006_R1_001Aligned.out.srt.rmdup.bam -O test -Debug 1', ' ')) + +# Input argument parser. +args.tbl <- parseArgs(args, c('-G', '-C', '-R', '-O')) # see lib/parse.args.r +if(is.null(args.tbl)){ + cmd.help() + stop('Error in parsing command line arguments. Stop.\n') +} +genome <- args.tbl['-G'] +reg2plot <- args.tbl['-R'] +oname <- args.tbl['-O'] +warning(sprintf("%s", args.tbl)) + +cat("Configuring variables...") +# Load tables of database: default.tbl, dbfile.tbl +default.tbl <- read.delim(file.path(progpath, 'database', 'default.tbl')) +dbfile.tbl <- read.delim(file.path(progpath, 'database', 'dbfile.tbl')) +CheckRegionAllowed(reg2plot, default.tbl) + +# Setup variables from arguments. +cov.args <- CoverageVars(args.tbl, reg2plot) +attach(cov.args) # +plot.args <- PlotVars(args.tbl) +attach(plot.args) + +# Configuration: coverage-genelist-title relationships. +ctg.tbl <- ConfigTbl(args.tbl, fraglen) + +# Setup plot-related coordinates and variables. +plotvar.list <- SetupPlotCoord(args.tbl, ctg.tbl, default.tbl, dbfile.tbl, + progpath, genome, reg2plot, inttag, flanksize, + samprate, galaxy) +attach(plotvar.list) + +# Setup data points for plot. +pts.list <- SetPtsSpline(pint, lgint) +pts <- pts.list$pts # data points for avg. profile and standard errors. +m.pts <- pts.list$m.pts # middle data points. For pint, m.pts=1. +f.pts <- pts.list$f.pts # flanking region data points. + +# Setup matrix for avg. profiles. +regcovMat <- CreatePlotMat(pts, ctg.tbl) +# Setup matrix for standard errors. +confiMat <- CreateConfiMat(se, pts, ctg.tbl) + +# Genomic enrichment for all profiles in the config. Use this for heatmaps. +enrichList <- vector('list', nrow(ctg.tbl)) +cat("Done\n") + +# Load required libraries. 
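+# Typical invocations, for illustration (file names are placeholders):
+#   Rscript ngs.plot.r -G hg19 -R tss -C chip.bam -O k4me3.tss
+#   Rscript ngs.plot.r -G hg19 -R genebody -C config.txt -O k4me3.gb -D refseq -F rnaseq
+# The second form overlays the samples listed in a configuration file and
+# plots gene bodies with exon models (rnaseq mode) from the refseq tables.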
+cat("Loading R libraries") +if(!suppressMessages(require(ShortRead, warn.conflicts=F))) { + source("http://bioconductor.org/biocLite.R") + biocLite(ShortRead) + if(!suppressMessages(require(ShortRead, warn.conflicts=F))) { + stop('Loading package ShortRead failed!') + } +} +cat('.') +if(!suppressMessages(require(BSgenome, warn.conflicts=F))) { + source("http://bioconductor.org/biocLite.R") + biocLite(BSgenome) + if(!suppressMessages(require(BSgenome, warn.conflicts=F))) { + stop('Loading package BSgenome failed!') + } +} +cat('.') +if(!suppressMessages(require(doMC, warn.conflicts=F))) { + install.packages('doMC') + if(!suppressMessages(require(doMC, warn.conflicts=F))) { + stop('Loading package doMC failed!') + } +} +# Register doMC with CPU number. +if(cores.number == 0){ + registerDoMC() +} else { + registerDoMC(cores.number) +} +cat('.') +if(!suppressMessages(require(caTools, warn.conflicts=F))) { + install.packages('caTools') + if(!suppressMessages(require(caTools, warn.conflicts=F))) { + stop('Loading package caTools failed!') + } +} +cat('.') +if(!suppressMessages(require(utils, warn.conflicts=F))) { + install.packages('utils') + if(!suppressMessages(require(utils, warn.conflicts=F))) { + stop('Loading package utils failed!') + } +} +cat('.') +cat("Done\n") + +####################################################################### +# Here start to extract coverages for all genomic regions and calculate +# data for plotting. + +cat("Analyze bam files and calculate coverage") +# Extract bam file names from configuration and determine if bam-pair is used. +bfl.res <- bamFileList(ctg.tbl) +bam.pair <- bfl.res$bbp # boolean for bam-pair. +bam.list <- bfl.res$bam.list # bam file list. +CheckHMColorConfig(hm.color, bam.pair) + +# Determine if bowtie is used to generate the bam files. +# Index bam files if not done yet. +v.map.bowtie <- headerIndexBam(bam.list) + +# Retrieve chromosome names for each bam file. +sn.list <- seqnamesBam(bam.list) + +# Calculate library size from bam files for normalization. +v.lib.size <- libSizeBam(bam.list) + +v.low.cutoff <- vector("integer", nrow(ctg.tbl)) # low count cutoffs. +# Process the config file row by row. +# browser() +for(r in 1:nrow(ctg.tbl)) { # r: index of plots/profiles. + + reg <- ctg.tbl$glist[r] # retrieve gene list names. + # Create coordinate chunk indices. + chkidx.list <- chunkIndex(nrow(coord.list[[reg]]), gcs) + + # Do coverage for each bam file. + bam.files <- unlist(strsplit(ctg.tbl$cov[r], ':')) + + # Obtain fraglen for each bam file. + fraglens <- as.integer(unlist(strsplit(ctg.tbl$fraglen[r], ':'))) + + # Obtain bam file basic info. + libsize <- v.lib.size[bam.files[1]] + v.low.cutoff[r] <- low.count / libsize * 1e6 + result.pseudo.rpm <- 1e6 / libsize + sn.inbam <- sn.list[[bam.files[1]]] + # chr.tag <- chrTag(sn.inbam) + chr.tag <- NA # do NOT modify the chromosome names. + is.bowtie <- v.map.bowtie[bam.files[1]] + cat("\nreport reg2plot\n") + cat(reg2plot) + cat("\n") + result.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb, + exonmodel, libsize, TRUE, chr.tag, pint, + reg2plot, flanksize, flankfactor, m.pts, f.pts, + bufsize, cov.algo, bam.files[1], sn.inbam, + fraglens[1], map.qual, is.bowtie, + strand.spec=strand.spec) + # Rprof(NULL) + if(bam.pair) { # calculate background. 
+ fraglen2 <- ifelse(length(fraglens) > 1, fraglens[2], fraglens[1]) + libsize <- v.lib.size[bam.files[2]] + bkg.pseudo.rpm <- 1e6 / libsize + sn.inbam <- sn.list[[bam.files[2]]] + # chr.tag <- chrTag(sn.inbam) + chr.tag <- NA + is.bowtie <- v.map.bowtie[bam.files[2]] + # if(class(chr.tag) == 'character') { + # stop(sprintf("Read %s error: %s", bam.files[2], chr.tag)) + # } + bkg.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb, + exonmodel, libsize, TRUE, chr.tag, pint, + reg2plot, flanksize, flankfactor, m.pts, f.pts, + bufsize, cov.algo, bam.files[2], sn.inbam, + fraglen2, map.qual, is.bowtie, + strand.spec=strand.spec) + # browser() + result.matrix <- log2((result.matrix + result.pseudo.rpm) / + (bkg.matrix + bkg.pseudo.rpm)) + } + + # Calculate SEM if needed. Shut off SEM in single gene case. + if(nrow(result.matrix) > 1 && se){ + confiMat[, r] <- apply(result.matrix, 2, function(x) CalcSem(x, robust)) + } + + # Book-keep this matrix for heatmap. + enrichList[[r]] <- result.matrix + + # Return avg. profile. + regcovMat[, r] <- apply(result.matrix, 2, function(x) mean(x, trim=robust, + na.rm=T)) +} +# browser() +cat("Done\n") + +######################################## +# Add row names to heatmap data. +for(i in 1:length(enrichList)) { + reg <- ctg.tbl$glist[i] # gene list name. + rownames(enrichList[[i]]) <- paste(coord.list[[reg]]$gname, + coord.list[[reg]]$tid, sep=':') +} +# Some basic parameters. +xticks <- genXticks(reg2plot, pint, lgint, pts, flanksize, flankfactor, Labs) +unit.width <- 4 +ng.list <- sapply(enrichList, nrow) # number of genes per heatmap. + +# Create image file and plot data into it. +if(!fi_tag){ + cat("Plotting figures...") + #### Average profile plot. #### + out.plot <- avgname + pdf(out.plot, width=plot.width, height=plot.height, pointsize=font.size) + plotmat(regcovMat, ctg.tbl$title, ctg.tbl$color, bam.pair, xticks, pts, + m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc) + out.dev <- dev.off() + + #### Heatmap. #### + # Setup output device. + hd <- SetupHeatmapDevice(reg.list, uniq.reg, ng.list, pts, font.size, + unit.width, rr) + reg.hei <- hd$reg.hei # list of image heights for unique regions. + hm.width <- hd$hm.width # image width. + hm.height <- hd$hm.height # image height. + lay.mat <- hd$lay.mat # matrix for heatmap layout. + heatmap.mar <- hd$heatmap.mar # heatmap margins in inches. + + out.hm <- heatmapname + pdf(out.hm, width=hm.width, height=hm.height, pointsize=font.size) + par(mai=heatmap.mar) + layout(lay.mat, heights=reg.hei) + + # Do heatmap plotting. + go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, + go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac, + do.plot=T, hm.color=hm.color, color.distr=color.distr, + color.scale=color.scale) + out.dev <- dev.off() + cat("Done\n") +} else { + go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, + go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac, + do.plot=F, hm.color=hm.color, color.distr=color.distr, + color.scale=color.scale) +} + +# Save plotting data. +oname1="data" +cat("Saving results...") +dir.create(oname1, showWarnings=F) + +# Average profiles. +out.prof <- file.path(oname1, 'avgprof.txt') +write.table(regcovMat, file=out.prof, row.names=F, sep="\t", quote=F) + +# Standard errors of mean. +if(!is.null(confiMat)){ + out.confi <- file.path(oname1, 'sem.txt') + write.table(confiMat, file=out.confi, row.names=F, sep="\t", quote=F) +} + +# Heatmap density values. 
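+# For illustration, each hm<i>.txt written below is tab-separated with one row
+# per gene/transcript: four annotation columns (gid, gname, tid, strand)
+# followed by one column of enrichment per interpolated data point.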
+for(i in 1:length(enrichList)) { + reg <- ctg.tbl$glist[i] # gene list name. + out.heat <- file.path(oname1, paste('hm', i, '.txt', sep='')) + write.table(cbind(coord.list[[reg]][, c('gid', 'gname', 'tid', 'strand')], + enrichList[[i]]), + file=out.heat, row.names=F, sep="\t", quote=F) +} + +# Avg. profile R data. +prof.dat <- file.path(oname1, 'avgprof.RData') +save(plot.width, plot.height, regcovMat, ctg.tbl, bam.pair, xticks, pts, + m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc, se, v.lib.size, + font.size, + file=prof.dat) + +# Heatmap R data. +heat.dat <- file.path(oname1, 'heatmap.RData') +save(reg.list, uniq.reg, ng.list, pts, enrichList, v.low.cutoff, go.algo, + ctg.tbl, bam.pair, xticks, flood.frac, hm.color, unit.width, rr, + go.list, color.scale, v.lib.size, font.size, go.paras, low.count, + color.distr, + file=heat.dat) +cat("Done\n") + +# Wrap results up. +cat("Wrapping results up...") +cur.dir <- getwd() +out.dir <- dirname(oname1) +out.zip <- basename(oname1) +setwd(out.dir) +if(!zip(paste(out.zip, '.zip', sep=''), out.zip, extras='-q')) { + if(unlink(oname, recursive=T)) { + warning(sprintf("Unable to delete intermediate result folder: %s", + oname)) + } +} +setwd(cur.dir) +cat("Done\n") +cat("All done. Cheers!\n") + diff -r 000000000000 -r 3ca58369469c ngsplot.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsplot.xml Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,3185 @@ + + visualize NGS samples at functional genomic regions - beta + + r-catools + r-domc + bioconductor-bsgenome + + bioconductor-shortread + + + + + + +<--> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 3ca58369469c replot.xml.notworking --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/replot.xml.notworking Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,157 @@ + + +#if $outtype.outtype2 == "prof": + runreplot.pl $input_zipfile $outtype.outtype2 $output_filename + $outtype.WD + $outtype.HG + $outtype.SE + $outtype.H + $outtype.MW + $outtype.Y.Y2 + $outtype.Y.Y3 + $outtype.LEG + $outtype.BOX + $outtype.VLN + $outtype.XYL + $outtype.LWD + $outtype.FS +#else if $outtype.outtype2 == "heatmap": + runreplot.pl $input_zipfile $outtype.outtype2 $output_filename + $outtype.GO.GO2 + 
$outtype.GO.KNC + $outtype.GO.MIT + $outtype.GO.NRS + $outtype.LOW + $outtype.RR + $outtype.FC + $outtype.CO + $outtype.CD + $outtype.FS + dummy + dummy + dummy +#end if: + + + + + + +<--> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +<--> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 3ca58369469c runNGSplot.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/runNGSplot.pl Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,129 @@ +#!/usr/bin/perl -w +use strict; +use File::Basename 'dirname'; +use File::Spec; +use Cwd 'abs_path'; + +my @inputs = @ARGV; +my @inputs2 = @inputs; + +my $genome_name = shift(@inputs); +my $genomic_region_source_type__genomic_region = shift(@inputs); +my $genomic_region_source_type__further_information = shift(@inputs); +my $genomic_region_source_type__interval_size = shift(@inputs); +my $genomic_region_source_type__flanking_region_option_source_type__flanking_region_option = shift(@inputs); +my $genomic_region_source_type__flanking_region_option_source_type__flanking_region_size = shift(@inputs); +my $numsamples = shift(@inputs); +my $usepairs = shift(@inputs); + +#print STDERR "inputs @inputs\n"; + +my $randfile = rand(100)."\.config\.txt"; +my $randfile2 = $randfile; +$randfile2 =~ s/config\.txt/logfile/gm; + +my $outfile = File::Spec->catfile(abs_path(dirname(__FILE__)),"$randfile"); +open(FILE,">$outfile"); + +for (my $i=1;$i<=$numsamples;$i++) { + my $bamfile=shift(@inputs); + my $reffile=shift(@inputs); + my $usegenelist=shift(@inputs); + my $genelist=shift(@inputs); + my $title=shift(@inputs); + my $fraglen=shift(@inputs); + my $color=shift(@inputs); + if ($usepairs eq 'yes') { + syswrite(FILE, "$bamfile\:$reffile\t$genelist\t$title\t$fraglen\t$color\n"); + }else { + syswrite(FILE, "$bamfile\t$genelist\t$title\t$fraglen\t$color\n"); + } +} +close(FILE); + +my $gene_database = shift(@inputs); +my $randomly_sample = shift(@inputs); +my $gene_order = shift(@inputs); +my $knc = shift(@inputs); +my $mit = shift(@inputs); +my $nrs = shift(@inputs); +my $chunk_size = shift(@inputs); +my $quality_requirement = shift(@inputs); +my $standard_error = shift(@inputs); +my $radius_size = shift(@inputs); +my $flooding_fraction = shift(@inputs); +my $smooth_method = shift(@inputs); +my $shaded_area = shift(@inputs); +my $out_name = shift(@inputs); +my $out_avg_name = shift(@inputs); +my $out_hm_name = shift(@inputs); + + +my $G = $genome_name; +my $R = $genomic_region_source_type__genomic_region; +my $C = $outfile; +my $O = $out_name; +my $O2 = $out_avg_name; +my $O3 = $out_hm_name; +my $F = $genomic_region_source_type__further_information; +my $D = $gene_database; +my $L = $genomic_region_source_type__flanking_region_option_source_type__flanking_region_size; +my $N = $genomic_region_source_type__flanking_region_option_source_type__flanking_region_size; +my $RB = $radius_size; +my $S = $randomly_sample; +my $CS = $chunk_size; +my $MQ = $quality_requirement; +my $IN = $genomic_region_source_type__interval_size; +my $SE = $standard_error; +my $MW = $smooth_method; +my $H = $shaded_area; +my $GO = $gene_order; +my $KNC = $knc; +my $MIT = $mit; +my $NRS = $nrs; +my $FC = $flooding_fraction; + +if ($GO eq 'km') { + $GO = "$GO -KNC $KNC -MIT $MIT -NRS $NRS"; +} + +my $logfile = File::Spec->catfile(abs_path(dirname(__FILE__)),"$randfile2"); +open(FILE2,">>$logfile"); +my $cmd5="pwd >> $logfile 2>&1"; +system($cmd5); +my 
$NGSPLOT = $ENV{"NGSPLOT"}; +my $cmd=''; +if (($R eq 'tss')||($R eq 'tes')) { + $cmd = "Rscript $NGSPLOT/ngs.plot.r -Galaxy 1 -P 0 -G $G -R $R -C $C -O $O -O2 $O2 -O3 $O3 -D $D -L $L -S $S -GO $GO -CS $CS -MQ $MQ -SE $SE -RB $RB -FC $FC -MW $MW -H $H >> $logfile 2>&1"; +}elsif ($genomic_region_source_type__flanking_region_option_source_type__flanking_region_option eq 'flanking_region_size') { + if ($IN eq 'automatic') { + $cmd = "$NGSPLOT/ngs.plot.r -Galaxy 1 -P 0 -G $G -R $R -C $C -O $O -O2 $O2 -O3 $O3 -D $D -L $L -S $S -GO $GO -CS $CS -MQ $MQ -SE $SE -RB $RB -FC $FC -MW $MW -H $H >> $logfile 2>&1"; + }else { + $cmd = "$NGSPLOT/ngs.plot.r -Galaxy 1 -P 0 -G $G -R $R -C $C -O $O -O2 $O2 -O3 $O3 -D $D -L $L -S $S -GO $GO -CS $CS -MQ $MQ -SE $SE -RB $RB -FC $FC -MW $MW -H $H -IN $IN >> $logfile 2>&1"; + } +}elsif ($genomic_region_source_type__flanking_region_option_source_type__flanking_region_option eq 'flanking_floating_size') { + if ($IN eq 'automatic') { + $cmd = "$NGSPLOT/ngs.plot.r -Galaxy 1 -P 0 -G $G -R $R -C $C -O $O -O2 $O2 -O3 $O3 -D $D -N $N -S $S -GO $GO -CS $CS -MQ $MQ -SE $SE -RB $RB -FC $FC -MW $MW -H $H >> $logfile 2>&1"; + }else { + $cmd = "$NGSPLOT/ngs.plot.r -Galaxy 1 -P 0 -G $G -R $R -C $C -O $O -O2 $O2 -O3 $O3 -D $D -N $N -S $S -GO $GO -CS $CS -MQ $MQ -SE $SE -RB $RB -FC $FC -MW $MW -H $H -IN $IN >> $logfile 2>&1"; + } +} + +print("$cmd\n"); +my $cmd2="cp data.zip $O"; +my $cmd3="rm $outfile"; +my $cmd4="rm $logfile"; +syswrite(FILE2, "\n$cmd\n"); +syswrite(FILE2, "\n$cmd2\n"); +syswrite(FILE2, "\n$cmd3\n"); +syswrite(FILE2, "\n$cmd4\n\n"); + +# print STDERR "cmd: $cmd\n"; +system($cmd); +system($cmd2); +#system($cmd3); +#system($cmd4); + +close(FILE2); + + diff -r 000000000000 -r 3ca58369469c runreplot.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/runreplot.pl Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,63 @@ +#!/usr/bin/perl -w +use strict; +use File::Basename 'dirname'; +use File::Spec; +use Cwd 'abs_path'; + +my $input_file = $ARGV[0]; +my $output_type = $ARGV[1]; +my $output_filename = $ARGV[2]; + +my $cmd1 = "cp $input_file data.zip"; +my $cmd2 = ""; +my $cmd3 = "mv output.pdf $output_filename"; + +my $FS = ""; +if ($output_type eq 'prof') { + my $WD = "-WD $ARGV[3]"; + my $HG = "-HG $ARGV[4]"; + my $SE = "-SE $ARGV[5]"; + my $H = "-H $ARGV[6]"; + my $MW = "-MW $ARGV[7]"; + my $Y = "-Y $ARGV[8]"; + my $YAS = "-YAS $ARGV[9]"; + my $LEG = "-LEG $ARGV[10]"; + my $BOX = "-BOX $ARGV[11]"; + my $VLN = "-VLN $ARGV[12]"; + my $XYL = "-XYL $ARGV[13]"; + my $LWD = "-LWD $ARGV[14]"; + $FS = "-FS $ARGV[15]"; + + if ($Y =~ /no$/gm) { + $cmd2 = "replot.r $output_type -I data.zip -O output $WD $HG $SE $H $MW $LEG $BOX $VLN $XYL $LWD $FS"; + }else { + $cmd2 = "replot.r $output_type -I data.zip -O output $WD $HG $SE $H $MW $LEG $BOX $VLN $XYL $LWD $FS $YAS"; + } +}elsif ($output_type eq 'heatmap') { + my $GO = "-GO $ARGV[3]"; + my $KNC = "-KNC $ARGV[4]"; + my $MIT = "-MIT $ARGV[5]"; + my $NRS = "-NRS $ARGV[6]"; + my $LOW = "-LOW $ARGV[7]"; + my $RR = "-RR $ARGV[8]"; + my $FC = "-FC $ARGV[9]"; + my $CO = "-CO $ARGV[10]"; + my $CD = "-CD $ARGV[11]"; + $FS = "-FS $ARGV[12]"; + my $DUM1 = $ARGV[13]; + my $DUM2 = $ARGV[14]; + my $DUM3 = $ARGV[15]; + + if ($CO =~ /default/gm) { $CO = ""; } + + if ($GO =~ /km$/gm) { + $cmd2 = "replot.r $output_type -I data.zip -O output $LOW $RR $FC $CO $CD $FS $GO $KNC $MIT $NRS"; + }else { + $cmd2 = "replot.r $output_type -I data.zip -O output $LOW $RR $FC $CO $CD $FS $GO"; + } +} +system($cmd1); +system($cmd2); +system($cmd3); + +
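+# Illustrative usage (argument order as parsed above; file names are placeholders):
+#   perl runreplot.pl data.zip prof replot.pdf 8 7 1 0 1 no auto 1 1 1 1 3 12
+#   perl runreplot.pl data.zip heatmap replot.pdf total 5 20 30 10 30 0.02 default 0.6 12 dummy dummy dummy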