comparison bin/ngsplotdb.py @ 0:3ca58369469c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
author artbio
date Wed, 06 Dec 2017 19:01:53 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3ca58369469c
1 #!/usr/bin/env python
2 #
3 # ngsplotdb - PYTHON script to manipulate the ngs.plot database installation.
4 # Author: Li Shen, Mount Sinai School of Medicine
5 # Date created: Dec 28, 2012
6 # Last updated: May 28, 2013
7 #
8 # Functions implemented:
9 #
10 # list - List locally installed genomes.
11 # check - Check the integrity of local database.
12 # install - Install a genome from ngsplotdb.XXX.tar.gz.
13 # remove - Remove a locally installed genome.
14
15
16 # def dir_to_idspec(dirn):
17 # """Convert an FTP directory name to species ID and Name.
18 # This works with both UCSC and Ensembl FTP.
19 # """
20 # parts = dirn.split('_')
21 # sid = parts[0][:3].lower() + parts[1][:3].lower()
22 # spec = str.title(parts[0] + ' ' + parts[1])
23 # return sid, spec
24
25 # def listremote(database="refseq"):
26 # """List available genomes on a remote FTP site."""
27
28 # import urllib2
29
30 # url_ensembl = "ftp://ftp.ensembl.org/pub/current_gtf/"
31 # url_ucsc = "ftp://hgdownload.cse.ucsc.edu/goldenPath/currentGenomes/"
32
33 # if database == "refseq":
34 # ftp = urllib2.urlopen(url_ucsc)
35 # elif database == "ensembl":
36 # ftp = urllib2.urlopen(url_ensembl)
37 # else:
38 # pass
39
40 # # Read directory names from the remote FTP site.
41 # dir_names = ftp.read().splitlines()
42 # # Parse lines to obtain clean directory names only.
43 # if database == "refseq":
44 # dir_names = map(lambda x: x.split()[-3], dir_names)
45 # elif database == "ensembl":
46 # dir_names = map(lambda x: x.split()[-1], dir_names)
47 # else:
48 # pass
49 # # Filter directory names that do not correspond to a species.
50 # # On UCSC FTP, both "." and ".." will be retrieved from above and can
51 # # cause troubles.
52
53 # id_spec_list = map(dir_to_idspec, dir_names)
54 # print "ID\tName"
55 # for r in id_spec_list:
56 # print r[0], "\t", r[1]
57
58 def read_gnlist(root_path, mode):
59 """Read installed genomes list.
60
61 Args:
62 root_path: ngs.plot installation directory.
63 mode: row reading mode - "vector" or "hash", correspond to each record
64 read as a vector of hash table.
65 Returns: (header split vector, hash table of installed genomes,
66 vector of column widths)
67 """
68 from collections import defaultdict
69
70 gn_list = root_path + "/database/gn_list.txt"
71 try:
72 db_f = open(gn_list, "r")
73 except IOError:
74 print "Open {0} error: your ngs.plot database may be corrupted.".\
75 format(gn_list)
76 sys.exit()
77
78 default_list = root_path + "/database/default.tbl"
79 try:
80 default_f = open(default_list, "r")
81 except IOError:
82 print "Open {0} error: your ngs.plot database may be corrupted.".\
83 format(default_list)
84 sys.exit()
85 default_l = []
86 header = default_f.readline() # skip the header
87 for rec in default_f:
88 r_sp = rec.rstrip().split("\t")
89 default_l.append((r_sp[0], r_sp[2])) # tuple of genome and region, like ("dm3", "genebody")
90 genome_region_dict = defaultdict(list)
91 for genome,region in default_l:
92 genome_region_dict[genome].append(region)
93
94 header = db_f.readline()
95 h_sp = header.rstrip().split("\t")
96 h_sp.append("InstalledFeatures")
97 v_cw = map(len, h_sp) # column widths initialize to header widths.
98
99 g_tbl = {}
100 for rec in db_f:
101 r_sp = rec.rstrip().split("\t")
102 r_sp.append(",".join(genome_region_dict[r_sp[0]]))
103 r_cw = map(len, r_sp)
104 v_cw = map(max, v_cw, r_cw)
105
106 r_tbl = dict(zip(h_sp, r_sp))
107 if(mode == "vector"):
108 g_tbl[r_tbl["ID"]] = r_sp
109 elif(mode == "hash"):
110 g_tbl[r_tbl["ID"]] = r_tbl
111 else:
112 pass
113
114 db_f.close()
115
116 return (h_sp, g_tbl, v_cw)
117
118
119 def update_gnlist(root_path, g_tbl, gn, assembly, species, ens_v, np_v):
120 """Update/Add genomes in ngs.plot database.
121
122 Args:
123 root_path: ngs.plot installation directory.
124 g_tbl: genome hash table. Each value is also a hash table of
125 genome info.
126 gn: genome name.
127 assembly: genome assembly name.
128 species: species name.
129 ens_v: new Ensembl version.
130 np_v: new ngs.plot version.
131 Returns: None
132 """
133
134 if gn in g_tbl:
135 g_tbl[gn]["EnsVer"] = ens_v
136 g_tbl[gn]["NPVer"] = np_v
137 else:
138 g_tbl[gn] = {"ID": gn, "Assembly": assembly, "Species": species, \
139 "EnsVer": ens_v, "NPVer": np_v}
140
141 write_gnlist(root_path, g_tbl)
142
143
144 def write_gnlist(root_path, g_tbl):
145 """Write genome hash table to database file: gn_list.txt.
146
147 Args:
148 root_path: ngs.plot installation directory.
149 g_tbl: genome hash table. Each value is also a hash table of
150 genome info.
151 Returns: None
152 """
153
154 output_order = ["ID", "Assembly", "Species", "EnsVer", "NPVer"]
155 gn_list = root_path + "/database/gn_list.txt"
156
157 try:
158 db_f = open(gn_list, "w")
159 except IOError:
160 print "Open {0} error: your ngs.plot database may be corrupted.".\
161 format(gn_list)
162 sys.exit()
163
164 db_f.write("\t".join(output_order) + "\n")
165
166 for k in g_tbl.viewkeys():
167 rec = []
168 for col in output_order:
169 rec.append(str(g_tbl[k][col]))
170 db_f.write("\t".join(rec) + "\n")
171
172 db_f.close()
173
174
175 def listgn(root_path, args):
176 """List installed genomes in local database on screen.
177
178 Args:
179 root_path: ngs.plot installation directory.
180 args: currently not used.
181 Returns: None.
182 """
183
184 import math
185
186 (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "vector")
187
188 # Format column widths to beautify output.
189 tab_u = 4
190 v_cw = map(lambda x: int(math.ceil(float(x) / tab_u) * tab_u + 1), v_cw)
191
192 # List genomes to screen.
193 print "".join(map(lambda x, y: x.ljust(y), h_sp, v_cw)) # header.
194 for k in sorted(g_tbl.viewkeys(), key=str.lower):
195 print "".join(map(lambda x, y: x.ljust(y), g_tbl[k], v_cw))
196
197 def isExAnno(pkg_files):
198 """If the package is an extension package based on cell lines for ngs.plot,
199 like enhancer or dhs.
200
201 Args:
202 pkg_files: list of files in the package.
203 Returns:
204 isEx: Bool, if the package is an extension package.
205 feature: string, feature name contained in the extension package. None
206 if not an extension package.
207 RDcount: number of RData tables in the package.
208 """
209
210 import os.path
211
212 exclusive_files = [".chrnames.refseq", ".chrnames.ensembl", ".metainfo"]
213 isEx = False
214 RDcount = 0
215 feature = None
216 for file_name in pkg_files:
217 if os.path.basename(file_name) in exclusive_files:
218 continue
219 if (not file_name.endswith("RData")) and (file_name.count("/")) > 0:
220 isEx = True
221 feature = file_name.split("/")[1]
222 elif file_name.endswith("RData"):
223 RDcount += 1
224 return (isEx, feature, RDcount)
225
226 def install(root_path, args):
227 """Interactive session for installing genome from package file.
228
229 Args:
230 root_path: ngs.plot installation directory.
231 args.yes: say yes to all questions.
232 args.pkg: path of package file.
233 Returns:
234 None.
235 """
236
237 import tarfile
238 import sys
239
240 yestoall = args.yes
241 pkg_file = args.pkg
242 # Package name: e.g. ngsplotdb_mm10_71_1.0.tar.gz.
243 # Information about the genome is in mm10/.metainfo.
244 try:
245 pkg_f = tarfile.open(pkg_file, "r:gz")
246 except tarfile.ReadError:
247 print "Read package file {0} error.".format(pkg_file),
248 print "The downloaded file may be corrupted."
249 sys.exit()
250
251 print "Extracting information from package...",
252 sys.stdout.flush()
253 pkg_files = pkg_f.getnames()
254 g_folder = pkg_files[0]
255 # Minus folder name and .metainfo file name.
256 (isEx, feature, RDcount) = isExAnno(pkg_files)
257 if isEx:
258 print feature + " extension package, ",
259 print "contains " + str(RDcount + 2) + " tables."
260 else:
261 print "contains " + str(RDcount + 2) + " tables."
262 # .metainfo file:
263 # "ID"<TAB>mm10
264 # "Assembly"<TAB>GRCm38
265 # "Species"<TAB>mus_musculus
266 # "EnsVer"<TAB>71
267 # "NPVer"<TAB>1.0
268 meta_f = pkg_f.extractfile(g_folder + "/.metainfo")
269 meta_tbl = {}
270 if meta_f:
271 for rec in meta_f:
272 rec_sp = rec.strip().split("\t")
273 meta_tbl[rec_sp[0]] = rec_sp[1] # format: Variable<TAB>Value
274 else:
275 print "No meta information found in the package file.",
276 print "The downloaded file may be corrupted."
277 sys.exit()
278 meta_f.close()
279 pkg_f.close()
280
281 gn_inst = meta_tbl["ID"]
282 assembly = meta_tbl["Assembly"]
283 species = meta_tbl["Species"]
284 ens_ver = float(meta_tbl["EnsVer"])
285 np_ver = float(meta_tbl["NPVer"])
286
287 # Database file.
288 (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "hash")
289
290 if gn_inst in g_tbl:
291 installed_ens = float(g_tbl[gn_inst]["EnsVer"])
292 installed_np = float(g_tbl[gn_inst]["NPVer"])
293
294 # For extension package.
295 # Only the same version with basic package could be installed.
296 if isEx:
297 if ens_ver == installed_ens and np_ver == installed_np:
298 print "Will upgrade the genome {0} with {1} annotation:".format(gn_inst, \
299 feature),
300 print "Ensembl: {0}; ngs.plot: {1}.".\
301 format(installed_ens, np_ver)
302 if yestoall:
303 install_pkg(root_path, pkg_file, gn_inst)
304 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
305 species, ens_ver, np_ver)
306 else:
307 ans = raw_input("Continue?(y/n): ")
308 while True:
309 if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"):
310 break
311 else:
312 ans = raw_input("Answer must be y/Y or n/N: ")
313 if(ans == "y" or ans == "Y"):
314 install_pkg(root_path, pkg_file, gn_inst)
315 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
316 species, ens_ver, np_ver)
317 return
318 else:
319 print "This is an extension package of " + feature + \
320 ", ENS version " + str(ens_ver) + ", NPVer" + str(np_ver) + \
321 ", please install the same version basic genome annotation first!"
322 return
323
324 # For basic package.
325 # Install a newer version.
326 if ens_ver > installed_ens or \
327 ens_ver == installed_ens and np_ver > installed_np:
328 print "Will upgrade the genome {0}:".format(gn_inst)
329 print "Ensembl: {0}==>{1}; ngs.plot: {2}==>{3}.".\
330 format(installed_ens, ens_ver, installed_np, np_ver),
331 if yestoall:
332 install_pkg(root_path, pkg_file, gn_inst, gn_inst)
333 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
334 species, ens_ver, np_ver)
335 else:
336 ans = raw_input("Continue?(y/n): ")
337 while True:
338 if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"):
339 break
340 else:
341 ans = raw_input("Answer must be y/Y or n/N: ")
342 if(ans == "y" or ans == "Y"):
343 install_pkg(root_path, pkg_file, gn_inst, gn_inst)
344 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
345 species, ens_ver, np_ver)
346 # Install an older version.
347 else:
348 print "Will install the same or older version",
349 print "of the genome {0}:".format(gn_inst)
350 print "Ensembl: {0}==>{1}; ngs.plot: {2}==>{3}.".\
351 format(installed_ens, ens_ver, installed_np, np_ver),
352 if yestoall:
353 install_pkg(root_path, pkg_file, gn_inst, gn_inst)
354 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
355 species, ens_ver, np_ver)
356 else:
357 ans = raw_input("Continue?(y/n): ")
358 while True:
359 if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"):
360 break
361 else:
362 ans = raw_input("Answer must be y/Y or n/N: ")
363 if(ans == "y" or ans == "Y"):
364 install_pkg(root_path, pkg_file, gn_inst, gn_inst)
365 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
366 species, ens_ver, np_ver)
367 # Totally new installation, only basic package could be installed.
368 else:
369 print "Will install new genome {0}:".format(gn_inst),
370 print "Ensembl=> v{0}; ngs.plot=> v{1}.".format(ens_ver, np_ver),
371 if yestoall:
372 install_pkg(root_path, pkg_file, gn_inst)
373 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
374 species, ens_ver, np_ver)
375 else:
376 ans = raw_input("Continue?(y/n): ")
377
378 while True:
379 if(ans == "y" or ans == "Y" or ans == "n" or ans == "N"):
380 break
381 else:
382 ans = raw_input("Answer must be y/Y or n/N:")
383 if(ans == "y" or ans == "Y"):
384 install_pkg(root_path, pkg_file, gn_inst)
385 update_gnlist(root_path, g_tbl, gn_inst, assembly, \
386 species, ens_ver, np_ver)
387
388
389 def install_pkg(root_path, pkg_file, new_gn, old_gn=None):
390 """Install genome from package file.
391
392 Args:
393 root_path: ngs.plot installation directory.
394 pkg_file: package file name.
395 old_gn: old genome name to be removed first.
396 Returns:
397 None.
398 """
399
400 import shutil
401 import tarfile
402 import sys
403
404 if old_gn:
405 print "Removing installed genome:{0}...".format(old_gn),
406 sys.stdout.flush()
407 shutil.rmtree(root_path + '/database/' + old_gn)
408 rm_dbtbl(root_path, old_gn)
409 print "Done"
410
411 print "Installing new genome...",
412 sys.stdout.flush()
413 try:
414 tar_f = tarfile.open(pkg_file, "r:gz")
415 except tarfile.ReadError:
416 print "Read package file {0} error: ".format(pkg_file),
417 print "downloaded file may be corrupted."
418 sys.exit()
419
420 try:
421 tar_f.extractall(root_path + "/database")
422 except tarfile.ExtractError:
423 print "Extract files from package error.",
424 print "The downloaded file may be corrupted."
425
426 add_dbtbl(root_path, new_gn)
427
428 print "Done"
429
430
431 def add_dbtbl(root_path, gn):
432 """Add a new genome into database meta tables.
433
434 Args:
435 root_path: ngs.plot installation directory.
436 gn: genome name to add.
437 Returns: None.
438 """
439
440 import os.path
441 import os
442 import glob
443 import tempfile
444 import subprocess
445
446 (dblist_f, dblist_n) = tempfile.mkstemp(text=True)
447 (gnlist_f, gnlist_n) = tempfile.mkstemp(text=True)
448
449 # Obtain a list of .RData file names to extract info from.
450 all_rdata = root_path + '/database/{0}/{1}*.RData'.format(gn, gn)
451 all_set = set(map(os.path.basename, glob.glob(all_rdata)))
452 all_rdata = root_path + '/database/{0}/*/{1}*.RData'.format(gn, gn)
453 all_set = all_set | set(map(os.path.basename, glob.glob(all_rdata)))
454 exm_rdata = root_path + '/database/{0}/{1}*exonmodel*.RData'.format(gn, gn)
455 exm_set = set(map(os.path.basename, glob.glob(exm_rdata)))
456 nex_list = sorted(list(all_set - exm_set))
457
458 # DB file list.
459 gn_ens_list = {} # Ensembl genome name unique list.
460 desired_ncols = 6
461 for fname in nex_list:
462 tokens = fname.split('.')
463 tokens = tokens[:6]
464 if tokens[-1] == 'RData':
465 tokens[-1] = 'NA'
466 tokens.extend(['NA'] * (desired_ncols - len(tokens)))
467 os.write(dblist_f, fname + "\t" + "\t".join(tokens) + "\n")
468 if tokens[1] == 'ensembl':
469 gn_ens_list[".".join(tokens[:3])] = 1
470 os.close(dblist_f)
471
472 # Ensembl genome name list.
473 for gn in sorted(gn_ens_list.viewkeys()):
474 os.write(gnlist_f, gn.replace(".", "\t") + "\n")
475 os.close(gnlist_f)
476
477 # Call external program to create table default file.
478 (default_f, default_n) = tempfile.mkstemp(text=True)
479 subprocess.call(["setTableDefaults.py", gnlist_n, default_n])
480 os.remove(gnlist_n)
481
482 # Call external program to calcualte DB score and install the new entries
483 # into ngs.plot database.
484 subprocess.call(["install.db.tables.r", default_n, dblist_n])
485 os.remove(default_n)
486 os.remove(dblist_n)
487
488
489 def remove(root_path, args):
490 """Remove genome from ngs.plot database.
491
492 Args:
493 root_path: ngs.plot installation directory.
494 args.yes: say yes to all questions.
495 args.gn: genome name.
496 Returns:
497 None.
498 """
499
500 import shutil
501 import sys
502
503 yestoall = args.yes
504 sub_ftr = args.ftr
505
506 (h_sp, g_tbl, v_cw) = read_gnlist(root_path, "hash")
507
508 gn = args.gn
509
510 if gn in g_tbl and sub_ftr is None:
511 print "Will remove genome {0} from database.".format(gn),
512 do_rm = False
513 if yestoall:
514 do_rm = True
515 else:
516 ans = raw_input("Continue?(y/n): ")
517 while True:
518 if ans == 'y' or ans == 'Y' or ans == 'n' or ans == 'N':
519 break
520 else:
521 ans = raw_input("The answer must be y/Y or n/N: ")
522 if ans == 'y' or ans == 'Y':
523 do_rm = True
524 if do_rm:
525 folder_to_rm = root_path + "/database/" + gn
526 print "Removing genome...",
527 sys.stdout.flush()
528 shutil.rmtree(folder_to_rm)
529 del g_tbl[gn]
530 write_gnlist(root_path, g_tbl)
531 rm_dbtbl(root_path, gn)
532 print "Done"
533 elif gn in g_tbl and sub_ftr is not None:
534 print "Will remove genomic feature {0} of genome {1} from database."\
535 .format(sub_ftr, gn)
536 do_rm = False
537 if yestoall:
538 do_rm = True
539 else:
540 ans = raw_input("Continue?(y/n): ")
541 while True:
542 if ans == 'y' or ans == 'Y' or ans == 'n' or ans == 'N':
543 break
544 else:
545 ans = raw_input("The answer must be y/Y or n/N: ")
546 if ans == 'y' or ans == 'Y':
547 do_rm = True
548 if do_rm:
549 folder_to_rm = root_path + "/database/" + gn + "/" + sub_ftr
550 print "Removing genome...",
551 sys.stdout.flush()
552 shutil.rmtree(folder_to_rm)
553 # g_tbl does't need to be updated
554 rm_dbtbl(root_path, gn, sub_ftr)
555 print "Done"
556 else:
557 print "Cannot find the genome in database. Nothing was done."
558
559
560 def rm_dbtbl(root_path, gn, sub_ftr=None):
561 """Remove a genome from database meta tables.
562
563 Args:
564 root_path: ngs.plot installation directory.
565 gn: genome name to remove.
566 Returns: None.
567 """
568
569 import subprocess
570
571 if sub_ftr is None:
572 subprocess.call(["remove.db.tables.r", gn])
573 else:
574 subprocess.call(["remove.db.tables.r", gn, sub_ftr])
575
576 def chrnames(root_path, args):
577 """List chromosome names for a given genome.
578
579 Args:
580 root_path: ngs.plot installation directory.
581 args.gn: genome name to remove.
582 args.db: database(ensembl or refseq).
583 Returns: None.
584 """
585
586 import sys
587
588 db = args.db
589 gn = args.gn
590
591 if db != "ensembl" and db != "refseq":
592 print "Unrecognized database name: database must be ensembl or refseq."
593 sys.exit()
594
595 chrnames_file = root_path + "/database/" + gn + "/.chrnames." + db
596
597 try:
598 chrf = open(chrnames_file)
599 except IOError:
600 print "Open file: {0} error.".format(chrnames_file),
601 print "Your database may be corrupted or you have an older version."
602 sys.exit()
603
604 chr_list = chrf.read(1000000) # set 1MB limit to avoid nasty input.
605 chrf.close()
606
607 chr_list = chr_list.strip()
608 print chr_list
609
610
611
612 ###############################################
613 ######## Main procedure. ######################
614 if __name__ == "__main__":
615
616 import argparse
617 import os
618 import sys
619
620 try:
621 root_path = os.environ["NGSPLOT"]
622 except KeyError:
623 print "Environmental variable NGSPLOT is not set! Exit.\n"
624 sys.exit()
625
626 # Main parser.
627 parser = argparse.ArgumentParser(description="Manipulate ngs.plot's \
628 annotation database",
629 prog="ngsplotdb.py")
630 parser.add_argument("-y", "--yes", help="Say yes to all prompted questions",
631 action="store_true")
632
633 subparsers = parser.add_subparsers(title="Subcommands",
634 help="additional help")
635
636 # ngsplotdb.py list parser.
637 parser_list = subparsers.add_parser("list", help="List genomes installed \
638 in database")
639 parser_list.set_defaults(func=listgn)
640
641 # ngsplotdb.py install parser.
642 parser_install = subparsers.add_parser("install",
643 help="Install genome from package \
644 file")
645 parser_install.add_argument("pkg", help="Package file(.tar.gz) to install",
646 type=str)
647 parser_install.set_defaults(func=install)
648
649 # ngsplotdb.py remove parser.
650 parser_remove = subparsers.add_parser("remove",
651 help="Remove genome from database")
652 parser_remove.add_argument("gn", help="Name of genome to be \
653 removed(e.g. hg18)", type=str)
654 parser_remove.add_argument("--ftr", help="Remove cellline specific features \
655 (enhancer, dhs, etc)", type=str,\
656 default=None)
657 parser_remove.set_defaults(func=remove)
658
659 # ngsplotdb.py chromosome names parser.
660 parser_chrnames = subparsers.add_parser("chrnames",
661 help="List chromosome names")
662 parser_chrnames.add_argument("gn", help="Genome name(e.g. mm9)", type=str)
663 parser_chrnames.add_argument("db", help="Database(ensembl or refseq)",
664 type=str)
665 parser_chrnames.set_defaults(func=chrnames)
666
667
668 args = parser.parse_args()
669 args.func(root_path, args)
670