comparison: data_manager/data_manager.py @ 9:facf9e6c872c (draft, branch default, tip)

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2 commit df2dfeb75f88b326f567cab8df4e6c4a7f2e548c
author:   matthias
date:     Tue, 15 Oct 2019 07:20:59 -0400
parents:  da93e6a3fe23
children: (none)
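The changeset is a whitespace/PEP8 cleanup of the dada2 reference-data manager: the unused top-level shutil, sys and zipfile imports are dropped (the UNITE branch imports them locally) and spacing around operators and dict colons is normalised, with no behavioural change. For context, the script downloads the selected reference database into the output dataset's extra_files_path and writes back a data-manager JSON that populates the dada2_taxonomy table and, where a species-assignment file exists, the dada2_species table. A minimal sketch of that JSON for the rdp_16 dataset follows; it is illustrative (pretty-printed here, whereas the script emits compact json.dumps output), with names taken from the FILE2NAME/FILE2SPECIESURL tables and DEFAULT_TAXLEVELS in the code below, and paths relative to the extra_files_path:

{
  "data_tables": {
    "dada2_taxonomy": {
      "value": "rdp_16",
      "name": "RDP trainset 16",
      "path": "rdp_16.taxonomy",
      "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"
    },
    "dada2_species": {
      "value": "rdp_16",
      "name": "RDP trainset 16",
      "path": "rdp_16.species"
    }
  }
}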
--- data_manager/data_manager.py (8:da93e6a3fe23)
+++ data_manager/data_manager.py (9:facf9e6c872c)
@@ -1,60 +1,58 @@
 import argparse
 import json
 import os
-import shutil
-import sys
-import zipfile
 try:
     # For Python 3.0 and later
     from urllib.request import Request, urlopen
 except ImportError:
     # Fall back to Python 2 imports
     from urllib2 import Request, urlopen
 
-DEFAULT_TAXLEVELS="Kingdom,Phylum,Class,Order,Family,Genus,Species"
+DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species"
 
 FILE2NAME = {
-    "silva_132":"Silva version 132",
-    "silva_128":"Silva version 128",
-    "rdp_16":"RDP trainset 16",
-    "rdp_14":"RDP trainset 14",
-    "greengenes_13.84":"GreenGenes version 13.84",
+    "silva_132": "Silva version 132",
+    "silva_128": "Silva version 128",
+    "rdp_16": "RDP trainset 16",
+    "rdp_14": "RDP trainset 14",
+    "greengenes_13.84": "GreenGenes version 13.84",
     "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi",
     "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons",
     "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)",
     "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)",
     "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)",
     "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S",
     "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1"
 }
 
 FILE2TAXURL = {
-    "silva_132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
-    "silva_128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
-    "rdp_16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
-    "rdp_14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
+    "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
+    "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
+    "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
+    "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
     "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip",
     "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip",
-    "greengenes_13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
+    "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
     "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1",
     "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1",
     "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1",
     "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1",
     "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz"
 }
 
 FILE2SPECIESURL = {
-    "silva_132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
-    "silva_128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
-    "rdp_16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
-    "rdp_14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1"
+    "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
+    "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
+    "rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
+    "rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1"
 }
 
 FILE2TAXLEVELS = {
     "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"
 }
+
 
 def url_download(url, fname, workdir):
     """
     download url to workdir/fname
     """
@@ -75,24 +73,24 @@ def url_download(url, fname, workdir):
                     break
     finally:
         if src:
             src.close()
 
-    #special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
+    # special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
     if fname.startswith("unite"):
         import glob
         import gzip
         import shutil
         import zipfile
         # unzip download
         zip_ref = zipfile.ZipFile(file_path, 'r')
         zip_ref.extractall(workdir)
         zip_ref.close()
         # gzip top level fasta file
-        fastas = glob.glob("%s/*fasta"%workdir)
+        fastas = glob.glob("%s/*fasta" % workdir)
         if len(fastas) != 1:
-            msg = "UNITE download %s contained %d fasta file(s): %s"%(url, len(fastas), " ".join(fastas))
+            msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas))
             raise Exception(msg)
         with open(fastas[0], 'rb') as f_in:
             with gzip.open(file_path, 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
 
@@ -102,30 +100,31 @@
     with open(outjson) as jf:
         params = json.loads(jf.read())
 
     workdir = params['output_data'][0]['extra_files_path']
     os.mkdir(workdir)
-    url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir)
+    url_download( FILE2TAXURL[dataset], dataset + ".taxonomy", workdir)
 
-    data_manager_json = {"data_tables":{}}
+    data_manager_json = {"data_tables": {}}
     data_manager_entry = {}
     data_manager_entry['value'] = dataset
     data_manager_entry['name'] = FILE2NAME[dataset]
-    data_manager_entry['path'] = dataset+".taxonomy"
+    data_manager_entry['path'] = dataset + ".taxonomy"
     data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS)
     data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry
 
     if FILE2SPECIESURL.get(dataset, False ):
-        url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir)
+        url_download( FILE2SPECIESURL[dataset], dataset + ".species", workdir)
         data_manager_entry = {}
         data_manager_entry['value'] = dataset
         data_manager_entry['name'] = FILE2NAME[dataset]
-        data_manager_entry['path'] = dataset+".species"
+        data_manager_entry['path'] = dataset + ".species"
         data_manager_json["data_tables"]["dada2_species"] = data_manager_entry
 
     with file(outjson, 'w') as jf:
         jf.write(json.dumps(data_manager_json))
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Create data manager json.')
     parser.add_argument('--out', action='store', help='JSON filename')
     parser.add_argument('--dataset', action='store', help='Download data set name')