Mercurial > repos > matthias > data_manager_dada2
comparison data_manager/data_manager.py @ 3:3a4ee8bf012a draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2 commit 5b1603bbcd3f139cad5c876be83fcb39697b5613-dirty
author | matthias |
---|---|
date | Tue, 09 Apr 2019 07:18:57 -0400 |
parents | b4c303665291 |
children |
comparison
equal
deleted
inserted
replaced
2:b4c303665291 | 3:3a4ee8bf012a |
---|---|
12 from urllib2 import Request, urlopen | 12 from urllib2 import Request, urlopen |
13 | 13 |
14 DEFAULT_TAXLEVELS="Kingdom,Phylum,Class,Order,Family,Genus,Species" | 14 DEFAULT_TAXLEVELS="Kingdom,Phylum,Class,Order,Family,Genus,Species" |
15 | 15 |
16 FILE2NAME = { | 16 FILE2NAME = { |
17 "silva132":"Silva version 132", | 17 "silva_132":"Silva version 132", |
18 "silva128":"Silva version 128", | 18 "silva_128":"Silva version 128", |
19 "rdp16":"RDP trainset 16", | 19 "rdp_16":"RDP trainset 16", |
20 "rdp14":"RDP trainset 14", | 20 "rdp_14":"RDP trainset 14", |
21 "gg13.84":"GreenGenes version 13.8", | 21 "greengenes_13.84":"GreenGenes version 13.84", |
22 "unite8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", | 22 "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", |
23 "unite8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", | 23 "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", |
24 "unite8.0_euka": "UNITE: General Fasta release 8.0 for all Eukaryotes", | |
25 "unite8.0_euka_singletons": "UNITE: General Fasta release 8.0 for all Eukaryotes including global and 97% singletons", | |
26 "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", | 24 "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", |
27 "gtdb_2018_11_20": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", | 25 "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", |
28 "hitdb1": "HitDB version 1 (Human InTestinal 16S rRNA)", | 26 "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", |
29 "silva132_euk_18S": "Silva version 132 Eukaryotic 18S", | 27 "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", |
30 "PR2v4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" | 28 "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" |
31 } | 29 } |
32 | 30 |
33 FILE2TAXURL = { | 31 FILE2TAXURL = { |
34 "silva132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", | 32 "silva_132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", |
35 "silva128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", | 33 "silva_128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", |
36 "rdp16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", | 34 "rdp_16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", |
37 "rdp14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", | 35 "rdp_14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", |
38 "unite8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", | 36 "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", |
39 "unite8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", | 37 "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", |
40 "unite8.0_euka": "https://files.plutof.ut.ee/public/orig/D6/96/D69658E99589D888A207805A744019DBA4EC0F603E67E53732767B3E03A5AA86.zip", | 38 "greengenes_13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", |
41 "unite8.0_euka_singletons": "https://files.plutof.ut.ee/doi/C2/20/C22034350E32D6AD7E5D1AF3F8BC487E34DA0BE25602B0E748906005CE6ADA97.zip", | |
42 "gg13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", | |
43 "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", | 39 "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", |
44 "gtdb_2018_11_20": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", | 40 "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", |
45 "hitdb1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", | 41 "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", |
46 "silva132_euk_18S": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", | 42 "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", |
47 "PR2v4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" | 43 "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" |
48 } | 44 } |
49 | 45 |
50 FILE2SPECIESURL = { | 46 FILE2SPECIESURL = { |
51 "silva132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", | 47 "silva_132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", |
52 "silva128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", | 48 "silva_128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", |
53 "rdp16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", | 49 "rdp_16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", |
54 "rdp14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" | 50 "rdp_14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" |
55 } | 51 } |
56 | 52 |
57 FILE2TAXLEVELS = { | 53 FILE2TAXLEVELS = { |
58 "PR2v4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" | 54 "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" |
59 } | 55 } |
60 | 56 |
61 def url_download(url, fname, workdir): | 57 def url_download(url, fname, workdir): |
62 """ | 58 """ |
63 download url to workdir/fname | 59 download url to workdir/fname |
64 | |
65 return the path to the resulting file | |
66 """ | 60 """ |
67 file_path = os.path.join(workdir, fname) | 61 file_path = os.path.join(workdir, fname) |
68 if not os.path.exists(workdir): | 62 if not os.path.exists(workdir): |
69 os.makedirs(workdir) | 63 os.makedirs(workdir) |
70 src = None | 64 src = None |
92 # unzip download | 86 # unzip download |
93 zip_ref = zipfile.ZipFile(file_path, 'r') | 87 zip_ref = zipfile.ZipFile(file_path, 'r') |
94 zip_ref.extractall(workdir) | 88 zip_ref.extractall(workdir) |
95 zip_ref.close() | 89 zip_ref.close() |
96 # gzip top level fasta file | 90 # gzip top level fasta file |
97 fastas = glob.glob("*fasta") | 91 fastas = glob.glob("%s/*fasta"%workdir) |
98 if len(fastas) != 1: | 92 if len(fastas) != 1: |
99 msg = "UNITE download %s contained more than one or no fasta file" | 93 msg = "UNITE download %s contained %d fasta file(s): %s"%(url, len(fastas), " ".join(fastas)) |
100 raise Exception(msg) | 94 raise Exception(msg) |
101 with open(fastas[0], 'rb') as f_in: | 95 with open(fastas[0], 'rb') as f_in: |
102 with gzip.open(file_path, 'wb') as f_out: | 96 with gzip.open(file_path, 'wb') as f_out: |
103 shutil.copyfileobj(f_in, f_out) | 97 shutil.copyfileobj(f_in, f_out) |
104 | 98 |
105 return fname | |
106 | 99 |
107 def main(dataset, outjson): | 100 def remote_dataset(dataset, outjson): |
108 | 101 |
109 params = json.loads(open(outjson).read()) | 102 with open(outjson) as jf: |
110 target_directory = params['output_data'][0]['extra_files_path'] | 103 params = json.loads(jf.read()) |
111 os.mkdir(target_directory) | |
112 output_path = os.path.abspath(os.path.join(os.getcwd(), 'dada2')) | |
113 | 104 |
114 workdir = os.path.join(os.getcwd(), 'dada2') | 105 workdir = params['output_data'][0]['extra_files_path'] |
115 path = url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir) | 106 os.mkdir(workdir) |
107 url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir) | |
116 | 108 |
117 data_manager_json = {"data_tables":{}} | 109 data_manager_json = {"data_tables":{}} |
118 data_manager_entry = {} | 110 data_manager_entry = {} |
119 data_manager_entry['value'] = dataset | 111 data_manager_entry['value'] = dataset |
120 data_manager_entry['name'] = FILE2NAME[dataset] | 112 data_manager_entry['name'] = FILE2NAME[dataset] |
121 data_manager_entry['path'] = dataset+".taxonomy" | 113 data_manager_entry['path'] = dataset+".taxonomy" |
122 data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) | 114 data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) |
123 data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry | 115 data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry |
124 | 116 |
125 | |
126 if FILE2SPECIESURL.get(dataset, False ): | 117 if FILE2SPECIESURL.get(dataset, False ): |
127 path = url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir) | 118 url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir) |
128 | |
129 data_manager_entry = {} | 119 data_manager_entry = {} |
130 data_manager_entry['value'] = dataset | 120 data_manager_entry['value'] = dataset |
131 data_manager_entry['name'] = FILE2NAME[dataset] | 121 data_manager_entry['name'] = FILE2NAME[dataset] |
132 data_manager_entry['path'] = dataset+".species" | 122 data_manager_entry['path'] = dataset+".species" |
133 data_manager_json["data_tables"]["dada2_species"] = data_manager_entry | 123 data_manager_json["data_tables"]["dada2_species"] = data_manager_entry |
134 | |
135 for filename in os.listdir(workdir): | |
136 shutil.move(os.path.join(output_path, filename), target_directory) | |
137 | 124 |
138 sys.stderr.write("JSON %s" %json.dumps(data_manager_json)) | 125 with file(outjson, 'w') as jf: |
139 file(outjson, 'w').write(json.dumps(data_manager_json)) | 126 jf.write(json.dumps(data_manager_json)) |
140 | 127 |
141 if __name__ == '__main__': | 128 if __name__ == '__main__': |
142 parser = argparse.ArgumentParser(description='Create data manager json.') | 129 parser = argparse.ArgumentParser(description='Create data manager json.') |
143 parser.add_argument('--out', action='store', help='JSON filename') | 130 parser.add_argument('--out', action='store', help='JSON filename') |
144 parser.add_argument('--dataset', action='store', help='Download data set name') | 131 parser.add_argument('--dataset', action='store', help='Download data set name') |
145 args = parser.parse_args() | 132 args = parser.parse_args() |
146 | 133 |
147 main(args.dataset, args.out) | 134 remote_dataset(args.dataset, args.out) |