# HG changeset patch # User matthias # Date 1554808737 14400 # Node ID 3a4ee8bf012a03bf6fa1a0fc3657b1cd05b9f9c1 # Parent b4c30366529118a3b27c9463caa1e672698fbad5 planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2 commit 5b1603bbcd3f139cad5c876be83fcb39697b5613-dirty diff -r b4c303665291 -r 3a4ee8bf012a data_manager/.megan_tools_fetcher.xml.swp Binary file data_manager/.megan_tools_fetcher.xml.swp has changed diff -r b4c303665291 -r 3a4ee8bf012a data_manager/dada2_fetcher.xml --- a/data_manager/dada2_fetcher.xml Fri Mar 08 05:38:44 2019 -0500 +++ b/data_manager/dada2_fetcher.xml Tue Apr 09 07:18:57 2019 -0400 @@ -1,90 +1,174 @@ - - Download reference data sets - - + Download reference databases + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -- RDP trainset 14 -- GreenGenes version 13.8 +- Silva (https://www.arb-silva.de/) +- RDP (http://rdp.cme.msu.edu/) +- GreenGenes (http://greengenes.secondgenome.com/) +- UNITE general FASTA (https://unite.ut.ee/repository.php) -While the Silva and RDP data sets contain reference data bases for taxonomy and species assignment, the greengenes data set only contains a reference data base for taxonomy assignment. +While Silva and RDP contain reference databases for taxonomy and species assignment, the greengenes and UNITE databases only contains a reference database for taxonomy assignment. -For the Silva data sets consider to check the license information: http://www.arb-silva.de/silva-license-information. +For the Silva databases check the license information: http://www.arb-silva.de/silva-license-information. - +Except for UNITE all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. 
The UNITE databases are taken from the links provided on the UNITE website. -(More info: http://gtdb.ecogenomic.org/) - -https://github.com/pr2database/pr2database +More detailed information on the reference databases can be found on the DADA2 website and the links it contains: https://benjjneb.github.io/dada2/training.html. +Further public Reference databases listed by the DADA2 project +.............................................................. -Custom Reference data sets --------------------------- +Several contributed reference databases are listed on the DADA2 project website (https://benjjneb.github.io/dada2/training.html): -For ** taxonomy assignment ** the following is needed: +- RefSeq + RDP (NCBI RefSeq 16S rRNA database supplemented by RDP) +- GTDB: Genome Taxonomy Database (More info: http://gtdb.ecogenomic.org/) +- HitDB version 1 (Human InTestinal 16S rRNA) (https://github.com/microbiome/HITdb) +- RDP fungi LSU +- Silva Eukaryotic 18S +- PR2 (https://github.com/pr2database/pr2database) -- a reference fasta data base -- a comma separated list of taxonomic ranks present in the reference data base - -The reference fasta data base for taxonomic assignment (fasta or compressed fasta) needs to encode the taxonomy corresponding to each sequence in the fasta header lines in the following fashion (note, the second sequence is not assigned down to level 6): - -:: +Except for PR2, all reference databases are downloaded from the corresponding Zenodo links that are listed on the DADA2 website. The PR2 database is taken from its GitHub page. 
->Level1;Level2;Level3;Level4;Level5;Level6; -ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC ->Level1;Level2;Level3;Level4;Level5; -CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC - -The list of required taxonomic ranks could be for instance: "Kingdom,Phylum,Class,Order,Family,Genus" - -The reference data base for ** species assignment ** is a fasta file (or compressed fasta file), with the id line formatted as follows: - -:: - ->ID Genus species -ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC ->ID Genus species -CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC +More detailed informations in the reference data bases can be found on the DADA2 website and contained links: https://benjjneb.github.io/dada2/training.html. ]]> + + + 10.1093/nar/gks1219 + > + 10.1093/nar/gkt1244 + + 10.1128/AEM.03006-05 + + 10.15156/BIO/786343 + + + 10.1186/s12864-015-2265-y + + 10.1093/nar/gks1160 + diff -r b4c303665291 -r 3a4ee8bf012a data_manager/data_manager.py --- a/data_manager/data_manager.py Fri Mar 08 05:38:44 2019 -0500 +++ b/data_manager/data_manager.py Tue Apr 09 07:18:57 2019 -0400 @@ -14,55 +14,49 @@ DEFAULT_TAXLEVELS="Kingdom,Phylum,Class,Order,Family,Genus,Species" FILE2NAME = { - "silva132":"Silva version 132", - "silva128":"Silva version 128", - "rdp16":"RDP trainset 16", - "rdp14":"RDP trainset 14", - "gg13.84":"GreenGenes version 13.8", - "unite8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", - "unite8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", - "unite8.0_euka": "UNITE: General Fasta release 8.0 for all Eukaryotes", - "unite8.0_euka_singletons": "UNITE: General Fasta release 8.0 for all Eukaryotes including global and 97% singletons", + "silva_132":"Silva version 132", + "silva_128":"Silva version 128", 
+ "rdp_16":"RDP trainset 16", + "rdp_14":"RDP trainset 14", + "greengenes_13.84":"GreenGenes version 13.84", + "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", + "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", - "gtdb_2018_11_20": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", - "hitdb1": "HitDB version 1 (Human InTestinal 16S rRNA)", - "silva132_euk_18S": "Silva version 132 Eukaryotic 18S", - "PR2v4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" + "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", + "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", + "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", + "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" } FILE2TAXURL = { - "silva132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", - "silva128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", - "rdp16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", - "rdp14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", - "unite8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", - "unite8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", - "unite8.0_euka": "https://files.plutof.ut.ee/public/orig/D6/96/D69658E99589D888A207805A744019DBA4EC0F603E67E53732767B3E03A5AA86.zip", - "unite8.0_euka_singletons": "https://files.plutof.ut.ee/doi/C2/20/C22034350E32D6AD7E5D1AF3F8BC487E34DA0BE25602B0E748906005CE6ADA97.zip", - "gg13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", + 
"silva_132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", + "silva_128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", + "rdp_16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", + "rdp_14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", + "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", + "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", + "greengenes_13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", - "gtdb_2018_11_20": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", - "hitdb1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", - "silva132_euk_18S": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", - "PR2v4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" + "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", + "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", + "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", + "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" } FILE2SPECIESURL = { - "silva132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", - "silva128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", - 
"rdp16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", - "rdp14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" + "silva_132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", + "silva_128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", + "rdp_16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", + "rdp_14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" } FILE2TAXLEVELS = { - "PR2v4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" + "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" } def url_download(url, fname, workdir): """ download url to workdir/fname - - return the path to the resulting file """ file_path = os.path.join(workdir, fname) if not os.path.exists(workdir): @@ -94,25 +88,23 @@ zip_ref.extractall(workdir) zip_ref.close() # gzip top level fasta file - fastas = glob.glob("*fasta") + fastas = glob.glob("%s/*fasta"%workdir) if len(fastas) != 1: - msg = "UNITE download %s contained more than one or no fasta file" + msg = "UNITE download %s contained %d fasta file(s): %s"%(url, len(fastas), " ".join(fastas)) raise Exception(msg) with open(fastas[0], 'rb') as f_in: with gzip.open(file_path, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) - return fname -def main(dataset, outjson): +def remote_dataset(dataset, outjson): - params = json.loads(open(outjson).read()) - target_directory = params['output_data'][0]['extra_files_path'] - os.mkdir(target_directory) - output_path = os.path.abspath(os.path.join(os.getcwd(), 'dada2')) + with open(outjson) as jf: + params = json.loads(jf.read()) - workdir = os.path.join(os.getcwd(), 'dada2') - path = url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir) + workdir = params['output_data'][0]['extra_files_path'] + os.mkdir(workdir) + 
url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir) data_manager_json = {"data_tables":{}} data_manager_entry = {} @@ -122,21 +114,16 @@ data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry - if FILE2SPECIESURL.get(dataset, False ): - path = url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir) - + url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir) data_manager_entry = {} data_manager_entry['value'] = dataset data_manager_entry['name'] = FILE2NAME[dataset] data_manager_entry['path'] = dataset+".species" data_manager_json["data_tables"]["dada2_species"] = data_manager_entry - - for filename in os.listdir(workdir): - shutil.move(os.path.join(output_path, filename), target_directory) - sys.stderr.write("JSON %s" %json.dumps(data_manager_json)) - file(outjson, 'w').write(json.dumps(data_manager_json)) + with file(outjson, 'w') as jf: + jf.write(json.dumps(data_manager_json)) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Create data manager json.') @@ -144,4 +131,4 @@ parser.add_argument('--dataset', action='store', help='Download data set name') args = parser.parse_args() - main(args.dataset, args.out) + remote_dataset(args.dataset, args.out) diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/PR24.11.1_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/PR24.11.1_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "PR2_4.11.1.taxonomy", "name": "Protist Ribosomal Reference database (PR2) 4.11.1", "value": "PR2_4.11.1", "taxlevels": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/RefSeq_RDP2018_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/RefSeq_RDP2018_json Tue Apr 09 07:18:57 2019 
-0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "RefSeq_RDP_2018_05.taxonomy", "name": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "value": "RefSeq_RDP_2018_05", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/greengenes13.84_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/greengenes13.84_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "greengenes_13.84.taxonomy", "name": "GreenGenes version 13.84", "value": "greengenes_13.84", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/gtdb2018_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/gtdb2018_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "gtdb_2018_11.taxonomy", "name": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", "value": "gtdb_2018_11", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/hitdb1_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/hitdb1_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "hitdb_1.taxonomy", "name": "HitDB version 1 (Human InTestinal 16S rRNA)", "value": "hitdb_1", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/rdp16_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/rdp16_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_species": {"path": "rdp_16.species", "name": "RDP trainset 16", "value": "rdp_16"}, "dada2_taxonomy": {"path": 
"rdp_16.taxonomy", "name": "RDP trainset 16", "value": "rdp_16", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/silva132_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/silva132_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_species": {"path": "silva_132.species", "name": "Silva version 132", "value": "silva_132"}, "dada2_taxonomy": {"path": "silva_132.taxonomy", "name": "Silva version 132", "value": "silva_132", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/silvaeuk132_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/silvaeuk132_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "silva_euk_18S_132.taxonomy", "name": "Silva version 132 Eukaryotic 18S", "value": "silva_euk_18S_132", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a data_manager/test-data/unite8fungi_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/test-data/unite8fungi_json Tue Apr 09 07:18:57 2019 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "unite_8.0_fungi.taxonomy", "name": "UNITE: General Fasta release 8.0 for Fungi", "value": "unite_8.0_fungi", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r b4c303665291 -r 3a4ee8bf012a test-data/SSURef_Nr99_132_tax_silva_to_NCBI_synonyms_json