# HG changeset patch # User matthias # Date 1552041524 18000 # Node ID b4c30366529118a3b27c9463caa1e672698fbad5 # Parent 1c50cfb0c0ab0350c8f7fdd01b514ace43dff5f9 planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2 commit eec95ccc2189355061112ea2785b82f13a0fa077-dirty diff -r 1c50cfb0c0ab -r b4c303665291 data_manager/.dada2_fetcher.xml.swp Binary file data_manager/.dada2_fetcher.xml.swp has changed diff -r 1c50cfb0c0ab -r b4c303665291 data_manager/dada2_fetcher.xml --- a/data_manager/dada2_fetcher.xml Thu Mar 07 11:48:18 2019 -0500 +++ b/data_manager/dada2_fetcher.xml Fri Mar 08 05:38:44 2019 -0500 @@ -1,26 +1,29 @@ - + Download reference data sets - + - - + + + + + + + + + + @@ -32,8 +35,56 @@ - -http://www.arb-silva.de/silva-license-information - + +- RDP trainset 14 +- GreenGenes version 13.8 + +While the Silva and RDP data sets contain reference databases for taxonomy and species assignment, the GreenGenes data set only contains a reference database for taxonomy assignment. + +For the Silva data sets, please check the license information: http://www.arb-silva.de/silva-license-information. 
+ + + +(More info: http://gtdb.ecogenomic.org/) + +https://github.com/pr2database/pr2database + + +Custom Reference data sets +-------------------------- + +For **taxonomy assignment** the following is needed: + +- a reference fasta database +- a comma-separated list of taxonomic ranks present in the reference database + +The reference fasta database for taxonomic assignment (fasta or compressed fasta) needs to encode the taxonomy corresponding to each sequence in the fasta header lines in the following fashion (note that the second sequence is not assigned down to level 6): + +:: + +>Level1;Level2;Level3;Level4;Level5;Level6; +ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC +>Level1;Level2;Level3;Level4;Level5; +CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC + +The list of required taxonomic ranks could be, for instance: "Kingdom,Phylum,Class,Order,Family,Genus" + +The reference database for **species assignment** is a fasta file (or compressed fasta file), with the id line formatted as follows: + +:: + +>ID Genus species +ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC +>ID Genus species +CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC + ]]> diff -r 1c50cfb0c0ab -r b4c303665291 data_manager/data_manager.py --- a/data_manager/data_manager.py Thu Mar 07 11:48:18 2019 -0500 +++ b/data_manager/data_manager.py Fri Mar 08 05:38:44 2019 -0500 @@ -19,6 +19,15 @@ "rdp16":"RDP trainset 16", "rdp14":"RDP trainset 14", "gg13.84":"GreenGenes version 13.8", + "unite8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", + "unite8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", + "unite8.0_euka": "UNITE: General Fasta release 8.0 for all Eukaryotes", + "unite8.0_euka_singletons": "UNITE: General Fasta release 8.0 for all 
Eukaryotes including global and 97% singletons", + "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", + "gtdb_2018_11_20": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", + "hitdb1": "HitDB version 1 (Human InTestinal 16S rRNA)", + "silva132_euk_18S": "Silva version 132 Eukaryotic 18S", + "PR2v4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" } FILE2TAXURL = { @@ -26,7 +35,16 @@ "silva128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", "rdp16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", "rdp14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", + "unite8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", + "unite8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", + "unite8.0_euka": "https://files.plutof.ut.ee/public/orig/D6/96/D69658E99589D888A207805A744019DBA4EC0F603E67E53732767B3E03A5AA86.zip", + "unite8.0_euka_singletons": "https://files.plutof.ut.ee/doi/C2/20/C22034350E32D6AD7E5D1AF3F8BC487E34DA0BE25602B0E748906005CE6ADA97.zip", "gg13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", + "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", + "gtdb_2018_11_20": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", + "hitdb1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", + "silva132_euk_18S": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", + "PR2v4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" } FILE2SPECIESURL = { @@ -37,6 +55,7 @@ } FILE2TAXLEVELS = { + "PR2v4.11.1": 
"Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" } def url_download(url, fname, workdir): @@ -63,7 +82,27 @@ finally: if src: src.close() - return os.path.join(workdir, fname) + + #special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) + if fname.startswith("unite"): + import glob + import gzip + import shutil + import zipfile + # unzip download + zip_ref = zipfile.ZipFile(file_path, 'r') + zip_ref.extractall(workdir) + zip_ref.close() + # gzip top level fasta file + fastas = glob.glob("*fasta") + if len(fastas) != 1: + msg = "UNITE download %s contained more than one or no fasta file" + raise Exception(msg) + with open(fastas[0], 'rb') as f_in: + with gzip.open(file_path, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + return fname def main(dataset, outjson): @@ -73,28 +112,30 @@ output_path = os.path.abspath(os.path.join(os.getcwd(), 'dada2')) workdir = os.path.join(os.getcwd(), 'dada2') - path = url_download( FILE2TAXURL[dataset], taxdataset+".taxonomy", workdir) + path = url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir) data_manager_json = {"data_tables":{}} data_manager_entry = {} data_manager_entry['value'] = dataset data_manager_entry['name'] = FILE2NAME[dataset] - data_manager_entry['path'] = path + data_manager_entry['path'] = dataset+".taxonomy" data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry - if FILE2SPECIES.get(dataset, False ): - path = url_download( FILE2SPECIES[dataset], taxdataset+".species", workdir) + if FILE2SPECIESURL.get(dataset, False ): + path = url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir) data_manager_entry = {} data_manager_entry['value'] = dataset data_manager_entry['name'] = FILE2NAME[dataset] - data_manager_entry['path'] = path + data_manager_entry['path'] = dataset+".species" 
data_manager_json["data_tables"]["dada2_species"] = data_manager_entry for filename in os.listdir(workdir): shutil.move(os.path.join(output_path, filename), target_directory) + + sys.stderr.write("JSON %s" %json.dumps(data_manager_json)) file(outjson, 'w').write(json.dumps(data_manager_json)) if __name__ == '__main__': diff -r 1c50cfb0c0ab -r b4c303665291 tool-data/.dada2_species.loc.sample.swp Binary file tool-data/.dada2_species.loc.sample.swp has changed diff -r 1c50cfb0c0ab -r b4c303665291 tool-data/.dada2_taxonomy.loc.sample.swp Binary file tool-data/.dada2_taxonomy.loc.sample.swp has changed