Mercurial > repos > matthias > data_manager_dada2

Binary file data_manager/.dada2_fetcher.xml.swp has changed
--- a/data_manager/dada2_fetcher.xml	Thu Mar 07 11:48:18 2019 -0500
+++ b/data_manager/dada2_fetcher.xml	Fri Mar 08 05:38:44 2019 -0500
@@ -1,26 +1,29 @@
 <?xml version="1.0"?>
-<tool id="dada_fetcher" name="dada2 dada manager" tool_type="manage_data" version="0.0.2">
+<tool id="dada_fetcher" name="dada2 dada manager" tool_type="manage_data" version="0.0.5">
     <description>Download reference data sets</description>
     <command detect_errors="exit_code">
     <![CDATA[
     python '$__tool_directory__/data_manager.py'
     --out '${out_file}'
-    --file '$type_cond.database_name'
+    --dataset '$database_name'
     ]]>
     </command>
     <inputs>
         <param name="database_name" type="select" label="mapping data">
             <option value="silva132">Silva version 132</option>
             <option value="silva128">Silva version 128</option>
-            <option value="rdp16">RDP trainset 16 + RDP database release 11.5</option>
+            <option value="rdp16">RDP trainset 16</option>
             <option value="rdp14">RDP trainset 14</option>
-            <option value="gg13.84">GreenGenes version 13.8</option>
-<!--            <option value="unite8.0">UNITE: General Fasta release 8.0 </option>
-            <option value="RefSeq_RDP">NCBI RefSeq 16S rRNA database supplemented by RDP</option>
-            <option value="gtdb">GTDB: Genome Taxonomy Database (More info: http://gtdb.ecogenomic.org/)</option>
-            <option value="hitdb1">HitDB version 1 (Human InTestinal 16S rRNA)</option>
-            <option value="silva132_euk">Silva Eukaryotic 18S, v132 &amp; v128</option>
-            <option value="PR2v4.11.0">Protist Ribosomal Reference database 2 version 4.11.0</option>-->
+            <option value="gg13.84">GreenGenes version 13.84</option>
+            <option value="unite8.0_fungi">UNITE: General Fasta release 8.0 for Fungi</option>
+            <option value="unite8.0_fungi_singletons">UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons</option>
+            <option value="unite8.0_euka">UNITE: General Fasta release 8.0 for all Eukaryotes</option>
+            <option value="unite8.0_euka_singletons">UNITE: General Fasta release 8.0 for all Eukaryotes including global and 97% singletons</option>
+            <option value="RefSeq_RDP_2018_05">NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)</option>
+            <option value="gtdb_2018_11_20">GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea) (11/2018)</option>
+            <option value="hitdb1">HitDB version 1 (Human InTestinal 16S)</option>
+            <option value="silva132_euk_18S">Silva version 132 Eukaryotic 18S</option>
+            <option value="PR2v4.11.1">Protist Ribosomal Reference database (PR2) 4.11.1</option>
         </param>
     </inputs>
     <outputs>
@@ -32,8 +35,56 @@
             <output name="out_file" file="silva132_json"/>
         </test>
     </tests>
-    <help>
-http://www.arb-silva.de/silva-license-information
-    </help>
+    <help><![CDATA[
+Public Reference data sets
+--------------------------
+
+The following 16S data sets are taken from the list of data sets maintained by the DADA2 project (https://benjjneb.github.io/dada2/training.html):
+
+- Silva version 132
+- Silva version 128
+- RDP trainset 16 + RDP database release 11.5
+- RDP trainset 14
+- GreenGenes version 13.8
+
+While the Silva and RDP data sets contain reference data bases for both taxonomy and species assignment, the GreenGenes data set only contains a reference data base for taxonomy assignment.
+
+For the Silva data sets, please check the license information: http://www.arb-silva.de/silva-license-information.
+
+
+
+More information on the GTDB (Genome Taxonomy Database) data set: http://gtdb.ecogenomic.org/
+
+More information on the Protist Ribosomal Reference database (PR2): https://github.com/pr2database/pr2database
+
+
+Custom Reference data sets
+--------------------------
+
+For **taxonomy assignment** the following is needed:
+
+- a reference fasta data base
+- a comma separated list of taxonomic ranks present in the reference data base
+
+The reference fasta data base for taxonomic assignment (fasta or compressed fasta) needs to encode the taxonomy corresponding to each sequence in the fasta header lines in the following fashion (note, the second sequence is not assigned down to level 6):
+
+::
+
+    >Level1;Level2;Level3;Level4;Level5;Level6;
+    ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC
+    >Level1;Level2;Level3;Level4;Level5;
+    CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC
+
+The list of required taxonomic ranks could be for instance: "Kingdom,Phylum,Class,Order,Family,Genus"
+
+The reference data base for **species assignment** is a fasta file (or compressed fasta file), with the id line formatted as follows:
+
+::
+
+    >ID Genus species
+    ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC
+    >ID Genus species
+    CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC
+    ]]></help>
 </tool>
--- a/data_manager/data_manager.py	Thu Mar 07 11:48:18 2019 -0500
+++ b/data_manager/data_manager.py	Fri Mar 08 05:38:44 2019 -0500
@@ -19,6 +19,15 @@
     "rdp16":"RDP trainset 16",
     "rdp14":"RDP trainset 14",
     "gg13.84":"GreenGenes version 13.8",
+    "unite8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi",
+    "unite8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons",
+    "unite8.0_euka": "UNITE: General Fasta release 8.0 for all Eukaryotes",
+    "unite8.0_euka_singletons": "UNITE: General Fasta release 8.0 for all Eukaryotes including global and 97% singletons",
+    "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)",
+    "gtdb_2018_11_20": "GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea) (11/2018)",
+    "hitdb1": "HitDB version 1 (Human InTestinal 16S rRNA)",
+    "silva132_euk_18S": "Silva version 132 Eukaryotic 18S",
+    "PR2v4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1"
 }

 FILE2TAXURL = {
@@ -26,7 +35,16 @@
     "silva128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
     "rdp16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
     "rdp14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
+    "unite8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip",
+    "unite8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip",
+    "unite8.0_euka": "https://files.plutof.ut.ee/public/orig/D6/96/D69658E99589D888A207805A744019DBA4EC0F603E67E53732767B3E03A5AA86.zip",
+    "unite8.0_euka_singletons": "https://files.plutof.ut.ee/doi/C2/20/C22034350E32D6AD7E5D1AF3F8BC487E34DA0BE25602B0E748906005CE6ADA97.zip",
     "gg13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
+    "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1",
+    "gtdb_2018_11_20": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1",
+    "hitdb1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1",
+    "silva132_euk_18S": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1",
+    "PR2v4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz"
 }

 FILE2SPECIESURL = {
@@ -37,6 +55,7 @@
 }

 FILE2TAXLEVELS = {
+    "PR2v4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"
 }

 def url_download(url, fname, workdir):
@@ -63,7 +82,27 @@
     finally:
         if src:
             src.close()
-    return os.path.join(workdir, fname)
+
+    #special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
+    if fname.startswith("unite"):
+        import glob
+        import gzip
+        import shutil
+        import zipfile
+        # unzip download
+        zip_ref = zipfile.ZipFile(file_path, 'r')
+        zip_ref.extractall(workdir)
+        zip_ref.close()
+        # gzip top level fasta file
+        fastas = glob.glob("*fasta")
+        if len(fastas) != 1:
+            msg = "UNITE download %s contained more than one or no fasta file"
+            raise Exception(msg)
+        with open(fastas[0], 'rb') as f_in:
+            with gzip.open(file_path, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+    return fname

 def main(dataset, outjson):

@@ -73,28 +112,30 @@
     output_path = os.path.abspath(os.path.join(os.getcwd(), 'dada2'))

     workdir = os.path.join(os.getcwd(), 'dada2')
-    path = url_download( FILE2TAXURL[dataset], taxdataset+".taxonomy", workdir)
+    path = url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir)

     data_manager_json = {"data_tables":{}}
     data_manager_entry = {}
     data_manager_entry['value'] = dataset
     data_manager_entry['name'] = FILE2NAME[dataset]
-    data_manager_entry['path'] = path
+    data_manager_entry['path'] = dataset+".taxonomy"
     data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS)
     data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry


-    if FILE2SPECIES.get(dataset, False ):
-        path = url_download( FILE2SPECIES[dataset], taxdataset+".species", workdir)
+    if FILE2SPECIESURL.get(dataset, False ):
+        path = url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir)

         data_manager_entry = {}
         data_manager_entry['value'] = dataset
         data_manager_entry['name'] = FILE2NAME[dataset]
-        data_manager_entry['path'] = path
+        data_manager_entry['path'] = dataset+".species"
         data_manager_json["data_tables"]["dada2_species"] = data_manager_entry

     for filename in os.listdir(workdir):
         shutil.move(os.path.join(output_path, filename), target_directory)
+
+    sys.stderr.write("JSON %s" %json.dumps(data_manager_json))
     file(outjson, 'w').write(json.dumps(data_manager_json))

 if __name__ == '__main__':
Binary file tool-data/.dada2_species.loc.sample.swp has changed
Binary file tool-data/.dada2_taxonomy.loc.sample.swp has changed