changeset 10:238a5328279d draft

planemo upload commit b0ebe74a020dcb21b79d8d39e7b6a2f6533b2fc4-dirty
author dchristiany
date Mon, 28 Oct 2019 06:46:53 -0400
parents 0d9cb5c5aa35
children 0cc5f020640e
files data_manager/FROGS_data_manager.py data_manager/FROGS_data_manager.xml tool-data/phiX_db.loc.sample
diffstat 3 files changed, 102 insertions(+), 78 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/FROGS_data_manager.py	Mon Oct 07 10:56:23 2019 -0400
+++ b/data_manager/FROGS_data_manager.py	Mon Oct 28 06:46:53 2019 -0400
@@ -5,19 +5,61 @@
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("-d","--database")
-    parser.add_argument("--custom_db")
+    parser.add_argument("--all_dbs")  # compared against the string "false" in frogs_sources to enable the amplicons/bases/filters selection
+    parser.add_argument("--date")  # converted with int() in frogs_sources; index rows whose column 0 is lower are dropped (0 disables the date filter)
     parser.add_argument("--amplicons")
+    parser.add_argument("--bases")  # comma-separated list; lower-cased, stripped and matched against column 2 of the index
+    parser.add_argument("--filters")  # comma-separated list; lower-cased, stripped and matched against column 3 of the index
+    parser.add_argument("--only_last_versions")  # NOTE(review): parsed here but not read anywhere in the hunks shown in this changeset - confirm it is used
+    parser.add_argument("--tool_data")  # base path; frogs_sources skips a database when <tool_data>/frogs_db/<archive name> already exists
     parser.add_argument("-o","--output")
     args = parser.parse_args()
     return args
 
+#Build a dict mapping each base_id (column 5 of an index row) to the greatest int(column 0) seen for it; column 0 is presumably the release date (YYYYMMDD) - TODO confirm
+def build_last_version_dict(db_index):  # db_index: iterable of rows indexable by position (frogs_sources builds them by splitting each index line on tabs)
+    last_version_dict={}
+    for line in db_index :
+        date=int(line[0])  # raises ValueError if column 0 is not an integer literal
+        base_id=line[5]
+        if base_id in last_version_dict:
+            if date > last_version_dict[base_id] : last_version_dict[base_id]=date  # keep only the maximum value per base_id
+        else:
+            last_version_dict[base_id]=date  # first occurrence of this base_id
+    return(last_version_dict)
+
 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table):
     data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
     data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
     data_manager_dict['data_tables'][data_table].append(data_table_entry)
     return data_manager_dict
 
-def frogs_sources(data_manager_dict,target_directory,amplicons_list):
+def keep_only_last_version(db_index):  # returns a new list: rows whose column-5 prefix is unique, plus at most one row for each prefix shared by several rows
+    values=["_".join(line[5].split("_")[:-1]) for line in db_index]  # column 5 with its last "_"-separated field removed (presumably the version suffix - TODO confirm)
+    to_filter = list(set([val for val in values if values.count(val) >1]))  # prefixes occurring more than once; NOTE(review): values.count() inside the loop is quadratic, and set() makes the order arbitrary
+    out = [line for line in db_index if "_".join(line[5].split("_")[:-1]) not in to_filter] 
+    for bd in to_filter:
+        versions = [line[4] for line in db_index if "_".join(line[5].split("_")[:-1])==bd]  # column 4 of every row sharing this prefix
+        to_keep = bd+"_"+sorted(versions)[-1]  # NOTE(review): sorted() compares strings, so "4.5" ranks after "4.11.0"; the greatest string is not always the newest version
+        for line in db_index:
+            if line[5]==to_keep:  # NOTE(review): if no row's column 5 equals prefix + "_" + column 4, nothing is appended and the prefix is silently dropped
+                out.append(line)
+                print(line)  # NOTE(review): writes the kept row to stdout; looks like leftover debug output - confirm it is intended
+                break  # only the first matching row is kept
+    return(out)
+
+def frogs_sources(data_manager_dict,target_directory):
+
+    #variables
+    amplicons_list=[]
+    bases_list=[]
+    filters_list=[]
+    if  args.all_dbs=="false": 
+        amplicons_list = [amplicon.lower().strip() for amplicon in args.amplicons.split(",") if amplicon != ""]
+        bases_list = [base.lower().strip() for base in args.bases.split(",") if base != ""]
+        filters_list = [filter.lower().strip() for filter in args.filters.split(",") if filter!=""]
+        bottom_date = int(args.date)
+    tool_data_path=args.tool_data
 
     #get frogs database index
     frogs_db_index_link="http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
@@ -26,11 +68,16 @@
         decoded_content = download.content.decode('utf-8')
         db_index = download.content.splitlines()    
         db_index = [line.split("\t") for line in db_index[1:]]
-        db_index = [line[:4]+[line[1]+"_"+line[2]+"_"+line[3]]+[line[4]] for line in db_index]  #add column name
+        db_index = [[line[0],line[1].lower(),line[2].lower(),line[3].lower()]+line[4:] for line in db_index]
 
-    #filter amplicons
-    if len(amplicons_list)!=0:
-        db_index = [line for line in db_index if line[4] in amplicons_list]
+    #filter databases
+    last_version_dict=build_last_version_dict(db_index)
+    if args.all_dbs=="false":
+        if len(amplicons_list)!=0: db_index = [line for line in db_index if any([amplicon in amplicons_list for amplicon in line[1].split(',')])]   #filter by amplicons
+        if len(bases_list)!=0: db_index = [line for line in db_index if line[2] in bases_list]                                                      #filter by base
+        if len(filters_list)!=0: db_index = [line for line in db_index if line[3] in filters_list]                                                  #filter by filters
+    if bottom_date!=0: db_index = [line for line in db_index if int(line[0])>=bottom_date]                                                          #filter by date      
+        db_index = keep_only_last_version(db_index)                                                          #keep only last version
 
     #get frogs dbs
     os.chdir(target_directory)
@@ -38,33 +85,36 @@
     os.mkdir(dir_name)
     dbs=set([])
     for line in db_index:
-        value=line[4]
+        value=line[5]
         name=value.replace("_"," ")
-        link=line[5]
+        link=line[6]
+        name_dir="".join([line[6].replace(".tar.gz","").split("/")[-1]])
+        file_path=tool_data_path+"/frogs_db/"+name_dir
+        if not os.path.exists(file_path):   #if the file is not already in frogs_db directory
+            
+            #download frogs db
+            dl_file = urllib.URLopener()
+            dl_file.retrieve(link, "tmp.tar.gz")
+            
+            #unzip frogs db
+            with tarfile.open("tmp.tar.gz") as tar:
+                tar.extractall(dir_name)
+                tar.close()
+                os.remove('tmp.tar.gz')
+            
+            #get fasta file path
+            tmp = set(os.listdir(dir_name))
+            new_db = dir_name+"/"+"".join(tmp.difference(dbs))
+            files = os.listdir(new_db)
+            fasta = "".join([file for file in files if file.endswith('.fasta')])
+            path = new_db+'/'+fasta
+            dbs = os.listdir(dir_name)
+            release = value+"_"+time.strftime("%Y-%m-%d")
+            date=time.strftime("%Y%m%d")
+            path = os.path.join(target_directory,path)
 
-        #download frogs db
-        dl_file = urllib.URLopener()
-        dl_file.retrieve(link, "tmp.tar.gz")
-        
-        #unzip frogs db
-        with tarfile.open("tmp.tar.gz") as tar:
-            tar.extractall(dir_name)
-            tar.close()
-            os.remove('tmp.tar.gz')
-        
-        #get fasta file path
-        tmp = set(os.listdir(dir_name))
-        new_db = dir_name+"/"+"".join(tmp.difference(dbs))
-        files = os.listdir(new_db)
-        fasta = "".join([file for file in files if file.endswith('.fasta')])
-        path = new_db+'/'+fasta
-        dbs = os.listdir(dir_name)
-        release = value+"_"+time.strftime("%Y-%m-%d")
-        date=time.strftime("%Y%m%d")
-        path = os.path.join(target_directory,path)
-
-        data_table_entry = dict(name = name, value = value, path=path)
-        _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
+            data_table_entry = dict(name = name, value = value, path=path)
+            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
 
 def HVL_sources(data_manager_dict,target_directory):
 
@@ -89,11 +139,8 @@
 def main():
 
     #get args from command line
+    global args
     args = get_args()
-    if args.database=="frogs_db_data" and args.custom_db=="true":
-        amplicons_list = args.amplicons.split(",")
-    else :
-        amplicons_list = []
 
     # Extract json file params
     data_manager_dict = {}
@@ -103,7 +150,7 @@
     os.mkdir(target_directory)
 
     if args.database=="frogs_db_data":
-        frogs_sources(data_manager_dict,target_directory,amplicons_list)
+        frogs_sources(data_manager_dict,target_directory)
     elif args.database=="HVL_db_data":
         HVL_sources(data_manager_dict,target_directory)
 
--- a/data_manager/FROGS_data_manager.xml	Mon Oct 07 10:56:23 2019 -0400
+++ b/data_manager/FROGS_data_manager.xml	Mon Oct 28 06:46:53 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="FROGS_data_manager" name="FROGS Data manager" version="2019.10.07.1" tool_type="manage_data">
+<tool id="FROGS_data_manager" name="FROGS Data manager" version="2019.10.28" tool_type="manage_data">
     <requirements>
     </requirements>
     <stdio>
@@ -8,9 +8,16 @@
         python $__tool_directory__/FROGS_data_manager.py
             --database="$database.database"
             #if $database.database=="frogs_db_data"
-                --custom_db="$database.db_type.custom_db"
-                --amplicons="$database.db_type.amplicons"
+                --all_dbs="$database.db_type.db"
+                #if $database.db_type.db=="false"
+                    --date="$database.db_type.date"
+                    --amplicons="$database.db_type.amplicons"
+                    --bases="$database.db_type.bases"
+                    --filters="$database.db_type.filters"
+                #end if
+                --only_last_versions="$database.only_last_versions"
             #end if 
+            --tool_data="$__tool_data_path__"
             --output "$output"
     ]]></command>
     <inputs>
@@ -21,47 +28,18 @@
             </param>
             <when value="frogs_db_data">
                 <conditional name="db_type">
-                    <param name="custom_db" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Build a custom database"/>
-                    <when value="true">
-                        <param name="amplicons" type="select" multiple="true">
-                            <option value="COI_MIDORI_MARINE_20180221">COI MIDORI MARINE 20180221</option>
-                            <option value="COI_MIDORI_20180221">COI MIDORI 20180221</option>
-                            <option value="COI_BOLD_1percentN_600nt_022019">COI BOLD 1percentN 600nt 022019</option>
-                            <option value="COI_BOLD_1percentN_630nt_022019">COI BOLD 1percentN 630nt 022019</option>
-                            <option value="ITS_UNITE_Euka_8.0">ITS UNITE Euka 8.0</option>
-                            <option value="ITS_UNITE_Fungi_8.0">ITS UNITE Fungi 8.0</option>
-                            <option value="COI_BOLD_022019">COI BOLD 022019</option>
-                            <option value="COI_BOLD_1percentN_022019">COI BOLD 1percentN 022019</option>
-                            <option value="EF1,18S_PHYMYCO-DB_2013">EF1,18S PHYMYCO-DB 2013</option>
-                            <option value="16S_EZBioCloud_052018">16S EZBioCloud 052018</option>
-                            <option value="18S_PR2_4.11.0">18S PR2 4.11.0</option>
-                            <option value="16S_DAIRYdb_V1.1.2">16S DAIRYdb V1.1.2</option>
-                            <option value="rbcL_Rsyst_Diatom_7">rbcL Rsyst Diatom 7</option>
-                            <option value="ITS_UNITE_7.1">ITS UNITE 7.1</option>
-                            <option value="rpoB_DB_NAME_TO_CHECK_122017">rpoB DB NAME TO CHECK 122017</option>
-                            <option value="16S_SILVA_Pintail100_132">16S SILVA Pintail100 132</option>
-                            <option value="16S_SILVA_Pintail50_132">16S SILVA Pintail50 132</option>
-                            <option value="16S_SILVA_Pintail80_132">16S SILVA Pintail80 132</option>
-                            <option value="16S_SILVA_132">16S SILVA 132</option>
-                            <option value="18S_SILVA_132">18S SILVA 132</option>
-                            <option value="23S_SILVA_132">23S SILVA 132</option>
-                            <option value="16S_SILVA_Pintail100_128">16S SILVA Pintail100 128</option>
-                            <option value="16S_SILVA_Pintail50_128">16S SILVA Pintail50 128</option>
-                            <option value="16S_SILVA_Pintail80_128">16S SILVA Pintail80 128</option>
-                            <option value="18S_PR2_Gb203_4.5">18S PR2 Gb203 4.5</option>
-                            <option value="16S_MIDAS_S123_2.1.3">16S MIDAS S123 2.1.3</option>
-                            <option value="16S_SILVA_128">16S SILVA 128</option>
-                            <option value="18S_SILVA_123">18S SILVA 123</option>
-                            <option value="18S_SILVA_128">18S SILVA 128</option>
-                            <option value="23S_SILVA_128">23S SILVA 128</option>
-                            <option value="16S_MIDAS_S119_1.20">16S MIDAS S119 1.20</option>
-                            <option value="16S_Greengenes_13.5">16S Greengenes 13.5</option>
-                            <option value="16S_SILVA_123">16S SILVA 123</option>
-                            <option value="18S_SILVA_119-1">18S SILVA 119-1</option>
-                            <option value="23S_SILVA_123">23S SILVA 123</option>
+                    <param name="db" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Download all databases"/>
+                    <when value="true"/>
+                    <when value="false">
+                        <param name="date" value="0" type="text"  label="Only more recent database than this date will be downloaded" help="Please enter a date at the following format: YYYYMMDD, leave 0 for no date">
+                            <validator type="regex" message="Please enter a date at the following format: YYYYMMDD, leave 0 for no date">0|[1-2]{1}[0-9]{3}[0-1]{1}[0-9]{1}([0-2]{1}[0-9]{1}|3[0-1]{1})</validator>
                         </param>
+                        <param name="amplicons" type="text" label='Select database to download by amplicons, put an amplicons list separated by "," ' help='example: "COI,ITS,16S" or "23S"'/>
+                        <param name="bases" type="text" label='Select database to download by base, put a bases list separated by "," ' help='example: "SILVA,PR2,MIDAS" or "BOLD"'/>
+                        <param name="filters" type="text" label='Select database to download by filters, put a filters list separated by "," ' help='example: "Pintail100,Fungi"'/>
                     </when>
                 </conditional>
+                <param name="only_last_versions" type="boolean" checked="true" label="Download only the last version of each database"/>
             </when>
             <when value="HVL_db_data"/>
         </conditional>
--- a/tool-data/phiX_db.loc.sample	Mon Oct 07 10:56:23 2019 -0400
+++ b/tool-data/phiX_db.loc.sample	Mon Oct 28 06:46:53 2019 -0400
@@ -39,4 +39,3 @@
 #-rw-rwxr-- 1 gpascal FROGS   88 16 sept.  2015 phi.fa.nin
 #-rw-rwxr-- 1 gpascal FROGS 1348 16 sept.  2015 phi.fa.nsq
 #
-