Mercurial > repos > dchristiany > frogs_data_manager
changeset 10:238a5328279d draft
planemo upload commit b0ebe74a020dcb21b79d8d39e7b6a2f6533b2fc4-dirty
author | dchristiany |
---|---|
date | Mon, 28 Oct 2019 06:46:53 -0400 |
parents | 0d9cb5c5aa35 |
children | 0cc5f020640e |
files | data_manager/FROGS_data_manager.py data_manager/FROGS_data_manager.xml tool-data/phiX_db.loc.sample |
diffstat | 3 files changed, 102 insertions(+), 78 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/FROGS_data_manager.py Mon Oct 07 10:56:23 2019 -0400 +++ b/data_manager/FROGS_data_manager.py Mon Oct 28 06:46:53 2019 -0400 @@ -5,19 +5,61 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("-d","--database") - parser.add_argument("--custom_db") + parser.add_argument("--all_dbs") + parser.add_argument("--date") parser.add_argument("--amplicons") + parser.add_argument("--bases") + parser.add_argument("--filters") + parser.add_argument("--only_last_versions") + parser.add_argument("--tool_data") parser.add_argument("-o","--output") args = parser.parse_args() return args +#build database last version dictionary: key=base_id, value=last version +def build_last_version_dict(db_index): + last_version_dict={} + for line in db_index : + date=int(line[0]) + base_id=line[5] + if base_id in last_version_dict: + if date > last_version_dict[base_id] : last_version_dict[base_id]=date + else: + last_version_dict[base_id]=date + return(last_version_dict) + def _add_data_table_entry(data_manager_dict, data_table_entry,data_table): data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) data_manager_dict['data_tables'][data_table].append(data_table_entry) return data_manager_dict -def frogs_sources(data_manager_dict,target_directory,amplicons_list): +def keep_only_last_version(db_index): + values=["_".join(line[5].split("_")[:-1]) for line in db_index] + to_filter = list(set([val for val in values if values.count(val) >1])) + out = [line for line in db_index if "_".join(line[5].split("_")[:-1]) not in to_filter] + for bd in to_filter: + versions = [line[4] for line in db_index if "_".join(line[5].split("_")[:-1])==bd] + to_keep = bd+"_"+sorted(versions)[-1] + for line in db_index: + if line[5]==to_keep: + out.append(line) + print(line) + break + return(out) + +def frogs_sources(data_manager_dict,target_directory): + + #variables + amplicons_list=[] + bases_list=[] + filters_list=[] + if args.all_dbs=="false": + amplicons_list = [amplicon.lower().strip() for amplicon in args.amplicons.split(",") if amplicon != ""] + bases_list = [base.lower().strip() for base in args.bases.split(",") if base != ""] + filters_list = [filter.lower().strip() for filter in args.filters.split(",") if filter!=""] + bottom_date = int(args.date) + tool_data_path=args.tool_data #get frogs database index frogs_db_index_link="http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv" @@ -26,11 +68,16 @@ decoded_content = download.content.decode('utf-8') db_index = download.content.splitlines() db_index = [line.split("\t") for line in db_index[1:]] - db_index = [line[:4]+[line[1]+"_"+line[2]+"_"+line[3]]+[line[4]] for line in db_index] #add column name + db_index = [[line[0],line[1].lower(),line[2].lower(),line[3].lower()]+line[4:] for line in db_index] - #filter amplicons - if len(amplicons_list)!=0: - db_index = [line for line in db_index if line[4] in amplicons_list] + #filter databases + last_version_dict=build_last_version_dict(db_index) + if args.all_dbs=="false": + if len(amplicons_list)!=0: db_index = [line for line in db_index if any([amplicon in amplicons_list for amplicon in line[1].split(',')])] #filter by amplicons + if len(bases_list)!=0: db_index = [line for line in db_index if line[2] in bases_list] #filter by base + if len(filters_list)!=0: db_index = [line for line in db_index if line[3] in filters_list] #filter by filters + if bottom_date!=0: db_index = [line for line in db_index if int(line[0])>=bottom_date] #filter by date + db_index = keep_only_last_version(db_index) #keep only last version #get frogs dbs os.chdir(target_directory) @@ -38,33 +85,36 @@ os.mkdir(dir_name) dbs=set([]) for line in db_index: - value=line[4] + value=line[5] name=value.replace("_"," ") - link=line[5] + link=line[6] + name_dir="".join([line[6].replace(".tar.gz","").split("/")[-1]]) + file_path=tool_data_path+"/frogs_db/"+name_dir + if not os.path.exists(file_path): #if the file is not already in frogs_db directory + + #download frogs db + dl_file = urllib.URLopener() + dl_file.retrieve(link, "tmp.tar.gz") + + #unzip frogs db + with tarfile.open("tmp.tar.gz") as tar: + tar.extractall(dir_name) + tar.close() + os.remove('tmp.tar.gz') + + #get fasta file path + tmp = set(os.listdir(dir_name)) + new_db = dir_name+"/"+"".join(tmp.difference(dbs)) + files = os.listdir(new_db) + fasta = "".join([file for file in files if file.endswith('.fasta')]) + path = new_db+'/'+fasta + dbs = os.listdir(dir_name) + release = value+"_"+time.strftime("%Y-%m-%d") + date=time.strftime("%Y%m%d") + path = os.path.join(target_directory,path) - #download frogs db - dl_file = urllib.URLopener() - dl_file.retrieve(link, "tmp.tar.gz") - - #unzip frogs db - with tarfile.open("tmp.tar.gz") as tar: - tar.extractall(dir_name) - tar.close() - os.remove('tmp.tar.gz') - - #get fasta file path - tmp = set(os.listdir(dir_name)) - new_db = dir_name+"/"+"".join(tmp.difference(dbs)) - files = os.listdir(new_db) - fasta = "".join([file for file in files if file.endswith('.fasta')]) - path = new_db+'/'+fasta - dbs = os.listdir(dir_name) - release = value+"_"+time.strftime("%Y-%m-%d") - date=time.strftime("%Y%m%d") - path = os.path.join(target_directory,path) - - data_table_entry = dict(name = name, value = value, path=path) - _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db") + data_table_entry = dict(name = name, value = value, path=path) + _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db") def HVL_sources(data_manager_dict,target_directory): @@ -89,11 +139,8 @@ def main(): #get args from command line + global args args = get_args() - if args.database=="frogs_db_data" and args.custom_db=="true": - amplicons_list = args.amplicons.split(",") - else : - amplicons_list = [] # Extract json file params data_manager_dict = {} @@ -103,7 +150,7 @@ os.mkdir(target_directory) if args.database=="frogs_db_data": - frogs_sources(data_manager_dict,target_directory,amplicons_list) + frogs_sources(data_manager_dict,target_directory) elif args.database=="HVL_db_data": HVL_sources(data_manager_dict,target_directory)
--- a/data_manager/FROGS_data_manager.xml Mon Oct 07 10:56:23 2019 -0400 +++ b/data_manager/FROGS_data_manager.xml Mon Oct 28 06:46:53 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="FROGS_data_manager" name="FROGS Data manager" version="2019.10.07.1" tool_type="manage_data"> +<tool id="FROGS_data_manager" name="FROGS Data manager" version="2019.10.28" tool_type="manage_data"> <requirements> </requirements> <stdio> @@ -8,9 +8,16 @@ python $__tool_directory__/FROGS_data_manager.py --database="$database.database" #if $database.database=="frogs_db_data" - --custom_db="$database.db_type.custom_db" - --amplicons="$database.db_type.amplicons" + --all_dbs="$database.db_type.db" + #if $database.db_type.db=="false" + --date="$database.db_type.date" + --amplicons="$database.db_type.amplicons" + --bases="$database.db_type.bases" + --filters="$database.db_type.filters" + #end if + --only_last_versions="$database.only_last_versions" #end if + --tool_data="$__tool_data_path__" --output "$output" ]]></command> <inputs> @@ -21,47 +28,18 @@ </param> <when value="frogs_db_data"> <conditional name="db_type"> - <param name="custom_db" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Build a custom database"/> - <when value="true"> - <param name="amplicons" type="select" multiple="true"> - <option value="COI_MIDORI_MARINE_20180221">COI MIDORI MARINE 20180221</option> - <option value="COI_MIDORI_20180221">COI MIDORI 20180221</option> - <option value="COI_BOLD_1percentN_600nt_022019">COI BOLD 1percentN 600nt 022019</option> - <option value="COI_BOLD_1percentN_630nt_022019">COI BOLD 1percentN 630nt 022019</option> - <option value="ITS_UNITE_Euka_8.0">ITS UNITE Euka 8.0</option> - <option value="ITS_UNITE_Fungi_8.0">ITS UNITE Fungi 8.0</option> - <option value="COI_BOLD_022019">COI BOLD 022019</option> - <option value="COI_BOLD_1percentN_022019">COI BOLD 1percentN 022019</option> - <option value="EF1,18S_PHYMYCO-DB_2013">EF1,18S PHYMYCO-DB 2013</option> - <option value="16S_EZBioCloud_052018">16S EZBioCloud 052018</option> - <option value="18S_PR2_4.11.0">18S PR2 4.11.0</option> - <option value="16S_DAIRYdb_V1.1.2">16S DAIRYdb V1.1.2</option> - <option value="rbcL_Rsyst_Diatom_7">rbcL Rsyst Diatom 7</option> - <option value="ITS_UNITE_7.1">ITS UNITE 7.1</option> - <option value="rpoB_DB_NAME_TO_CHECK_122017">rpoB DB NAME TO CHECK 122017</option> - <option value="16S_SILVA_Pintail100_132">16S SILVA Pintail100 132</option> - <option value="16S_SILVA_Pintail50_132">16S SILVA Pintail50 132</option> - <option value="16S_SILVA_Pintail80_132">16S SILVA Pintail80 132</option> - <option value="16S_SILVA_132">16S SILVA 132</option> - <option value="18S_SILVA_132">18S SILVA 132</option> - <option value="23S_SILVA_132">23S SILVA 132</option> - <option value="16S_SILVA_Pintail100_128">16S SILVA Pintail100 128</option> - <option value="16S_SILVA_Pintail50_128">16S SILVA Pintail50 128</option> - <option value="16S_SILVA_Pintail80_128">16S SILVA Pintail80 128</option> - <option value="18S_PR2_Gb203_4.5">18S PR2 Gb203 4.5</option> - <option value="16S_MIDAS_S123_2.1.3">16S MIDAS S123 2.1.3</option> - <option value="16S_SILVA_128">16S SILVA 128</option> - <option value="18S_SILVA_123">18S SILVA 123</option> - <option value="18S_SILVA_128">18S SILVA 128</option> - <option value="23S_SILVA_128">23S SILVA 128</option> - <option value="16S_MIDAS_S119_1.20">16S MIDAS S119 1.20</option> - <option value="16S_Greengenes_13.5">16S Greengenes 13.5</option> - <option value="16S_SILVA_123">16S SILVA 123</option> - <option value="18S_SILVA_119-1">18S SILVA 119-1</option> - <option value="23S_SILVA_123">23S SILVA 123</option> + <param name="db" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Download all databases"/> + <when value="true"/> + <when value="false"> + <param name="date" value="0" type="text" label="Only more recent database than this date will be downloaded" help="Please enter a date at the following format: YYYYMMDD, leave 0 for no date"> + <validator type="regex" message="Please enter a date at the following format: YYYYMMDD, leave 0 for no date">0|[1-2]{1}[0-9]{3}[0-1]{1}[0-9]{1}([0-2]{1}[0-9]{1}|3[0-1]{1})</validator> </param> + <param name="amplicons" type="text" label='Select database to download by amplicons, put an amplicons list separated by "," ' help='example: "COI,ITS,16S" or "23S"'/> + <param name="bases" type="text" label='Select database to download by base, put a bases list separated by "," ' help='example: "SILVA,PR2,MIDAS" or "BOLD"'/> + <param name="filters" type="text" label='Select database to download by filters, put a filters list separated by "," ' help='example: "Pintail100,Fungi"'/> </when> </conditional> + <param name="only_last_versions" type="boolean" checked="true" label="Download only the last version of each database"/> </when> <when value="HVL_db_data"/> </conditional>