comparison data_manager/FROGS_data_manager.py @ 10:238a5328279d draft

planemo upload commit b0ebe74a020dcb21b79d8d39e7b6a2f6533b2fc4-dirty
author dchristiany
date Mon, 28 Oct 2019 06:46:53 -0400
parents 0d9cb5c5aa35
children 0cc5f020640e
--- data_manager/FROGS_data_manager.py	9:0d9cb5c5aa35
+++ data_manager/FROGS_data_manager.py	10:238a5328279d
@@ -3,10 +3,15 @@
 import os, sys, argparse, time, json, requests, urllib, tarfile
 
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("-d","--database")
-    parser.add_argument("--custom_db")
+    parser.add_argument("--all_dbs")
+    parser.add_argument("--date")
     parser.add_argument("--amplicons")
+    parser.add_argument("--bases")
+    parser.add_argument("--filters")
+    parser.add_argument("--only_last_versions")
+    parser.add_argument("--tool_data")
     parser.add_argument("-o","--output")
     args = parser.parse_args()
     return args
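Note: for orientation, a sketch of what a typical invocation looks like to argparse. The flag names come from get_args() above; the values are made-up placeholders, and how the Galaxy wrapper actually assembles the command line is defined in the tool XML, which this comparison does not show.

    # Illustrative only: every option value arrives as a string, which is why
    # the code below compares args.all_dbs=="false" and casts int(args.date).
    import argparse

    parser = argparse.ArgumentParser()
    for flag in ("--database", "--all_dbs", "--date", "--amplicons",
                 "--bases", "--filters", "--only_last_versions",
                 "--tool_data", "--output"):
        parser.add_argument(flag)

    args = parser.parse_args([
        "--database", "frogs_db_data",
        "--all_dbs", "false",
        "--date", "20180101",
        "--amplicons", "16S,18S",
        "--bases", "silva",
        "--filters", "",
        "--only_last_versions", "true",
        "--tool_data", "/path/to/galaxy/tool-data",
        "--output", "galaxy_output.json",
    ])
    print(args.amplicons.split(","))   # ['16S', '18S']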
@@ -13,6 +18,18 @@
+
+#build database last version dictionary: key=base_id, value=last version
+def build_last_version_dict(db_index):
+    last_version_dict={}
+    for line in db_index :
+        date=int(line[0])
+        base_id=line[5]
+        if base_id in last_version_dict:
+            if date > last_version_dict[base_id] : last_version_dict[base_id]=date
+        else:
+            last_version_dict[base_id]=date
+    return(last_version_dict)
 
 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table):
     data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
     data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
     data_manager_dict['data_tables'][data_table].append(data_table_entry)
     return data_manager_dict
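Note: a quick sketch of what build_last_version_dict() returns, using made-up rows shaped the way the code reads them (column 0 a date stamp, column 5 a database identifier). In this revision the resulting dictionary is assigned in frogs_sources() but not used further in the code shown; version pruning is handled by keep_only_last_version() below.

    # Made-up rows: only columns 0 (date) and 5 (base_id) matter here.
    db_index = [
        ["20180615", "16s", "silva", "pintail100", "132", "silva_pintail100_132", "http://host/a.tar.gz"],
        ["20191028", "16s", "silva", "pintail100", "132", "silva_pintail100_132", "http://host/b.tar.gz"],
        ["20191028", "its", "unite", "fungi",      "8.0", "unite_fungi_8.0",      "http://host/c.tar.gz"],
    ]

    # Same logic as build_last_version_dict() above: keep the newest date per base_id.
    last_version_dict = {}
    for line in db_index:
        date = int(line[0])
        base_id = line[5]
        if base_id in last_version_dict:
            if date > last_version_dict[base_id]:
                last_version_dict[base_id] = date
        else:
            last_version_dict[base_id] = date

    print(last_version_dict)
    # {'silva_pintail100_132': 20191028, 'unite_fungi_8.0': 20191028}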
@@ -19,3 +36,28 @@
 
-def frogs_sources(data_manager_dict,target_directory,amplicons_list):
+def keep_only_last_version(db_index):
+    values=["_".join(line[5].split("_")[:-1]) for line in db_index]
+    to_filter = list(set([val for val in values if values.count(val) >1]))
+    out = [line for line in db_index if "_".join(line[5].split("_")[:-1]) not in to_filter]
+    for bd in to_filter:
+        versions = [line[4] for line in db_index if "_".join(line[5].split("_")[:-1])==bd]
+        to_keep = bd+"_"+sorted(versions)[-1]
+        for line in db_index:
+            if line[5]==to_keep:
+                out.append(line)
+                print(line)
+                break
+    return(out)
+
+def frogs_sources(data_manager_dict,target_directory):
+
+    #variables
+    amplicons_list=[]
+    bases_list=[]
+    filters_list=[]
+    if args.all_dbs=="false":
+        amplicons_list = [amplicon.lower().strip() for amplicon in args.amplicons.split(",") if amplicon != ""]
+        bases_list = [base.lower().strip() for base in args.bases.split(",") if base != ""]
+        filters_list = [filter.lower().strip() for filter in args.filters.split(",") if filter!=""]
+    bottom_date = int(args.date)
+    tool_data_path=args.tool_data
 
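Note: a worked example of keep_only_last_version(), again with made-up rows. The function groups rows whose column-5 name differs only in its trailing underscore-separated component (the version, also held in column 4) and keeps the highest one; note that sorted() compares these version strings lexicographically.

    # Made-up rows: [date, amplicon, base, filters, version, name, url],
    # where the name ends with the version string.
    db_index = [
        ["20180615", "16s", "silva", "pintail100", "128", "silva_pintail100_128", "http://host/silva_128.tar.gz"],
        ["20191028", "16s", "silva", "pintail100", "132", "silva_pintail100_132", "http://host/silva_132.tar.gz"],
        ["20190101", "its", "unite", "fungi",      "8.0", "unite_fungi_8.0",      "http://host/unite_8.0.tar.gz"],
    ]

    # Same grouping key as in keep_only_last_version():
    print(["_".join(line[5].split("_")[:-1]) for line in db_index])
    # ['silva_pintail100', 'silva_pintail100', 'unite_fungi']

    # The two silva_pintail100 rows collide, so only the row whose name equals
    # "silva_pintail100_" + sorted(["128", "132"])[-1] == "silva_pintail100_132"
    # is kept; the unique unite row passes through untouched.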
@@ -22,13 +64,18 @@
     #get frogs database index
     frogs_db_index_link="http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
     with requests.Session() as s:
         download = s.get(frogs_db_index_link)
         decoded_content = download.content.decode('utf-8')
         db_index = download.content.splitlines()
         db_index = [line.split("\t") for line in db_index[1:]]
-        db_index = [line[:4]+[line[1]+"_"+line[2]+"_"+line[3]]+[line[4]] for line in db_index]   #add column name
+        db_index = [[line[0],line[1].lower(),line[2].lower(),line[3].lower()]+line[4:] for line in db_index]
 
-    #filter amplicons
-    if len(amplicons_list)!=0:
-        db_index = [line for line in db_index if line[4] in amplicons_list]
+    #filter databases
+    last_version_dict=build_last_version_dict(db_index)
+    if args.all_dbs=="false":
+        if len(amplicons_list)!=0: db_index = [line for line in db_index if any([amplicon in amplicons_list for amplicon in line[1].split(',')])]   #filter by amplicons
+        if len(bases_list)!=0: db_index = [line for line in db_index if line[2] in bases_list]   #filter by base
+        if len(filters_list)!=0: db_index = [line for line in db_index if line[3] in filters_list]   #filter by filters
+        if bottom_date!=0: db_index = [line for line in db_index if int(line[0])>=bottom_date]   #filter by date
+    db_index = keep_only_last_version(db_index)   #keep only last version
 
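Note: the column meanings used in the examples here (0 = date, 1 = amplicons, 2 = base, 3 = filters, 4 = version, 5 = name, 6 = archive URL) are inferred from how the code indexes each row; the header line that db_index[1:] discards can be printed to double-check them.

    import requests

    # Print the header row of the FROGS database index, which the code above
    # skips with db_index[1:], to confirm which column holds which field.
    url = "http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    header = requests.get(url).content.decode("utf-8").splitlines()[0]
    print(header.split("\t"))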
@@ -35,38 +82,41 @@
     #get frogs dbs
     os.chdir(target_directory)
     dir_name="frogs_db_"+time.strftime("%Y%m%d")
     os.mkdir(dir_name)
     dbs=set([])
     for line in db_index:
-        value=line[4]
+        value=line[5]
         name=value.replace("_"," ")
-        link=line[5]
+        link=line[6]
+        name_dir="".join([line[6].replace(".tar.gz","").split("/")[-1]])
+        file_path=tool_data_path+"/frogs_db/"+name_dir
+        if not os.path.exists(file_path):   #if the file is not already in frogs_db directory
+
+            #download frogs db
+            dl_file = urllib.URLopener()
+            dl_file.retrieve(link, "tmp.tar.gz")
+
+            #unzip frogs db
+            with tarfile.open("tmp.tar.gz") as tar:
+                tar.extractall(dir_name)
+                tar.close()
+            os.remove('tmp.tar.gz')
+
+            #get fasta file path
+            tmp = set(os.listdir(dir_name))
+            new_db = dir_name+"/"+"".join(tmp.difference(dbs))
+            files = os.listdir(new_db)
+            fasta = "".join([file for file in files if file.endswith('.fasta')])
+            path = new_db+'/'+fasta
+            dbs = os.listdir(dir_name)
+            release = value+"_"+time.strftime("%Y-%m-%d")
+            date=time.strftime("%Y%m%d")
+            path = os.path.join(target_directory,path)
 
-        #download frogs db
-        dl_file = urllib.URLopener()
-        dl_file.retrieve(link, "tmp.tar.gz")
-
-        #unzip frogs db
-        with tarfile.open("tmp.tar.gz") as tar:
-            tar.extractall(dir_name)
-            tar.close()
-        os.remove('tmp.tar.gz')
-
-        #get fasta file path
-        tmp = set(os.listdir(dir_name))
-        new_db = dir_name+"/"+"".join(tmp.difference(dbs))
-        files = os.listdir(new_db)
-        fasta = "".join([file for file in files if file.endswith('.fasta')])
-        path = new_db+'/'+fasta
-        dbs = os.listdir(dir_name)
-        release = value+"_"+time.strftime("%Y-%m-%d")
-        date=time.strftime("%Y%m%d")
-        path = os.path.join(target_directory,path)
-
-        data_table_entry = dict(name = name, value = value, path=path)
-        _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
+            data_table_entry = dict(name = name, value = value, path=path)
+            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
 
 def HVL_sources(data_manager_dict,target_directory):
 
     #get phiX files
     os.chdir(target_directory)
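Note: urllib.URLopener, used in the download step above, is a Python 2 idiom; under Python 3 the call as written raises AttributeError (the class moved to urllib.request and is deprecated there). For reference only, a rough Python 3 sketch of the same download-and-extract step; fetch_and_extract is a hypothetical helper name, not part of the committed code.

    # Python 3 sketch of the download + extract step above (illustrative only).
    import os
    import tarfile
    import urllib.request

    def fetch_and_extract(link, dir_name):
        """Download one FROGS archive and unpack it into dir_name."""
        tmp_archive = "tmp.tar.gz"
        urllib.request.urlretrieve(link, tmp_archive)   # replaces urllib.URLopener().retrieve
        with tarfile.open(tmp_archive) as tar:
            tar.extractall(dir_name)                    # the with-block already closes the tar
        os.remove(tmp_archive)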
@@ -87,25 +137,22 @@
     _add_data_table_entry(data_manager_dict, data_table_entry, "HVL_db")
 
 def main():
 
     #get args from command line
+    global args
     args = get_args()
-    if args.database=="frogs_db_data" and args.custom_db=="true":
-        amplicons_list = args.amplicons.split(",")
-    else :
-        amplicons_list = []
 
     # Extract json file params
     data_manager_dict = {}
     filename = args.output
     params = from_json_string(open(filename).read())
     target_directory = params[ 'output_data' ][0]['extra_files_path']
     os.mkdir(target_directory)
 
     if args.database=="frogs_db_data":
-        frogs_sources(data_manager_dict,target_directory,amplicons_list)
+        frogs_sources(data_manager_dict,target_directory)
     elif args.database=="HVL_db_data":
         HVL_sources(data_manager_dict,target_directory)
 
     #save info to json file
     open(filename, 'wb').write(to_json_string(data_manager_dict))
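Note: for context on what this script hands back to Galaxy, _add_data_table_entry() accumulates one dict per database under data_manager_dict['data_tables'], and main() writes that structure back into the JSON file Galaxy supplied, so the payload for the frogs_db table ends up shaped roughly like the sketch below. All values are illustrative; <target_directory> stands for the extra_files_path Galaxy provides.

    # Illustrative shape of the JSON written back to the data manager output file.
    {
        "data_tables": {
            "frogs_db": [
                {
                    "name": "silva pintail100 132",
                    "value": "silva_pintail100_132",
                    "path": "<target_directory>/frogs_db_20191028/silva_pintail100_132/silva_pintail100_132.fasta"
                }
            ]
        }
    }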