Mercurial > repos > dchristiany > frogs_data_manager
comparison data_manager/FROGS_data_manager.py @ 10:238a5328279d draft
planemo upload commit b0ebe74a020dcb21b79d8d39e7b6a2f6533b2fc4-dirty
author | dchristiany |
---|---|
date | Mon, 28 Oct 2019 06:46:53 -0400 |
parents | 0d9cb5c5aa35 |
children | 0cc5f020640e |
comparison
equal
deleted
inserted
replaced
9:0d9cb5c5aa35 | 10:238a5328279d |
---|---|
3 import os, sys, argparse, time, json, requests, urllib, tarfile | 3 import os, sys, argparse, time, json, requests, urllib, tarfile |
4 | 4 |
def get_args():
    """Parse and return the command-line options for this data manager.

    All options are plain optional strings; absent options parse to None.
    """
    parser = argparse.ArgumentParser()
    for flags in (("-d", "--database"), ("--all_dbs",), ("--date",),
                  ("--amplicons",), ("--bases",), ("--filters",),
                  ("--only_last_versions",), ("--tool_data",),
                  ("-o", "--output")):
        parser.add_argument(*flags)
    return parser.parse_args()
18 | |
#build database last version dictionary: key=base_id, value=last version
def build_last_version_dict(db_index):
    """Return ``{base_id: newest date}`` over the rows of *db_index*.

    Column 0 of each row is a numeric date string and column 5 is the
    database identifier; the largest date per identifier wins.
    """
    newest = {}
    for row in db_index:
        stamp = int(row[0])
        base_id = row[5]
        previous = newest.get(base_id)
        if previous is None or stamp > previous:
            newest[base_id] = stamp
    return newest
13 | 30 |
14 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table): | 31 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table): |
15 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) | 32 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) |
16 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) | 33 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) |
17 data_manager_dict['data_tables'][data_table].append(data_table_entry) | 34 data_manager_dict['data_tables'][data_table].append(data_table_entry) |
18 return data_manager_dict | 35 return data_manager_dict |
19 | 36 |
def keep_only_last_version(db_index):
    """Reduce *db_index* to one row per database, keeping the newest version.

    A database identity is row[5] with its trailing ``_<version>`` segment
    stripped.  Rows whose identity appears only once are kept as-is; for a
    duplicated identity, the row whose row[5] equals
    ``<identity>_<max version>`` is kept, where versions come from row[4]
    and are compared as strings.

    Fix vs. previous revision: removed a leftover debug ``print(line)``
    that wrote every retained row to stdout.
    """
    prefixes = ["_".join(row[5].split("_")[:-1]) for row in db_index]
    duplicated = list(set(p for p in prefixes if prefixes.count(p) > 1))
    out = [row for row in db_index
           if "_".join(row[5].split("_")[:-1]) not in duplicated]
    for base in duplicated:
        versions = [row[4] for row in db_index
                    if "_".join(row[5].split("_")[:-1]) == base]
        # NOTE(review): lexicographic max — assumes version strings sort
        # correctly as text (e.g. zero-padded); TODO confirm upstream format.
        wanted = base + "_" + sorted(versions)[-1]
        for row in db_index:
            if row[5] == wanted:
                out.append(row)
                break
    return out
50 | |
def frogs_sources(data_manager_dict, target_directory):
    """Download the selected FROGS reference databases into
    *target_directory* and register each in the ``frogs_db`` data table.

    Reads the module-level ``args`` namespace (set in ``main``) for the
    selection options.  Entries are appended to *data_manager_dict* via
    ``_add_data_table_entry``; nothing is returned.

    Fixes vs. previous revision: ``urllib.URLopener()`` does not exist on
    Python 3 (downloads now reuse ``requests``, already used for the index);
    removed unused locals (``decoded_content``, ``last_version_dict``,
    ``release``, ``date``) and loop variables shadowing the builtins
    ``filter`` and ``file``.
    """
    # Selection criteria — only honoured when the user did not ask for all dbs.
    amplicons_list = []
    bases_list = []
    filters_list = []
    bottom_date = 0
    if args.all_dbs == "false":
        amplicons_list = [a.lower().strip() for a in args.amplicons.split(",") if a != ""]
        bases_list = [b.lower().strip() for b in args.bases.split(",") if b != ""]
        filters_list = [flt.lower().strip() for flt in args.filters.split(",") if flt != ""]
        bottom_date = int(args.date)
    tool_data_path = args.tool_data

    # Fetch the remote index of available FROGS databases (TSV; header skipped).
    frogs_db_index_link = "http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    with requests.Session() as session:
        download = session.get(frogs_db_index_link)
        db_index = download.content.splitlines()
        db_index = [line.split("\t") for line in db_index[1:]]
        # Normalise the amplicon/base/filter columns for case-insensitive matching.
        db_index = [[row[0], row[1].lower(), row[2].lower(), row[3].lower()] + row[4:]
                    for row in db_index]

    # Apply the user's filters.
    if args.all_dbs == "false":
        if amplicons_list:
            # row[1] may list several amplicons separated by commas.
            db_index = [row for row in db_index
                        if any(a in amplicons_list for a in row[1].split(','))]
        if bases_list:
            db_index = [row for row in db_index if row[2] in bases_list]       # filter by base
        if filters_list:
            db_index = [row for row in db_index if row[3] in filters_list]    # filter by filters
        if bottom_date != 0:
            db_index = [row for row in db_index if int(row[0]) >= bottom_date]  # filter by date
        db_index = keep_only_last_version(db_index)  # keep only last version

    # Download every selected database that is not already installed.
    os.chdir(target_directory)
    dir_name = "frogs_db_" + time.strftime("%Y%m%d")
    os.mkdir(dir_name)
    dbs = set([])
    for row in db_index:
        value = row[5]                      # canonical db name, e.g. "silva_..."
        name = value.replace("_", " ")      # human-readable label
        link = row[6]                       # download URL of the .tar.gz archive
        name_dir = link.replace(".tar.gz", "").split("/")[-1]
        file_path = tool_data_path + "/frogs_db/" + name_dir
        if not os.path.exists(file_path):   # skip dbs already in the frogs_db directory

            # Download the archive in chunks to avoid holding it all in memory.
            response = requests.get(link, stream=True)
            with open("tmp.tar.gz", "wb") as archive:
                for chunk in response.iter_content(chunk_size=65536):
                    archive.write(chunk)

            # Unpack it.  NOTE(review): extractall trusts the server not to
            # ship path-traversal member names.
            with tarfile.open("tmp.tar.gz") as tar:
                tar.extractall(dir_name)
            os.remove("tmp.tar.gz")

            # The newly extracted directory is the one we have not seen yet;
            # locate its .fasta file for the data-table path.
            seen = set(os.listdir(dir_name))
            new_db = dir_name + "/" + "".join(seen.difference(dbs))
            fasta = "".join([fn for fn in os.listdir(new_db) if fn.endswith(".fasta")])
            path = os.path.join(target_directory, new_db + "/" + fasta)
            dbs = os.listdir(dir_name)

            data_table_entry = dict(name=name, value=value, path=path)
            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
68 | 118 |
69 def HVL_sources(data_manager_dict,target_directory): | 119 def HVL_sources(data_manager_dict,target_directory): |
70 | 120 |
71 #get phiX files | 121 #get phiX files |
72 os.chdir(target_directory) | 122 os.chdir(target_directory) |
87 _add_data_table_entry(data_manager_dict, data_table_entry, "HVL_db") | 137 _add_data_table_entry(data_manager_dict, data_table_entry, "HVL_db") |
88 | 138 |
def main():
    """Entry point: parse options, dispatch to the requested source builder,
    and write the resulting data-table JSON back to the output file.

    Fixes vs. previous revision: file handles are now closed via ``with``,
    and the output is opened in text mode ('w') — ``'wb'`` rejects the
    serialized str on Python 3.
    """
    # Parsed options are shared with the source builders via this module global.
    global args
    args = get_args()

    # The output file initially holds the Galaxy-supplied JSON params.
    # NOTE(review): from_json_string/to_json_string are not defined in the
    # visible part of this file — presumably Galaxy's JSON helpers; confirm.
    data_manager_dict = {}
    filename = args.output
    with open(filename) as handle:
        params = from_json_string(handle.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    if args.database == "frogs_db_data":
        frogs_sources(data_manager_dict, target_directory)
    elif args.database == "HVL_db_data":
        HVL_sources(data_manager_dict, target_directory)

    # Persist the collected data-table entries for Galaxy to pick up.
    with open(filename, 'w') as handle:
        handle.write(to_json_string(data_manager_dict))