annotate data_manager/FROGS_data_manager.py @ 9:0d9cb5c5aa35 draft

planemo upload commit af25bf8e6c89c9de2e25dc0bc0ead06d8c05cf17
author dchristiany
date Mon, 07 Oct 2019 10:56:23 -0400
parents 99b2dfb1fa1c
children 238a5328279d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
1 # -*- coding: utf-8 -*-
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
2 from galaxy.util.json import from_json_string, to_json_string
5
ac4fb9d97a51 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents: 4
diff changeset
3 import os, sys, argparse, time, json, requests, urllib, tarfile
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
4
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def get_args():
    """Parse the command-line options of the data manager.

    The recognised options are ``-d/--database``, ``--custom_db``,
    ``--amplicons`` and ``-o/--output``. None of them is required and each
    one is stored as a string (``None`` when the option is absent).

    Returns:
        argparse.Namespace: the parsed options.
    """
    # One tuple of flags per option, registered in declaration order.
    option_flags = (
        ("-d", "--database"),
        ("--custom_db",),
        ("--amplicons",),
        ("-o", "--output"),
    )
    cli = argparse.ArgumentParser()
    for flags in option_flags:
        cli.add_argument(*flags)
    return cli.parse_args()
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
13
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def _add_data_table_entry(data_manager_dict, data_table_entry,data_table):
    """Append ``data_table_entry`` to the table ``data_table``.

    The ``'data_tables'`` mapping and the list for ``data_table`` are created
    inside ``data_manager_dict`` when they do not exist yet.

    Returns:
        The same ``data_manager_dict`` object, updated in place.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    tables.setdefault(data_table, []).append(data_table_entry)
    return data_manager_dict
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
19
9
0d9cb5c5aa35 planemo upload commit af25bf8e6c89c9de2e25dc0bc0ead06d8c05cf17
dchristiany
parents: 7
diff changeset
def _frogs_parse_index(index_text, amplicons_list):
    """Turn the text of the FROGS index TSV into ``(value, link)`` pairs.

    The first line is skipped as a header. For every other row the entry
    value is columns 1, 2 and 3 joined with underscores and the download link
    is column 4. When ``amplicons_list`` is non-empty, only rows whose value
    is listed in it are kept.
    """
    entries = []
    for row in index_text.splitlines()[1:]:
        if not row.strip():
            continue  # tolerate blank lines in the index
        columns = row.split("\t")
        link = columns[4]  # IndexError here means a malformed index row
        value = "_".join(columns[1:4])
        if amplicons_list and value not in amplicons_list:
            continue
        entries.append((value, link))
    return entries


def _frogs_download(session, url, destination):
    """Stream ``url`` into the local file ``destination`` using ``session``."""
    response = session.get(url, stream=True)
    response.raise_for_status()
    with open(destination, "wb") as handle:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                handle.write(chunk)


def _frogs_safe_extract(tar, destination):
    """Extract ``tar`` into ``destination``, refusing members that escape it."""
    root = os.path.realpath(destination)
    for member in tar.getmembers():
        resolved = os.path.realpath(os.path.join(root, member.name))
        if resolved != root and not resolved.startswith(root + os.sep):
            raise ValueError("refusing to extract %r outside of %r" % (member.name, destination))
    tar.extractall(destination)


def _frogs_single(candidates, description):
    """Return the only element of ``candidates`` or raise ``ValueError``."""
    if len(candidates) != 1:
        raise ValueError("expected exactly one %s, found %d: %r"
                         % (description, len(candidates), candidates))
    return candidates[0]


def frogs_sources(data_manager_dict,target_directory,amplicons_list):
    """Download the FROGS databases and register them in the ``frogs_db`` table.

    The index TSV is fetched, optionally filtered by ``amplicons_list``, and
    every listed archive is downloaded and extracted below
    ``<target_directory>/frogs_db_<YYYYMMDD>``. For each archive the single
    ``.fasta`` file found in the newly extracted directory is recorded as an
    entry (``name``, ``value``, ``path``) of ``data_manager_dict``.

    Args:
        data_manager_dict: dictionary updated in place with the new entries.
        target_directory: directory that receives the databases; the process
            working directory is changed to it.
        amplicons_list: entry values to keep; an empty list keeps everything.

    Raises:
        requests.RequestException: when the index or an archive cannot be
            downloaded (HTTP error statuses included).
        ValueError: when an archive does not yield exactly one new directory
            containing exactly one ``.fasta`` file, or when an archive member
            would be extracted outside of the database directory.
    """
    frogs_db_index_link="http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    # Computed once so the directory name cannot change mid-run.
    dir_name = "frogs_db_" + time.strftime("%Y%m%d")
    archive_name = "tmp.tar.gz"

    with requests.Session() as session:
        #get frogs database index
        response = session.get(frogs_db_index_link)
        response.raise_for_status()
        # Decode before splitting so rows are text on Python 2 and 3 alike.
        entries = _frogs_parse_index(response.content.decode('utf-8'), amplicons_list)

        #get frogs dbs
        os.chdir(target_directory)
        os.mkdir(dir_name)
        known_entries = set()
        for value, link in entries:
            #download and unpack frogs db, always dropping the temporary archive
            _frogs_download(session, link, archive_name)
            try:
                with tarfile.open(archive_name) as tar:
                    _frogs_safe_extract(tar, dir_name)
            finally:
                os.remove(archive_name)

            #get fasta file path: the database is the entry that just appeared
            present_entries = set(os.listdir(dir_name))
            new_entry = _frogs_single(sorted(present_entries - known_entries),
                                      "new entry extracted from " + link)
            known_entries = present_entries
            db_dir = os.path.join(dir_name, new_entry)
            fasta = _frogs_single(sorted(f for f in os.listdir(db_dir) if f.endswith('.fasta')),
                                  ".fasta file in " + db_dir)
            path = os.path.join(target_directory, db_dir, fasta)

            data_table_entry = dict(name = value.replace("_"," "), value = value, path=path)
            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
68
9
0d9cb5c5aa35 planemo upload commit af25bf8e6c89c9de2e25dc0bc0ead06d8c05cf17
dchristiany
parents: 7
diff changeset
def HVL_sources(data_manager_dict,target_directory):
    """Download the UNITE 7.1 ITS1/ITS2 FASTA files and register them in ``HVL_db``.

    Each file is saved in ``target_directory`` under its original name with
    the current date (``YYYY-MM-DD``) inserted before the ``.fasta``
    extension, and an entry (``name``, ``value``, ``path``) is added to
    ``data_manager_dict``.

    Args:
        data_manager_dict: dictionary updated in place with the new entries.
        target_directory: directory that receives the files; the process
            working directory is changed to it.

    Raises:
        requests.RequestException: when a file cannot be downloaded (HTTP
            error statuses included).
    """
    base_url = "http://genoweb.toulouse.inra.fr/frogs_databanks/HVL/ITS/UNITE_s_7.1_20112016/"
    # Computed once so the file name, value and display name always agree.
    today = time.strftime("%Y-%m-%d")

    #get UNITE ITS files
    os.chdir(target_directory)
    with requests.Session() as session:
        for region in ("ITS1", "ITS2"):
            link = base_url + "Unite_s_7.1_20112016_" + region + ".fasta"
            value = "Unite_s_7.1_20112016_" + region + "_" + today
            file_name = value + ".fasta"

            # Stream to disk; an HTTP error status raises instead of saving an error page.
            response = session.get(link, stream=True)
            response.raise_for_status()
            with open(file_name, "wb") as handle:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        handle.write(chunk)

            #get fasta file path
            path = os.path.join(target_directory,file_name)
            name = "UNITE 7.1 " + region + " " + today

            data_table_entry = dict(name = name, value = value, path=path)
            _add_data_table_entry(data_manager_dict, data_table_entry, "HVL_db")
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
88
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def main():
    """Run the data manager: read the parameters, fetch the data, write the result.

    The file given by ``--output`` is first read as JSON to obtain the target
    directory (``output_data[0].extra_files_path``). It is then overwritten
    with the JSON description of the data table entries produced by
    ``frogs_sources`` or ``HVL_sources``, depending on ``--database``.
    """
    #get args from command line
    args = get_args()

    # Amplicon filtering only applies to a custom FROGS selection. A missing
    # --amplicons option means "no filter" instead of an AttributeError.
    amplicons_list = []
    if args.database=="frogs_db_data" and args.custom_db=="true" and args.amplicons is not None:
        amplicons_list = args.amplicons.split(",")

    # Extract json file params
    data_manager_dict = {}
    filename = args.output
    with open(filename) as handle:
        params = from_json_string(handle.read())
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    # Create missing parents and tolerate a directory that already exists.
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    if args.database=="frogs_db_data":
        frogs_sources(data_manager_dict,target_directory,amplicons_list)
    elif args.database=="HVL_db_data":
        HVL_sources(data_manager_dict,target_directory)

    #save info to json file (text mode: the serialised JSON is a string)
    with open(filename, 'w') as handle:
        handle.write(to_json_string(data_manager_dict))
d11bc4a8f596 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents: 5
diff changeset
112
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
# Entry point: run the data manager only when the file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()