annotate data_manager/FROGS_data_manager.py @ 7:99b2dfb1fa1c draft

planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
author dchristiany
date Fri, 04 Oct 2019 08:57:45 -0400
parents d11bc4a8f596
children 0d9cb5c5aa35
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
1 # -*- coding: utf-8 -*-
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
2 from galaxy.util.json import from_json_string, to_json_string
5
ac4fb9d97a51 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents: 4
diff changeset
3 import os, sys, argparse, time, json, requests, urllib, tarfile
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
4
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def get_args():
    """Parse the command-line options of this script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``, carrying the
        ``database``, ``resource`` and ``output`` attributes. An option that
        is not given on the command line is left as ``None``.
    """
    cli = argparse.ArgumentParser()
    # (short flag, long flag) pairs, registered in display order.
    option_flags = (
        ("-d", "--database"),
        ("-r", "--resource"),
        ("-o", "--output"),
    )
    for short_flag, long_flag in option_flags:
        cli.add_argument(short_flag, long_flag)
    return cli.parse_args()
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
12
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    """Append one entry to a named table inside ``data_manager_dict``.

    The ``'data_tables'`` mapping and the list stored under ``data_table``
    are created on first use; existing entries are kept.

    Args:
        data_manager_dict: Dictionary that is updated in place.
        data_table_entry: Object appended to the table's list.
        data_table: Key of the table under ``'data_tables'``.

    Returns:
        The same ``data_manager_dict`` object that was passed in.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    tables.setdefault(data_table, []).append(data_table_entry)
    return data_manager_dict
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
18
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def _read_frogs_db_index(index_url):
    """Download the FROGS index TSV and return ``(value, link)`` pairs.

    ``value`` joins columns 1-3 of a row with underscores and ``link`` is
    column 4. The first line (column header) and rows with fewer than five
    columns are skipped.
    """
    with requests.Session() as session:
        response = session.get(index_url)
        # Fail here on an HTTP error instead of parsing an error page as TSV.
        response.raise_for_status()
        # Work on decoded text: splitting the raw bytes with a str separator
        # raises TypeError on Python 3.
        text = response.content.decode('utf-8')

    entries = []
    for row in text.splitlines()[1:]:
        columns = row.split("\t")
        if len(columns) < 5:
            # Blank or truncated row: no download link to use.
            continue
        entries.append(("_".join(columns[1:4]), columns[4]))
    return entries


def _extract_frogs_archive(link, archive_path, dest_dir):
    """Download ``link`` to ``archive_path``, unpack it into ``dest_dir``.

    Returns the sorted top-level entry names found in the archive. The
    downloaded archive is removed afterwards, even when extraction fails.
    """
    # urllib.URLopener only exists on Python 2; urlretrieve is available on
    # both major versions under a different module path.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    dest_root = os.path.realpath(dest_dir)
    top_level = set()
    try:
        urlretrieve(link, archive_path)
        with tarfile.open(archive_path) as tar:
            for member in tar.getmembers():
                # The archive comes from the network: refuse member names
                # that would be written outside dest_dir ("../", absolute).
                # NOTE(review): link members are not inspected here.
                resolved = os.path.realpath(os.path.join(dest_root, member.name))
                if resolved != dest_root and not resolved.startswith(dest_root + os.sep):
                    raise ValueError(
                        "Unsafe path {0!r} in archive {1}".format(member.name, link))
                first = os.path.normpath(member.name).split(os.sep)[0]
                if first not in ("", os.curdir):
                    top_level.add(first)
            tar.extractall(dest_dir)
    finally:
        if os.path.exists(archive_path):
            os.remove(archive_path)
    return sorted(top_level)


def _find_fasta(db_dir):
    """Return the path of the single ``.fasta`` file directly inside ``db_dir``."""
    fasta_files = sorted(
        entry for entry in os.listdir(db_dir) if entry.endswith('.fasta'))
    if len(fasta_files) != 1:
        raise ValueError(
            "Expected exactly one .fasta file in {0}, found {1}".format(
                db_dir, len(fasta_files)))
    return os.path.join(db_dir, fasta_files[0])


def frogs_sources(data_manager_dict,target_directory):
    """Download every FROGS database of the index and register it.

    For each row of the remote index, the linked ``.tar.gz`` archive is
    unpacked under ``<target_directory>/frogs_db_<YYYYMMDD>/`` and an entry
    with ``name``, ``value`` and ``path`` (the database's ``.fasta`` file) is
    appended to the ``"frogs_db"`` table of ``data_manager_dict``.

    All paths are built from ``target_directory``; the working directory of
    the process is left unchanged.

    Args:
        data_manager_dict: Dictionary updated in place with the new entries.
        target_directory: Existing directory receiving the databases.

    Raises:
        requests.HTTPError: If the index download returns an error status.
        ValueError: If an archive contains an unsafe member path, does not
            unpack to exactly one top-level directory, or that directory
            does not hold exactly one ``.fasta`` file.
    """
    #get frogs database index
    frogs_db_index_link="http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    entries = _read_frogs_db_index(frogs_db_index_link)

    #get frogs dbs
    db_root = os.path.join(target_directory, "frogs_db_"+time.strftime("%Y%m%d"))
    # Tolerate a second run on the same day.
    if not os.path.isdir(db_root):
        os.makedirs(db_root)
    archive_path = os.path.join(target_directory, "tmp.tar.gz")

    for value, link in entries:
        top_level = _extract_frogs_archive(link, archive_path, db_root)
        # Concatenating several names would register a path that does not
        # exist, so anything but a single database directory is an error.
        if len(top_level) != 1 or not os.path.isdir(os.path.join(db_root, top_level[0])):
            raise ValueError(
                "Archive {0} must unpack to exactly one directory, got {1}".format(
                    link, top_level))

        path = _find_fasta(os.path.join(db_root, top_level[0]))
        data_table_entry = dict(name = value.replace("_"," "), value = value, path=path)
        _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
63
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
# TODO: HVL_sources(resource) is not implemented yet.
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
65
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
# TODO: phiX_sources(resource) is not implemented yet.
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
67
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
def main():
    """Run the data manager for the database selected on the command line.

    The file given with ``-o/--output`` is first read as JSON to obtain
    ``output_data[0]['extra_files_path']`` (the target directory, created if
    missing). The same file is then overwritten with the JSON form of the
    collected data table entries.

    Raises:
        NotImplementedError: If ``--database`` is ``HVL_db_data`` or
            ``phiX_db_data``, whose handlers are not defined in this script.
    """
    #get args from command line
    args = get_args()

    # Extract json file params
    data_manager_dict = {}
    filename = args.output
    with open(filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    # Tolerate an already existing directory and create missing parents.
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    if args.database=="frogs_db_data":
        frogs_sources(data_manager_dict,target_directory)
    elif args.database in ("HVL_db_data", "phiX_db_data"):
        # The matching *_sources functions only exist as commented-out
        # stubs; report that clearly instead of failing with a NameError.
        raise NotImplementedError(
            "Database '{0}' is not supported yet".format(args.database))

    #save info to json file
    # Text mode: writing the JSON string to a file opened with 'wb' raises
    # TypeError on Python 3.
    with open(filename, 'w') as output_file:
        output_file.write(to_json_string(data_manager_dict))
d11bc4a8f596 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents: 5
diff changeset
89
0
7caea40b2a30 planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff changeset
# Run the data manager only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()