Mercurial > repos > dchristiany > frogs_data_manager
annotate data_manager/FROGS_data_manager.py @ 4:95d5191f1dc4 draft
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
author | dchristiany |
---|---|
date | Thu, 03 Oct 2019 03:15:17 -0400 |
parents | 7caea40b2a30 |
children | ac4fb9d97a51 |
rev | line source |
---|---|
0
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
1 # -*- coding: utf-8 -*- |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
2 from galaxy.util.json import from_json_string, to_json_string |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
3 import os, sys, argparse, time, json, requests, urllib |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
4 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
def get_args():
    """Parse the command-line options of the data manager script.

    Three optional string options are declared, each with a short and a
    long spelling: ``-d/--database``, ``-r/--resource`` and ``-o/--output``.
    An option that is not given on the command line is ``None``.

    Returns:
        argparse.Namespace: namespace with the attributes ``database``,
        ``resource`` and ``output``.
    """
    cli = argparse.ArgumentParser()
    option_flags = (
        ("-d", "--database"),
        ("-r", "--resource"),
        ("-o", "--output"),
    )
    for short_flag, long_flag in option_flags:
        cli.add_argument(short_flag, long_flag)
    return cli.parse_args()
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
12 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
13 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table): |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
14 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
15 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
16 data_manager_dict['data_tables'][data_table].append(data_table_entry) |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
17 return data_manager_dict |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
18 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
def frogs_sources(data_manager_dict, target_directory, max_databases=2):
    """Download FROGS databases and register them in the ``frogs_db`` table.

    The function fetches the tab-separated FROGS database index, then for
    each selected row downloads the referenced ``.tar.gz`` archive, extracts
    it into a dated ``frogs_db_YYYYMMDD`` directory created in the current
    working directory, locates the ``.fasta`` file of the newly extracted
    database and appends a ``name``/``value``/``path`` entry to the
    ``frogs_db`` data table of ``data_manager_dict``.

    Args:
        data_manager_dict: dict updated in place with the new table entries.
        target_directory: directory prefix joined in front of the recorded
            fasta path.
        max_databases: number of index rows to process, counted from the top
            of the index. Defaults to 2, the limit that was previously
            hard-coded; pass ``None`` to process every row.

    Raises:
        requests.HTTPError: if the index or an archive cannot be downloaded.
        OSError: if the dated directory already exists.
    """
    # Local import: the module header does not import tarfile.
    import tarfile

    frogs_db_index_link = "http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    archive_name = "tmp.tar.gz"

    with requests.Session() as s:
        # get frogs database index
        download = s.get(frogs_db_index_link)
        download.raise_for_status()
        # Work on the decoded text, not the raw bytes, so that splitting on
        # "\t" behaves the same on Python 2 and Python 3.
        decoded_content = download.content.decode('utf-8')
        # The first row is the header; blank rows carry no database.
        db_index = [
            line.split("\t")
            for line in decoded_content.splitlines()[1:]
            if line.strip()
        ]
        # Insert a synthesized name column at index 4; the original column 4
        # (used below as the download link) moves to index 5.
        db_index = [
            line[:4] + [line[1] + "_" + line[2] + "_" + line[1]] + [line[4]]
            for line in db_index
        ]

        # get frogs dbs
        # NOTE(review): archives are extracted relative to the current
        # working directory, while the recorded path is prefixed with
        # target_directory -- confirm the caller relocates the files.
        dir_name = "frogs_db_" + time.strftime("%Y%m%d")
        os.mkdir(dir_name)
        known_dbs = set()
        selected = db_index if max_databases is None else db_index[:max_databases]
        for line in selected:
            value = line[4]
            name = value.replace("_", " ")
            link = line[5]

            # download frogs db (streamed to disk in chunks)
            response = s.get(link, stream=True)
            response.raise_for_status()
            with open(archive_name, "wb") as archive:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        archive.write(chunk)

            # unzip frogs db; always drop the temporary archive afterwards
            try:
                # NOTE(review): extractall() trusts member paths inside the
                # archive; only use with archives from a trusted server.
                with tarfile.open(archive_name) as tar:
                    tar.extractall(dir_name)
            finally:
                os.remove(archive_name)

            # get fasta file path
            # Assumes each archive adds exactly one new top-level directory
            # holding exactly one .fasta file; several matches would be
            # concatenated into a single (invalid) name.
            extracted = set(os.listdir(dir_name))
            new_db = os.path.join(dir_name, "".join(extracted.difference(known_dbs)))
            fasta = "".join(
                entry for entry in os.listdir(new_db) if entry.endswith('.fasta')
            )
            known_dbs = extracted
            path = os.path.join(target_directory, new_db, fasta)

            data_table_entry = dict(name=name, value=value, path=path)
            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
62 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
63 #def HVL_sources(resource): |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
64 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
65 #def phiX_sources(resource): |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
66 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
def main():
    """Run the data manager: read the params, fetch data, write the result.

    The file given by ``--output`` is first read as JSON to obtain the
    target directory (``params['output_data'][0]['extra_files_path']``).
    After the selected source has been processed, the same file is
    overwritten with the JSON serialization of the collected data table
    entries.

    Raises:
        NotImplementedError: for ``HVL_db_data`` and ``phiX_db_data``, whose
            source functions are not defined in this file.
    """
    # get args from command line
    args = get_args()

    # Extract json file params
    data_manager_dict = {}
    filename = args.output
    with open(filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    # Create the directory (and any missing parents) unless it already exists.
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    if args.database == "frogs_db_data":
        frogs_sources(data_manager_dict, target_directory)
    elif args.database in ("HVL_db_data", "phiX_db_data"):
        # The matching *_sources functions exist only as commented-out stubs,
        # so fail with an explicit message instead of a NameError.
        raise NotImplementedError(
            "database '%s' is not supported yet" % args.database
        )

    # Save the collected data table entries to the output json file;
    # without this step the result of the run would be discarded.
    with open(filename, 'w') as output_file:
        output_file.write(to_json_string(data_manager_dict))
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
85 |
7caea40b2a30
planemo upload commit c3ff1475af0e964a0c61458b66e2744c903d8d3d-dirty
dchristiany
parents:
diff
changeset
|
# Script entry point: run the data manager only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()