Mercurial > repos > dchristiany > data_manager_proteore
annotate data_manager/resource_building.py @ 15:83f57ba70416 draft
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
author | dchristiany |
---|---|
date | Tue, 15 Jan 2019 04:29:28 -0500 |
parents | a1530507fee4 |
children | 454c2e2984ea |
rev | line source |
---|---|
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
1 """ |
10
2f153b41b6fe
planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
7
diff
changeset
|
2 The purpose of this script is to create source files from different databases to be used in other proteore tools |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
3 """ |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
4 |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
5 import os, sys, argparse, requests, time, csv, re, json, zipfile, shutil |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
6 from io import BytesIO |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
7 from zipfile import ZipFile |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
8 from galaxy.util.json import from_json_string, to_json_string |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
9 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
10 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
11 # General functions |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
12 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def unzip(url, output_file):
    """
    Download a zip archive from ``url`` and write its first member to
    ``output_file``.

    Only the first entry listed by the archive is extracted; any other
    member is ignored. The member is written as raw bytes, exactly as
    stored in the archive.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and lets ``ZipFile`` raise if the payload is not a zip archive.
    """
    response = requests.get(url)
    # Fail on an HTTP error rather than trying to unzip an error page.
    response.raise_for_status()

    # The local name must not be ``zipfile``: that would shadow the module.
    with ZipFile(BytesIO(response.content)) as archive:
        member_bytes = archive.read(archive.namelist()[0])

    # ZipFile hands back bytes, so the output is opened in binary mode.
    with open(output_file, "wb") as output:
        output.write(member_bytes)
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
24 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
25 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table): |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
26 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
27 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
28 data_manager_dict['data_tables'][data_table].append(data_table_entry) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
29 return data_manager_dict |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
30 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
31 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
32 # 1. Human Protein Atlas |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
33 # - Normal tissue |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
34 # - Pathology |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
35 # - Full Atlas |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
36 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def HPA_sources(data_manager_dict, tissue, target_directory):
    """
    Download one Human Protein Atlas file and register it in the
    "proteore_protein_atlas" data table.

    ``tissue`` selects the file and must be one of "HPA_normal_tissue",
    "HPA_pathology" or "HPA_full_atlas". The archive is fetched with
    ``unzip`` and saved in ``target_directory`` as
    ``<tissue>_<dd-mm-YYYY>.tsv``. The entry added to ``data_manager_dict``
    carries the keys ``id``, ``name``, ``value`` and ``path``.

    Raises ``ValueError`` for an unknown ``tissue``.
    """
    # tissue key -> (display name, download URL)
    hpa_files = {
        "HPA_normal_tissue": ("HPA normal tissue",
                              "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"),
        "HPA_pathology": ("HPA pathology",
                          "https://www.proteinatlas.org/download/pathology.tsv.zip"),
        "HPA_full_atlas": ("HPA full atlas",
                           "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"),
    }
    if tissue not in hpa_files:
        # Without this check an unknown key would only fail later with an
        # unbound local variable.
        raise ValueError("Unknown Human Protein Atlas source: %r" % (tissue,))
    tissue_name, url = hpa_files[tissue]

    # Take the date once so the file name and the table entry always agree.
    date = time.strftime("%d-%m-%Y")
    output_file = tissue + "_" + date + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save file

    tissue_name = tissue_name + " " + date.replace("-", "/")
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, value=tissue, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_protein_atlas")
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
56 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
57 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
58 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
59 # 2. Peptide Atlas |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
60 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    """
    Download peptide observations from PeptideAtlas for one sample category
    and register the resulting file in the "proteore_peptide_atlas" data
    table.

    ``tissue`` is split on "-": the first field is used as the
    ``sample_category_id`` of the query and the second as the tissue label.
    The response is reduced by ``build_dictionary`` and written to
    ``target_directory`` as ``<label>_<dd-mm-YYYY>.tsv`` with the header
    ``Uniprot_AC``, ``nb_obs``. The entry added to ``data_manager_dict``
    carries the keys ``id``, ``name``, ``value`` (the file path) and
    ``tissue``.

    Raises ``IndexError`` when ``tissue`` contains no "-" and
    ``requests.HTTPError`` when the server answers with an error status.
    """
    # Define PA Human build released number (here early 2018)
    atlas_build_id = "472"
    # Define organism_id (here Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"

    # Extract sample_category_id and tissue label
    tissue_fields = tissue.split("-")
    sample_category_id = tissue_fields[0]
    tissue = tissue_fields[1]

    date = time.strftime("%d-%m-%Y")
    output_file = tissue + "_" + date + ".tsv"

    # No whitespace inside the query string (the original had a stray
    # blank after "organism_id=").
    query = ("https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id="
             + atlas_build_id + "&display_options=ShowMappings&organism_id="
             + organism_id + "&sample_category_id=" + sample_category_id
             + "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY")
    download = requests.get(query)
    # Fail on an HTTP error rather than parsing an error page as TSV.
    download.raise_for_status()
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    # build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value
    uni_dict = build_dictionary(cr)

    # columns of data table peptide_atlas
    tissue_id = tissue + "_" + date
    tissue_name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] < 3:
        out = open(path, "wb")
    else:
        out = open(path, "w", newline="")
    with out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=tissue_name, value=path, tissue=tissue)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
94 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
95 #function to count the number of observations by uniprot id |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def build_dictionary(csv):
    """
    Sum the number of observations per UniProt accession.

    ``csv`` is an iterable of rows (lists of strings). For each row the
    identifier is read from index 2 and the observation count from index 4.
    Identifiers containing "-" (isoforms) and identifiers rejected by
    ``check_uniprot_access`` are ignored. Rows with fewer than five fields,
    such as blank lines, are skipped.

    Returns a dict mapping accession -> summed count (int).
    Raises ``ValueError`` when the count of a kept row is not an integer.
    """
    uni_dict = {}
    for line in csv:
        # Both index 2 and index 4 must exist; csv.reader yields [] for
        # blank lines.
        if len(line) < 5:
            continue
        accession = line[2]
        if "-" in accession or not check_uniprot_access(accession):
            continue
        uni_dict[accession] = uni_dict.get(accession, 0) + int(line[4])

    return uni_dict
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
106 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
107 #function to check whether an id is a UniProt accession number: returns True or False |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
# UniProt accession number format. The whole string has to match, so the
# alternatives are grouped and anchored at the end; compiled once at import.
_UNIPROT_AC_PATTERN = re.compile(
    r"(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})\Z"
)


def check_uniprot_access(id):
    """
    Return True when ``id`` is a UniProt accession number, False otherwise.

    The complete string must have the accession format: an identifier that
    merely starts with an accession (for example one followed by extra
    characters) is rejected.
    """
    return _UNIPROT_AC_PATTERN.match(id) is not None
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
114 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
115 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
116 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
117 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
118 # 3. ID mapping file |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
119 ####################################################################################################### |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
120 import ftplib, gzip |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
# Raise the csv field size limit to handle big files. sys.maxsize does not
# fit in a C long on every platform (csv then raises OverflowError), so back
# off until a value is accepted.
_max_field_size = sys.maxsize
while True:
    try:
        csv.field_size_limit(_max_field_size)
        break
    except OverflowError:
        _max_field_size = _max_field_size // 10
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
122 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
123 def id_mapping_sources (data_manager_dict, species, target_directory) : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
124 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
125 human = species == "human" |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
128 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
129 #header |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
132 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
133 #print("header ok") |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
134 |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
135 #get selected.tab and keep only ids of interest |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
136 selected_tab_file=species_dict[species]+"_"+files[0] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
137 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
138 with gzip.open(tab_path,"rt") as select : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
139 tab_reader = csv.reader(select,delimiter="\t") |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
140 for line in tab_reader : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
141 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
142 os.remove(tab_path) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
143 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
144 #print("selected_tab ok") |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
145 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
146 """ |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
147 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
148 -NextProt,BioGrid,STRING,KEGG |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
149 """ |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
150 |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
151 #there's more id type for human |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
152 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
153 else : ids = ['BioGrid','STRING','KEGG' ] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
154 unidict = {} |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
155 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
156 #keep only ids of interest in dictionaries |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
157 dat_file=species_dict[species]+"_"+files[1] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
158 dat_path = download_from_uniprot_ftp(dat_file,target_directory) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
159 with gzip.open(dat_path,"rt") as dat : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
160 dat_reader = csv.reader(dat,delimiter="\t") |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
161 for line in dat_reader : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
162 uniprotID=line[0] #UniProtID as key |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
163 id_type=line[1] #ID type of corresponding id, key of sub-dictionnary |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
164 cor_id=line[2] #corresponding id |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
165 if "-" not in id_type : #we don't keep isoform |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
166 if id_type in ids and uniprotID in unidict : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
167 if id_type in unidict[uniprotID] : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
168 unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
169 else : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
170 unidict[uniprotID].update({ id_type : cor_id }) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
171 elif id_type in ids : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
172 unidict[uniprotID]={id_type : cor_id} |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
173 os.remove(dat_path) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
174 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
175 #print("dat_file ok") |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
176 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
177 #add ids from idmapping.dat to the final tab |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
178 for line in tab[1:] : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
179 uniprotID=line[0] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
180 if human : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
181 if uniprotID in unidict : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
182 nextprot = access_dictionary(unidict,uniprotID,'neXtProt') |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
183 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0]) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
184 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
185 access_dictionary(unidict,uniprotID,'KEGG')]) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
186 else : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
187 line.extend(["","","",""]) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
188 else : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
189 if uniprotID in unidict : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
190 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
191 access_dictionary(unidict,uniprotID,'KEGG')]) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
192 else : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
193 line.extend(["","",""]) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
194 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
195 #print ("tab ok") |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
196 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
197 #add missing nextprot ID for human |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
198 if human : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
199 #build next_dict |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
200 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
201 next_dict = {} |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
202 for nextid in nextprot_ids : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
203 next_dict[nextid.replace("NX_","")] = nextid |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
204 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt")) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
205 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
206 #add missing nextprot ID |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
207 for line in tab[1:] : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
208 uniprotID=line[0] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
209 nextprotID=line[13] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
210 if nextprotID == '' and uniprotID in next_dict : |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
211 line[13]=next_dict[uniprotID] |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
212 |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
213 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
214 path = os.path.join(target_directory,output_file) |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
215 |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
216 with open(path,"w") as out : |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
217 w = csv.writer(out,delimiter='\t') |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
218 w.writerows(tab) |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
219 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
220 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} |
10
2f153b41b6fe
planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
7
diff
changeset
|
221 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") |
2f153b41b6fe
planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
7
diff
changeset
|
222 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
223 |
10
2f153b41b6fe
planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
7
diff
changeset
|
224 data_table_entry = dict(id=id, name = name, value = species, path = path) |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
225 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping") |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
226 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def download_from_uniprot_ftp(file,target_directory) :
    """Download one file from the UniProt FTP id-mapping directory.

    file             -- name of the remote file inside
                        pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
    target_directory -- local directory in which the copy is written
                        (under the same file name)

    Returns the local path of the downloaded file.
    Errors raised by ftplib or by opening the local file propagate to the caller.
    """
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    try :
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # the with-block guarantees the local file is flushed and closed
        # before the path is handed back to the caller
        with open(path, 'wb') as handle :
            ftp.retrbinary("RETR " + file, handle.write)
        ftp.quit()
    finally :
        # releases the socket when any step above failed; harmless after quit()
        ftp.close()
    return (path)
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
236 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def id_list_from_nextprot_ftp(file,target_directory) :
    """Download an accession list from the neXtProt FTP server and return its lines.

    file             -- name of the remote file inside pub/current_release/ac_lists/
    target_directory -- local directory in which the downloaded copy is written

    Returns the content of the file as a list of lines (line endings removed).
    The downloaded copy is left in target_directory; this function does not delete it.
    Errors raised by ftplib or by opening the local file propagate to the caller.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try :
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # close the local file explicitly so it is complete on disk
        # before it is read back below
        with open(path, 'wb') as handle :
            ftp.retrbinary("RETR " + file, handle.write)
        ftp.quit()
    finally :
        # releases the socket when any step above failed; harmless after quit()
        ftp.close()
    with open(path,'r') as handle :
        nextprot_ids = handle.read().splitlines()
    return (nextprot_ids)
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
248 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
249 #return '' if there's no value in a dictionary, avoid error |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def access_dictionary (dico,key1,key2) :
    """Return dico[key1][key2], or '' when either key is missing.

    dico -- dictionary whose values are themselves dictionaries
    key1 -- key looked up in dico
    key2 -- key looked up in dico[key1]

    Never raises for a missing key, and does not insert anything into dico.
    """
    return (dico.get(key1, {}).get(key2, ""))
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
259 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
260 #if there are several nextprot ID for one uniprotID, return the uniprot like ID |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def clean_nextprot_id (next_id,uniprotAc) :
    """Pick a single neXtProt ID out of a ';'-separated list.

    next_id   -- one neXtProt ID, or several joined with ';'
    uniprotAc -- UniProt accession the ID(s) belong to

    A value without ';' is returned unchanged. Otherwise "NX_"+uniprotAc is
    returned when it is one of the listed IDs, else the second listed ID.
    """
    candidates = next_id.split(";")
    if len(candidates) <= 1 :
        return (next_id)
    uniprot_like = "NX_"+uniprotAc
    if uniprot_like in candidates :
        return (uniprot_like)
    return (candidates[1])
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
270 |
15
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
271 |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
272 ####################################################################################################### |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
273 # 4. Build protein interaction maps files |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
274 ####################################################################################################### |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
275 |
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
def _load_reactome_pathways(url, species_name) :
    """Download a Reactome mapping file and group pathway descriptions by identifier.

    url          -- address of a tab-separated Reactome mapping file
    species_name -- only rows whose column 5 equals this value are kept

    Returns a dictionary {column 0 value : [column 3 values, in file order]}.
    Raises requests.HTTPError when the server answers with an error status.
    """
    identifier_index=0
    pathway_description_index=3
    species_index=5

    download = requests.get(url)
    download.raise_for_status()
    decoded_content = download.content.decode('utf-8')

    dico_nodes = {}
    for line in csv.reader(decoded_content.splitlines(), delimiter='\t') :
        # blank or truncated rows cannot carry a species column, skip them
        if len(line) <= species_index : continue
        if line[species_index]==species_name :
            dico_nodes.setdefault(line[identifier_index],[]).append(line[pathway_description_index])
    return dico_nodes


def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
    """Build the protein-interaction reference dictionary and register it.

    data_manager_dict -- dictionary handed to _add_data_table_entry
    species           -- "human", "mouse" or "rat"
    interactome       -- "biogrid" or "bioplex"
    target_directory  -- directory in which the resulting json file is written

    The result is dumped as json with the keys 'network' and 'nodes'
    (plus 'convert' for bioplex) and a data table entry is added to
    "proteore_<interactome>_dictionaries".

    For biogrid, "BioGRID.zip" and "tmp_BioGRID" are created in the current
    working directory and removed again, also when processing fails.

    Raises ValueError for an unknown interactome, KeyError for an unknown
    species and requests.HTTPError when a download answers with an error status.
    """

    species_dict={'human':'Homo sapiens',"mouse":"Mus musculus","rat":"Rattus norvegicus"}

    # fail before any download instead of reaching the output step without a dictionary
    if interactome not in ("biogrid","bioplex") :
        raise ValueError("unknown interactome '"+str(interactome)+"', expected 'biogrid' or 'bioplex'")
    species_name = species_dict[species]

    ##BioGRID
    if interactome=="biogrid":

        tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

        #download zip file
        r = requests.get(tab2_link)
        r.raise_for_status()

        try :
            with open("BioGRID.zip", "wb") as code:
                code.write(r.content)

            #unzip files
            with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
                if not os.path.exists("tmp_BioGRID"): os.makedirs("tmp_BioGRID")
                zip_ref.extractall("tmp_BioGRID")

            #import file of interest and build dictionary
            file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_name.replace(" ","_")+"-3.5.167.tab2.txt"
            with open(file_path,"r") as handle :
                tab_file = csv.reader(handle,delimiter="\t")
                dico_network = {}
                GeneID_index=1
                network_cols=[1,2,7,8,11,12,18,20]
                for line in tab_file :
                    # NOTE(review): a later row with the same column 1 value replaces the
                    # earlier one, so one row is kept per key - confirm this is intended
                    dico_network[line[GeneID_index]]=[line[i] for i in network_cols]
        finally :
            #delete zip file and tmp_BioGRID directory
            if os.path.exists("BioGRID.zip") : os.remove("BioGRID.zip")
            shutil.rmtree("tmp_BioGRID", ignore_errors=True)

        #download NCBI2Reactome.txt file and build dictionary
        dico_nodes = _load_reactome_pathways('https://www.reactome.org/download/current/NCBI2Reactome.txt', species_name)

        dico={}
        dico['network']=dico_network
        dico['nodes']=dico_nodes

    ##Bioplex
    else :

        download = requests.get("http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv")
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        bioplex = csv.reader(decoded_content.splitlines(), delimiter='\t')
        dico_network = {}
        dico_network["GeneID"]={}
        network_geneid_cols=[0,1,4,5,8]
        dico_network["UniProt-AC"]={}
        network_uniprot_cols=[2,3,4,5,8]
        dico_GeneID_to_UniProt = {}
        for line in bioplex :
            # column 0 is used as key of the GeneID network, column 2 as key of the UniProt-AC one
            dico_network["GeneID"][line[0]]=[line[i] for i in network_geneid_cols]
            dico_network["UniProt-AC"][line[2]]=[line[i] for i in network_uniprot_cols]
            dico_GeneID_to_UniProt[line[0]]=line[2]

        #download UniProt2Reactome.txt file and build dictionary
        dico_nodes = _load_reactome_pathways("https://reactome.org/download/current/UniProt2Reactome.txt", species_name)

        dico={}
        dico['network']=dico_network
        dico['nodes']=dico_nodes
        dico['convert']=dico_GeneID_to_UniProt

    #writing output
    output_file = species+'_'+interactome+'_dict_'+ time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory,output_file)
    name = species+" ("+species_name+") "+time.strftime("%d/%m/%Y")
    # NOTE(review): no separator between species and date, kept as is because
    # the value is used as the data table entry id
    entry_id = interactome+"_"+species+ time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=entry_id, name = name, value = species, path = path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
83f57ba70416
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
dchristiany
parents:
13
diff
changeset
|
376 |
0
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
377 |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
378 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
379 # Main function |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
380 ####################################################################################################### |
2de84fea8367
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
dchristiany
parents:
diff
changeset
|
def main():
    """Build the requested reference files and write the data manager JSON.

    Command-line options (all optional except that ``--output`` must point
    to a readable JSON parameter file):

    --hpa           comma-separated HPA options; each one is passed to
                    ``HPA_sources``.
    --peptideatlas  comma-separated sample category ids; each one is passed
                    to ``peptide_atlas_sources``.
    --id_mapping    comma-separated species; each one is passed to
                    ``id_mapping_sources``.
    --interactome   PPI source, passed together with ``--species`` to
                    ``PPI_ref_files`` (only when both are given).
    --species       species used for the interactome reference files.
    -o / --output   path of the JSON file that is first read for its
                    ``output_data[0]['extra_files_path']`` entry and then
                    overwritten with the serialized ``data_manager_dict``.

    Raises:
        IOError/OSError: if the parameter file cannot be read or the target
            directory cannot be created.
        KeyError/IndexError: if the parameter file lacks
            ``output_data[0]['extra_files_path']``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar="HPA_OPTION")
    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
    parser.add_argument("--interactome", metavar="PPI")
    parser.add_argument("--species")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    # Filled in by the source-building helpers below.
    # NOTE(review): presumably each helper records the files it produced in
    # this dict -- confirm against the helper definitions.
    data_manager_dict = {}

    # Extract json file params.
    # The stdlib json module (already imported at the top of the file) is
    # used directly, inside context managers so the handles are closed.
    # NOTE(review): galaxy.util.json's from_json_string/to_json_string are
    # believed to be plain aliases of json.loads/json.dumps -- confirm for
    # the targeted Galaxy release.
    with open(args.output) as params_file:
        params = json.load(params_file)
    target_directory = params['output_data'][0]['extra_files_path']

    # Tolerate a pre-existing directory (e.g. a re-run) and missing parents;
    # a bare os.mkdir would raise in both situations.
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    # argparse sets every option that was not supplied to None, so a plain
    # None check is all that is needed to know whether a resource was asked.

    ## Download source files from HPA
    if args.hpa is not None:
        for hpa_tissue in args.hpa.split(","):
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from Peptide Atlas query
    if args.peptideatlas is not None:
        for pa_tissue in args.peptideatlas.split(","):
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download ID_mapping source file from Uniprot
    if args.id_mapping is not None:
        for mapping_species in args.id_mapping.split(","):
            id_mapping_sources(data_manager_dict, mapping_species, target_directory)

    ## Download PPI ref files from biogrid/bioplex/humap
    if args.interactome is not None and args.species is not None:
        PPI_ref_files(data_manager_dict, args.species, args.interactome, target_directory)

    # Save info to json file. Text mode is required: the serialized JSON is
    # a str, which cannot be written to a file opened with 'wb' on Python 3.
    with open(args.output, 'w') as output_file:
        json.dump(data_manager_dict, output_file)


if __name__ == "__main__":
    main()