Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 10:2f153b41b6fe draft
planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
author | dchristiany |
---|---|
date | Tue, 23 Oct 2018 07:43:37 -0400 |
parents | d16a52bf0e5b |
children | 60cb0a5ae661 |
comparison legend: equal | deleted | inserted | replaced
9:6c47b77d89d6 | 10:2f153b41b6fe |
---|---|
1 """ | 1 """ |
2 The purpose of this script is to create source files from different databases to be used in other tools | 2 The purpose of this script is to create source files from different databases to be used in other proteore tools |
3 """ | 3 """ |
4 | 4 |
5 import os, sys, argparse, requests, time, csv, re | 5 import os, sys, argparse, requests, time, csv, re |
6 from io import BytesIO | 6 from io import BytesIO |
7 from zipfile import ZipFile | 7 from zipfile import ZipFile |
42 tissue_name = "HPA pathology" | 42 tissue_name = "HPA pathology" |
43 url = "https://www.proteinatlas.org/download/pathology.tsv.zip" | 43 url = "https://www.proteinatlas.org/download/pathology.tsv.zip" |
44 elif tissue == "HPA_full_atlas": | 44 elif tissue == "HPA_full_atlas": |
45 tissue_name = "HPA full atlas" | 45 tissue_name = "HPA full atlas" |
46 url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip" | 46 url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip" |
47 | |
47 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv" | 48 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv" |
48 path = os.path.join(target_directory, output_file) | 49 path = os.path.join(target_directory, output_file) |
49 unzip(url, path) | 50 unzip(url, path) #download and save file |
50 print(str(os.path.isfile(path))) | |
51 tmp=open(path,"r").readlines() | |
52 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") | 51 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") |
53 data_table_entry = dict(value = tissue, name = tissue_name, path = path) | 52 tissue_id = tissue_name.replace(" ","_").replace("/","-") |
53 | |
54 data_table_entry = dict(id=tissue_id, name = tissue_name, value = tissue, path = path) | |
54 _add_data_table_entry(data_manager_dict, data_table_entry, "protein_atlas") | 55 _add_data_table_entry(data_manager_dict, data_table_entry, "protein_atlas") |
55 | 56 |
56 | 57 |
57 ####################################################################################################### | 58 ####################################################################################################### |
58 # 2. Peptide Atlas | 59 # 2. Peptide Atlas |
74 cr = csv.reader(decoded_content.splitlines(), delimiter='\t') | 75 cr = csv.reader(decoded_content.splitlines(), delimiter='\t') |
75 | 76 |
76 #build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value | 77 #build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value |
77 uni_dict = build_dictionary(cr) | 78 uni_dict = build_dictionary(cr) |
78 | 79 |
79 tissue_id = "_".join([atlas_build_id, organism_id, sample_category_id,time.strftime("%d-%m-%Y")]) | 80 #columns of data table peptide_atlas |
80 tissue_value = tissue.split("-")[1] | 81 date = time.strftime("%d-%m-%Y") |
81 tissue = tissue.split("-")[1] + "_" +time.strftime("%d-%m-%Y") | 82 tissue = tissue.split("-")[1] |
82 tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y") | 83 tissue_id = tissue+"_"+date |
84 tissue_name = tissue_id.replace("-","/").replace("_"," ") | |
83 path = os.path.join(target_directory,output_file) | 85 path = os.path.join(target_directory,output_file) |
84 | 86 |
85 with open(path,"wb") as out : | 87 with open(path,"wb") as out : |
86 w = csv.writer(out,delimiter='\t') | 88 w = csv.writer(out,delimiter='\t') |
87 w.writerow(["Uniprot_AC","nb_obs"]) | 89 w.writerow(["Uniprot_AC","nb_obs"]) |
88 w.writerows(uni_dict.items()) | 90 w.writerows(uni_dict.items()) |
89 | 91 |
90 data_table_entry = dict(value = path, name = tissue_name, tissue = tissue) | 92 data_table_entry = dict(id=tissue_id, name=tissue_name, value = path, tissue = tissue) |
91 _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas") | 93 _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas") |
92 | 94 |
93 #function to count the number of observations by uniprot id | 95 #function to count the number of observations by uniprot id |
94 def build_dictionary (csv) : | 96 def build_dictionary (csv) : |
95 uni_dict = {} | 97 uni_dict = {} |
213 with open(path,"w") as out : | 215 with open(path,"w") as out : |
214 w = csv.writer(out,delimiter='\t') | 216 w = csv.writer(out,delimiter='\t') |
215 w.writerows(tab) | 217 w.writerows(tab) |
216 | 218 |
217 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} | 219 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} |
218 name = name_dict[species]+" ("+time.strftime("%d-%m-%Y")+")" | 220 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") |
219 | 221 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
220 data_table_entry = dict(value = species+"_id_mapping_"+ time.strftime("%d-%m-%Y"), name = name, path = path) | 222 |
223 data_table_entry = dict(id=id, name = name, value = species, path = path) | |
221 _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping") | 224 _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping") |
222 | 225 |
223 def download_from_uniprot_ftp(file,target_directory) : | 226 def download_from_uniprot_ftp(file,target_directory) : |
224 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" | 227 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" |
225 path = os.path.join(target_directory, file) | 228 path = os.path.join(target_directory, file) |