comparison data_manager/resource_building.py @ 10:2f153b41b6fe draft

planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
author dchristiany
date Tue, 23 Oct 2018 07:43:37 -0400
parents d16a52bf0e5b
children 60cb0a5ae661
comparison
equal deleted inserted replaced
9:6c47b77d89d6 10:2f153b41b6fe
1 """ 1 """
2 The purpose of this script is to create source files from different databases to be used in other tools 2 The purpose of this script is to create source files from different databases to be used in other proteore tools
3 """ 3 """
4 4
5 import os, sys, argparse, requests, time, csv, re 5 import os, sys, argparse, requests, time, csv, re
6 from io import BytesIO 6 from io import BytesIO
7 from zipfile import ZipFile 7 from zipfile import ZipFile
42 tissue_name = "HPA pathology" 42 tissue_name = "HPA pathology"
43 url = "https://www.proteinatlas.org/download/pathology.tsv.zip" 43 url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
44 elif tissue == "HPA_full_atlas": 44 elif tissue == "HPA_full_atlas":
45 tissue_name = "HPA full atlas" 45 tissue_name = "HPA full atlas"
46 url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip" 46 url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
47
47 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv" 48 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv"
48 path = os.path.join(target_directory, output_file) 49 path = os.path.join(target_directory, output_file)
49 unzip(url, path) 50 unzip(url, path) #download and save file
50 print(str(os.path.isfile(path)))
51 tmp=open(path,"r").readlines()
52 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") 51 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
53 data_table_entry = dict(value = tissue, name = tissue_name, path = path) 52 tissue_id = tissue_name.replace(" ","_").replace("/","-")
53
54 data_table_entry = dict(id=tissue_id, name = tissue_name, value = tissue, path = path)
54 _add_data_table_entry(data_manager_dict, data_table_entry, "protein_atlas") 55 _add_data_table_entry(data_manager_dict, data_table_entry, "protein_atlas")
55 56
56 57
57 ####################################################################################################### 58 #######################################################################################################
58 # 2. Peptide Atlas 59 # 2. Peptide Atlas
74 cr = csv.reader(decoded_content.splitlines(), delimiter='\t') 75 cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
75 76
76 #build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value 77 #build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value
77 uni_dict = build_dictionary(cr) 78 uni_dict = build_dictionary(cr)
78 79
79 tissue_id = "_".join([atlas_build_id, organism_id, sample_category_id,time.strftime("%d-%m-%Y")]) 80 #columns of data table peptide_atlas
80 tissue_value = tissue.split("-")[1] 81 date = time.strftime("%d-%m-%Y")
81 tissue = tissue.split("-")[1] + "_" +time.strftime("%d-%m-%Y") 82 tissue = tissue.split("-")[1]
82 tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y") 83 tissue_id = tissue+"_"+date
84 tissue_name = tissue_id.replace("-","/").replace("_"," ")
83 path = os.path.join(target_directory,output_file) 85 path = os.path.join(target_directory,output_file)
84 86
85 with open(path,"wb") as out : 87 with open(path,"wb") as out :
86 w = csv.writer(out,delimiter='\t') 88 w = csv.writer(out,delimiter='\t')
87 w.writerow(["Uniprot_AC","nb_obs"]) 89 w.writerow(["Uniprot_AC","nb_obs"])
88 w.writerows(uni_dict.items()) 90 w.writerows(uni_dict.items())
89 91
90 data_table_entry = dict(value = path, name = tissue_name, tissue = tissue) 92 data_table_entry = dict(id=tissue_id, name=tissue_name, value = path, tissue = tissue)
91 _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas") 93 _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")
92 94
93 #function to count the number of observations by uniprot id 95 #function to count the number of observations by uniprot id
94 def build_dictionary (csv) : 96 def build_dictionary (csv) :
95 uni_dict = {} 97 uni_dict = {}
213 with open(path,"w") as out : 215 with open(path,"w") as out :
214 w = csv.writer(out,delimiter='\t') 216 w = csv.writer(out,delimiter='\t')
215 w.writerows(tab) 217 w.writerows(tab)
216 218
217 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} 219 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
218 name = name_dict[species]+" ("+time.strftime("%d-%m-%Y")+")" 220 name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
219 221 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
220 data_table_entry = dict(value = species+"_id_mapping_"+ time.strftime("%d-%m-%Y"), name = name, path = path) 222
223 data_table_entry = dict(id=id, name = name, value = species, path = path)
221 _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping") 224 _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping")
222 225
223 def download_from_uniprot_ftp(file,target_directory) : 226 def download_from_uniprot_ftp(file,target_directory) :
224 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" 227 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
225 path = os.path.join(target_directory, file) 228 path = os.path.join(target_directory, file)