comparison: data_manager/resource_building.py @ 0:a4d442fbd939 (draft, default, tip)

planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty

| author | dchristiany |
|---|---|
| date | Fri, 19 Oct 2018 03:25:15 -0400 |
| parents | (none) |
| children | (none) |
Comparing -1:000000000000 (the null revision) with 0:a4d442fbd939, i.e. the file is newly added in this changeset.
```python
"""
The purpose of this script is to create source files from different databases to be used in other tools.
"""

import argparse
import csv
import os
import re
import sys
import time
from io import BytesIO
from zipfile import ZipFile

import requests
from galaxy.util.json import from_json_string, to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from a url and write its first member to output_file.
    """
    content = requests.get(url)
    zipfile = ZipFile(BytesIO(content.content))
    output_content = zipfile.open(zipfile.namelist()[0]).read()
    with open(output_file, "w") as output:
        output.write(output_content)

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict
```
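`_add_data_table_entry` accumulates every entry under a single `data_tables` key, which is what `main()` serializes back to Galaxy at the end. After one HPA download the dictionary would look roughly like this (values are illustrative, not actual output):

```python
# Illustrative shape only -- the real values depend on the tissue and the download date:
{
    "data_tables": {
        "proteinatlas": [
            {"value": "HPA_normal_tissue",
             "name": "HPA normal tissue 19/10/2018",
             "path": "/some/target_directory/HPA_normal_tissue_19-10-2018.tsv"}
        ]
    }
}
```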
```python
#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
    else:
        raise ValueError("Unknown HPA option: " + tissue)
    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    data_table_entry = dict(value=tissue, name=tissue_name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteinatlas")
```
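A minimal sketch of calling this function outside Galaxy, assuming an existing, writable scratch directory:

```python
# Minimal sketch -- "/tmp/hpa" is an assumed, pre-existing directory:
data_manager_dict = {}
HPA_sources(data_manager_dict, "HPA_normal_tissue", "/tmp/hpa")
# data_manager_dict now holds one "proteinatlas" entry pointing at the dated tsv file.
```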
```python
#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    # PeptideAtlas Human build number (here: early 2018)
    atlas_build_id = "472"
    # organism_id (here: Human) - to be updated when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and the output filename
    sample_category_id = tissue.split("-")[0]
    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
            organism_id + "&sample_category_id=" + sample_category_id + \
            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
    download = requests.get(query)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    # Build a dictionary keeping only the UniProt accession (not the isoform) as key
    # and the sum of observations as value
    uni_dict = build_dictionary(cr)

    tissue_id = "_".join([atlas_build_id, organism_id, sample_category_id, time.strftime("%d-%m-%Y")])
    tissue_value = tissue.split("-")[1]
    tissue = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y")
    tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y")
    path = os.path.join(target_directory, output_file)

    with open(path, "wb") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(value=path, name=tissue_name, tissue=tissue)
    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")

# Count the number of observations per UniProt accession
def build_dictionary(csv_reader):
    uni_dict = {}
    for line in csv_reader:
        if "-" not in line[2] and check_uniprot_access(line[2]):
            if line[2] in uni_dict:
                uni_dict[line[2]] += int(line[4])
            else:
                uni_dict[line[2]] = int(line[4])

    return uni_dict

# Check whether an id is a UniProt accession number: return True or False
def check_uniprot_access(identifier):
    uniprot_pattern = re.compile(r"(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})$")
    if uniprot_pattern.match(identifier):
        return True
    else:
        return False
```
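A quick illustration of the accession check (the input strings are hypothetical):

```python
# Hypothetical inputs illustrating the anchored pattern:
check_uniprot_access("P12345")    # True  - canonical accession
check_uniprot_access("P12345-2")  # False - isoform suffix (also filtered out in build_dictionary)
check_uniprot_access("FOO123")    # False - not shaped like a UniProt accession
```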
```python
#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize)  # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "human"
    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    # Header of the final tab
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene",
                "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene",
                "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    # Read idmapping_selected.tab and keep only the ids of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    # Supplementary ids to get from <species>_idmapping.dat: neXtProt, BioGrid, STRING, KEGG
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']  # ids to get from the dat file
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    # Keep only the ids of interest, in a dictionary of sub-dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProt accession, used as key
            id_type = line[1]    # id type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in id_type:  # skip isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        # there is already a value in the dictionary: append to it
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    # Add the ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    # Add missing neXtProt ids for human
    if human:
        # Build next_dict from the neXtProt accession list
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        # Fill in the missing neXtProt ids
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
    name = name_dict[species] + " (" + time.strftime("%d-%m-%Y") + ")"

    data_table_entry = dict(value=species + "_id_mapping_" + time.strftime("%d-%m-%Y"), name=name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping_tab")

def download_from_uniprot_ftp(file, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as out:
        ftp.retrbinary("RETR " + file, out.write)
    ftp.quit()
    return path

def id_list_from_nextprot_ftp(file, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as out:
        ftp.retrbinary("RETR " + file, out.write)
    ftp.quit()
    with open(path, 'r') as nextprot_file:
        nextprot_ids = nextprot_file.read().splitlines()
    return nextprot_ids
```
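A minimal sketch of using the UniProt helper on its own, again assuming an existing, writable scratch directory:

```python
# Minimal sketch -- "/tmp/idmapping" is an assumed, pre-existing directory:
tab_path = download_from_uniprot_ftp("HUMAN_9606_idmapping_selected.tab.gz", "/tmp/idmapping")
# tab_path == "/tmp/idmapping/HUMAN_9606_idmapping_selected.tab.gz"
```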
```python
# Return '' if there is no value in a dictionary, avoiding a KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return dico[key1][key2]
        else:
            return ''
    else:
        return ''

# If there are several neXtProt ids for one UniProt accession, return the uniprot-like id
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return "NX_" + uniprotAc
        else:
            return tmp[1]
    else:
        return next_id
```
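To illustrate the disambiguation rule (the accession values are hypothetical):

```python
# Hypothetical values illustrating clean_nextprot_id:
clean_nextprot_id("NX_A0A024R6I7;NX_P02768", "P02768")  # -> "NX_P02768" (id matching the accession wins)
clean_nextprot_id("NX_Q9Y6X9;NX_Q9Y6Y0", "P12345")      # -> "NX_Q9Y6Y0" (no match: falls back to the second id)
clean_nextprot_id("NX_P02768", "P02768")                # -> "NX_P02768" (single id, returned unchanged)
```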
```python
#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar="HPA_OPTION")
    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract the params from the json file Galaxy passes as output
    filename = args.output
    params = from_json_string(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    hpa = args.hpa
    if hpa is not None:
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download the source file from a PeptideAtlas query
    peptide_atlas = args.peptideatlas
    if peptide_atlas is not None:
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download the id mapping source files from UniProt
    id_mapping = args.id_mapping
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    # Save the data table info back to the json file
    filename = args.output
    with open(filename, 'w') as out:
        out.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
```
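In Galaxy, the data manager framework invokes this script and supplies the `-o` json file itself. Outside that context, an invocation would look like `python resource_building.py --hpa HPA_normal_tissue,HPA_pathology --id_mapping human -o output.json`, where `output.json` must already contain the `output_data` / `extra_files_path` parameters that `main()` reads before the file is overwritten with the data-table entries.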
