comparison data_manager/resource_building.py @ 30:b8271b9a1049 draft

planemo upload commit c89c5deac442c0c2aa52b24f2c5af4b290773fc0-dirty
author dchristiany
date Mon, 28 Jan 2019 08:14:27 -0500
parents 871a7347ca24
children 76924e57a230
comparison
equal deleted inserted replaced
29:871a7347ca24 30:b8271b9a1049
65 # Extract sample_category_id and output filename 65 # Extract sample_category_id and output filename
66 tissue=tissue.split(".") 66 tissue=tissue.split(".")
67 sample_category_id = tissue[0] 67 sample_category_id = tissue[0]
68 name = tissue[1] 68 name = tissue[1]
69 output_file = name+"_"+time.strftime("%d-%m-%Y") + ".tsv" 69 output_file = name+"_"+time.strftime("%d-%m-%Y") + ".tsv"
70 query="https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="+sample_category_id+ \ 70
71 "&display_options=ShowAbundances&organism_id="+organism_id+"&redundancy_constraint=4&presence_level_constraint=1%2C2"+ \ 71 query="https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="+ \
72 "&gene_annotation_level_constraint=leaf&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY" 72 sample_category_id+"&display_options=ShowAbundances&organism_id="+organism_id+ \
73 download = requests.get(query) 73 "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf\
74 decoded_content = download.content.decode('utf-8') 74 &QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"
75 cr = csv.reader(decoded_content.splitlines(), delimiter='\t') 75
76 76 print (query)
77 #build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value 77
78 with requests.Session() as s:
79 download = s.get(query)
80 decoded_content = download.content.decode('utf-8')
81 cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
82 #cr = list(cr)
83
78 uni_dict = build_dictionary(cr) 84 uni_dict = build_dictionary(cr)
79 85
80 #columns of data table peptide_atlas 86 #columns of data table peptide_atlas
81 date = time.strftime("%d-%m-%Y") 87 date = time.strftime("%d-%m-%Y")
82 tissue = tissue.split("-")[1] 88 tissue_id = name+"_"+date
83 tissue_id = tissue+"_"+date
84 tissue_name = tissue_id.replace("-","/").replace("_"," ") 89 tissue_name = tissue_id.replace("-","/").replace("_"," ")
85 path = os.path.join(target_directory,output_file) 90 path = os.path.join(output_file)
86 91
87 with open(path,"wb") as out : 92 with open(path,"w") as out :
88 w = csv.writer(out,delimiter='\t') 93 w = csv.writer(out,delimiter='\t')
89 w.writerow(["Uniprot_AC","nb_obs"]) 94 w.writerow(["Uniprot_AC","nb_obs"])
90 w.writerows(uni_dict.items()) 95 w.writerows(uni_dict.items())
91 96
92 data_table_entry = dict(id=tissue_id, name=tissue_name, value = path, tissue = tissue) 97 data_table_entry = dict(id=tissue_id, name=tissue_name, value = path, tissue = tissue)
94 99
95 #function to count the number of observations by uniprot id 100 #function to count the number of observations by uniprot id
96 def build_dictionary (csv) : 101 def build_dictionary (csv) :
97 uni_dict = {} 102 uni_dict = {}
98 for line in csv : 103 for line in csv :
99 if "-" not in line[2] and check_uniprot_access(line[2]) : 104 if "-" not in line[0] and check_uniprot_access(line[0]) :
100 if line[2] in uni_dict : 105 if line[0] in uni_dict :
101 uni_dict[line[2]] += int(line[4]) 106 uni_dict[line[0]] += int(line[5])
102 else : 107 else :
103 uni_dict[line[2]] = int(line[4]) 108 uni_dict[line[0]] = int(line[5])
104 109
105 return uni_dict 110 return uni_dict
106 111
107 #function to check if an id is an uniprot accession number : return True or False- 112 #function to check if an id is an uniprot accession number : return True or False-
108 def check_uniprot_access (id) : 113 def check_uniprot_access (id) :
109 uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") 114 uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
110 if uniprot_pattern.match(id) : 115 if uniprot_pattern.match(id) :
111 return True 116 return True
112 else : 117 else :
113 return False 118 return False
114
115
116 119
117 ####################################################################################################### 120 #######################################################################################################
118 # 3. ID mapping file 121 # 3. ID mapping file
119 ####################################################################################################### 122 #######################################################################################################
120 import ftplib, gzip 123 import ftplib, gzip