Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 30:b8271b9a1049 draft
planemo upload commit c89c5deac442c0c2aa52b24f2c5af4b290773fc0-dirty
author | dchristiany |
---|---|
date | Mon, 28 Jan 2019 08:14:27 -0500 |
parents | 871a7347ca24 |
children | 76924e57a230 |
comparison
equal
deleted
inserted
replaced
29:871a7347ca24 | 30:b8271b9a1049 |
---|---|
65 # Extract sample_category_id and output filename | 65 # Extract sample_category_id and output filename |
66 tissue=tissue.split(".") | 66 tissue=tissue.split(".") |
67 sample_category_id = tissue[0] | 67 sample_category_id = tissue[0] |
68 name = tissue[1] | 68 name = tissue[1] |
69 output_file = name+"_"+time.strftime("%d-%m-%Y") + ".tsv" | 69 output_file = name+"_"+time.strftime("%d-%m-%Y") + ".tsv" |
70 query="https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="+sample_category_id+ \ | 70 |
71 "&display_options=ShowAbundances&organism_id="+organism_id+"&redundancy_constraint=4&presence_level_constraint=1%2C2"+ \ | 71 query="https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="+ \ |
72 "&gene_annotation_level_constraint=leaf&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY" | 72 sample_category_id+"&display_options=ShowAbundances&organism_id="+organism_id+ \ |
73 download = requests.get(query) | 73 "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf\ |
74 decoded_content = download.content.decode('utf-8') | 74 &QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY" |
75 cr = csv.reader(decoded_content.splitlines(), delimiter='\t') | 75 |
76 | 76 print (query) |
77 #build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value | 77 |
78 with requests.Session() as s: | |
79 download = s.get(query) | |
80 decoded_content = download.content.decode('utf-8') | |
81 cr = csv.reader(decoded_content.splitlines(), delimiter='\t') | |
82 #cr = list(cr) | |
83 | |
78 uni_dict = build_dictionary(cr) | 84 uni_dict = build_dictionary(cr) |
79 | 85 |
80 #columns of data table peptide_atlas | 86 #columns of data table peptide_atlas |
81 date = time.strftime("%d-%m-%Y") | 87 date = time.strftime("%d-%m-%Y") |
82 tissue = tissue.split("-")[1] | 88 tissue_id = name+"_"+date |
83 tissue_id = tissue+"_"+date | |
84 tissue_name = tissue_id.replace("-","/").replace("_"," ") | 89 tissue_name = tissue_id.replace("-","/").replace("_"," ") |
85 path = os.path.join(target_directory,output_file) | 90 path = os.path.join(output_file) |
86 | 91 |
87 with open(path,"wb") as out : | 92 with open(path,"w") as out : |
88 w = csv.writer(out,delimiter='\t') | 93 w = csv.writer(out,delimiter='\t') |
89 w.writerow(["Uniprot_AC","nb_obs"]) | 94 w.writerow(["Uniprot_AC","nb_obs"]) |
90 w.writerows(uni_dict.items()) | 95 w.writerows(uni_dict.items()) |
91 | 96 |
92 data_table_entry = dict(id=tissue_id, name=tissue_name, value = path, tissue = tissue) | 97 data_table_entry = dict(id=tissue_id, name=tissue_name, value = path, tissue = tissue) |
94 | 99 |
95 #function to count the number of observations by uniprot id | 100 #function to count the number of observations by uniprot id |
96 def build_dictionary (csv) : | 101 def build_dictionary (csv) : |
97 uni_dict = {} | 102 uni_dict = {} |
98 for line in csv : | 103 for line in csv : |
99 if "-" not in line[2] and check_uniprot_access(line[2]) : | 104 if "-" not in line[0] and check_uniprot_access(line[0]) : |
100 if line[2] in uni_dict : | 105 if line[0] in uni_dict : |
101 uni_dict[line[2]] += int(line[4]) | 106 uni_dict[line[0]] += int(line[5]) |
102 else : | 107 else : |
103 uni_dict[line[2]] = int(line[4]) | 108 uni_dict[line[0]] = int(line[5]) |
104 | 109 |
105 return uni_dict | 110 return uni_dict |
106 | 111 |
107 #function to check if an id is an uniprot accession number : return True or False- | 112 #function to check if an id is an uniprot accession number : return True or False- |
108 def check_uniprot_access (id) : | 113 def check_uniprot_access (id) : |
109 uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | 114 uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") |
110 if uniprot_pattern.match(id) : | 115 if uniprot_pattern.match(id) : |
111 return True | 116 return True |
112 else : | 117 else : |
113 return False | 118 return False |
114 | |
115 | |
116 | 119 |
117 ####################################################################################################### | 120 ####################################################################################################### |
118 # 3. ID mapping file | 121 # 3. ID mapping file |
119 ####################################################################################################### | 122 ####################################################################################################### |
120 import ftplib, gzip | 123 import ftplib, gzip |