Mercurial > repos > melissacline > ucsc_xena_platform
comparison xena_query.py @ 0:8bb037f88ed2
Uploaded
author | melissacline |
---|---|
date | Tue, 13 Jan 2015 23:37:23 -0500 |
parents | |
children | 02b0824c7d60 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8bb037f88ed2 |
---|---|
1 """ | |
2 Utilities for xena queries. | |
3 | |
4 A basic query example. | |
5 Queries are scheme expressions. | |
6 | |
7 >>> import xena_query as xena | |
8 >>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", "(+ 1 2)") | |
9 '3.0' | |
10 | |
11 >>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", "(let [x 2 y (+ x 3)] (* x y))") | |
12 '10.0' | |
13 | |
14 Looking up sample ids for the TCGA LGG cohort. | |
15 | |
16 >>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", | |
17 xena.patient_to_sample_query("TCGA.LGG.sampleMap", | |
18 ["TCGA-CS-4938", | |
19 "TCGA-HT-7693", | |
20 "TCGA-CS-6665", | |
21 "TCGA-S9-A7J2", | |
22 "TCGA-FG-A6J3"])) | |
23 '{"TCGA.LGG.sampleMap":["TCGA-CS-4938-01","TCGA-CS-6665-01","TCGA-FG-A6J3-01","TCGA-HT-7693-01","TCGA-S9-A7J2-01"]}' | |
24 | |
25 >>> r = xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", | |
26 xena.find_sample_by_field_query("TCGA.LGG.sampleMap", | |
27 "_PATIENT", | |
28 ["TCGA-CS-4938", | |
29 "TCGA-HT-7693", | |
30 "TCGA-CS-6665", | |
31 "TCGA-S9-A7J2", | |
32 "TCGA-FG-A6J3"])) | |
33 '{"TCGA.LGG.sampleMap":["TCGA-CS-4938-01","TCGA-CS-6665-01","TCGA-FG-A6J3-01","TCGA-HT-7693-01","TCGA-S9-A7J2-01"]}' | |
34 >>> import json | |
35 >>> json.loads(r) | |
36 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']} | |
37 """ | |
38 | |
39 import urllib2 | |
40 import re | |
41 | |
def compose1(f, g):
    """Return the composition of two callables.

    The returned function forwards all of its positional and keyword
    arguments to ``g`` and passes ``g``'s result to ``f``.
    """
    def composed(*args, **kwargs):
        return f(g(*args, **kwargs))
    return composed


def compose(*funcs):
    """Functional composition of any number of callables.

    ``compose(f, g)(a, ...) == f(g(a, ...))``; functions are applied from
    right to left.  With a single function the function itself is returned,
    and with no functions an identity function of one argument is returned.
    """
    # ``reduce`` is a builtin only on Python 2; functools provides it on
    # Python 2.6+ and Python 3 alike.
    from functools import reduce
    if not funcs:
        return lambda x: x
    return reduce(compose1, funcs)
50 | |
def quote(s):
    """Return ``s`` wrapped in double quotes for embedding in a xena query.

    Backslashes and double quotes inside ``s`` are backslash-escaped so the
    value cannot terminate the string literal early and change the
    surrounding query.
    """
    # Escape backslashes first so the backslashes added for the quotes are
    # not doubled afterwards.
    return '"' + s.replace('\\', '\\\\').replace('"', '\\"') + '"'
53 | |
def array_fmt(l):
    """Format the strings in ``l`` as a bracketed list of quoted items.

    Each item is passed through ``quote`` and the results are joined with
    ', ' between '[' and ']'.
    """
    quoted_items = [quote(item) for item in l]
    return '[' + ', '.join(quoted_items) + ']'
56 | |
57 # The strategy here is | |
58 # o Do table scan on code to find codes matching field values | |
59 # o Do IN query on unpack(field, x) to find rows matching codes | |
60 # o Project to unpack(sample, x) to get sampleID code | |
61 # o Join with code to get sampleID values | |
62 # | |
63 # Note the :limit on the table scan. This makes the table scan exit after we've | |
64 # found enough values, rather than continuing to the end. We can do this because | |
65 # enumerated values are unique. An alternative would be to index all the enumerated | |
66 # values in the db. | |
# Query template used by find_sample_by_field_query.  It has three %s
# placeholders, in order: the cohort, the field name, and the list of
# field values.  The caller fills them with quote(cohort), quote(field)
# and array_fmt(values).
67 sample_query_str = """ | |
68 (let [cohort %s | |
69 field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]] | |
70 :from [:dataset] | |
71 :join [:field [:= :dataset_id :dataset.id]] | |
72 :where [:and [:= :cohort cohort] | |
73 [:= :field.name %s]]})) | |
74 values %s | |
75 field_id (:field_id field_id-dataset) | |
76 dataset (:dataset field_id-dataset) | |
77 sample (:id (car (query {:select [:field.id] | |
78 :from [:field] | |
79 :where [:and [:= :dataset_id dataset] | |
80 [:= :field.name "sampleID"]]}))) | |
81 N (- (:rows (car (query {:select [:rows] | |
82 :from [:dataset] | |
83 :where [:= :id dataset]}))) 1)] | |
84 {cohort (map :value (query {:select [:value] | |
85 :from [{:select [:x #sql/call [:unpack field_id, :x]] | |
86 :from [#sql/call [:system_range 0 N]] | |
87 :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering] | |
88 :from [:code] | |
89 :where [:and [:= :field_id field_id] | |
90 [:in :value values]] | |
91 :limit (count values)}]}] | |
92 :join [:code [:and [:= :field_id sample] | |
93 [:= :ordering #sql/call [:unpack sample :x]]]]}))}) | |
94 """ | |
95 | |
# Query selecting the distinct, non-nil cohort values of the dataset table.
# It takes no placeholders and is returned as-is by find_cohorts().
96 cohort_query_str = """ | |
97 (map :cohort (query {:select [:%distinct.cohort] | |
98 :from [:dataset] | |
99 :where [:not [:is nil :cohort]]})) | |
100 """ | |
101 | |
# Query template selecting the :text column of every dataset in a cohort.
# The single %s placeholder is the (already quoted) cohort name.
# The expression opens two forms, (map ... (query ...)), so it must end
# with two closing parentheses.
datasets_list_in_cohort_query = """
(map :text (query {:select [:text]
                   :from [:dataset]
                   :where [:= :cohort %s ]}))
"""
107 | |
# Query template selecting dataset names by exact :type and a :like match
# on :name.  It has two %s placeholders, in order: the type and the name
# pattern; find_datasets_type_pattern fills both with quoted strings.
108 datasets_type_pattern_str = """ | |
109 (map :name (query {:select [:name] | |
110 :from [:dataset] | |
111 :where [:and [:= :type %s] | |
112 [:like :name %s]]})) | |
113 """ | |
114 | |
def find_sample_by_field_query(cohort, field, values):
    """Build the xena query that looks up sample ids where field is one of values.

    The cohort and field name are quoted, the values are formatted as a
    quoted array, and all three are substituted into ``sample_query_str``.
    The query text is returned; it is not sent anywhere by this function.
    """
    template_args = (quote(cohort), quote(field), array_fmt(values))
    return sample_query_str % template_args
118 | |
def patient_to_sample_query(cohort, patients):
    """Build the xena query that looks up sample ids for the given patients.

    This is ``find_sample_by_field_query`` with the field fixed to
    "_PATIENT".
    """
    patient_field = "_PATIENT"
    return find_sample_by_field_query(cohort, patient_field, patients)
122 | |
# HTTP headers sent with every query: the query is posted as plain text.
headers = {'Content-Type': "text/plain"}


def post(url, query):
    """POST a xena data query to the given url.

    Args:
        url: base url of the xena server; '/data/' is appended to it.
        query: the query text, sent as the body of the request.

    Returns:
        The response body exactly as read from the server.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` when the
            request fails.
    """
    req = urllib2.Request(url + '/data/', query, headers)
    response = urllib2.urlopen(req)
    # Release the connection even when reading the body raises.
    try:
        return response.read()
    finally:
        response.close()
131 | |
def find_cohorts():
    """Return the xena query which lists the distinct cohorts on a server.

    The query text is returned, not executed: pass it to ``post`` together
    with the server url to get the cohorts.  Example of a posted result:
    ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"]
    """
    return cohort_query_str
136 | |
def find_datasets_in_cohort(url, cohort):
    """Return a list of datasets in a specific cohort on server=url.

    Unlike the other helpers, which only build a query string, this one
    posts the query itself.  The response is decoded as a JSON array and
    every element of that array is in turn decoded from JSON text; per the
    original documentation each decoded dataset is a dictionary of the
    data's metadata.
    This should be refactored to be consistent with the other methods.
    """
    # json is not imported at module level, so import it here.
    import json
    raw = post(url, datasets_list_in_cohort_query % quote(cohort))
    return [json.loads(text) for text in json.loads(raw)]
143 | |
def find_datasets_type_pattern(type, pattern):
    """Build the xena query listing dataset names of a type that match a pattern.

    The pattern is matched against the dataset name using sql LIKE
    syntax, so % is the wildcard.  Both arguments are quoted before being
    substituted into ``datasets_type_pattern_str``.
    """
    quoted_type = quote(type)
    quoted_pattern = quote(pattern)
    return datasets_type_pattern_str % (quoted_type, quoted_pattern)
149 | |
150 | |
def strip_first_url_dir(path):
    """Drop everything before the first '/' in ``path``.

    The slash itself is kept, so "a/b/c" becomes "/b/c".  A path that
    contains no slash yields the empty string.
    """
    _, slash, remainder = path.partition('/')
    return slash + remainder
153 | |
154 # proj/<proj>/xena/<proj>/<path> | |
155 # download/<proj>/xena/<path> | |
def name_to_url(base_url, name):
    """Build a download url for ``name`` from a server's base url.

    Every '/proj/' in ``base_url`` is replaced by '/download/', and
    ``name`` is appended with its first path component removed.
    """
    download_base = base_url.replace('/proj/', '/download/')
    relative_path = strip_first_url_dir(name)
    return download_base + relative_path