annotate xena_query.py @ 53:09fdd4d23a3a

version 17
author jingchunzhu
date Mon, 21 Sep 2015 13:25:05 -0700
parents 02b0824c7d60
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
1 """
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
2 Utilities for xena queries.
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
3
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
4 A basic query example.
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
5 Queries are scheme expressions.
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
6
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
7 >>> import xena_query as xena
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
8 >>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", "(+ 1 2)")
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
9 '3.0'
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
10
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
11 >>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", "(let [x 2 y (+ x 3)] (* x y))")
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
12 '10.0'
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
13
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
14 Looking up sample ids for the TCGA LGG cohort.
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
15
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
16 >>> r = xena.post("https://genome-cancer.ucsc.edu/proj/public/xena",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
17 xena.patient_to_sample_query("TCGA.LGG.sampleMap",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
18 ["TCGA-CS-4938",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
19 "TCGA-HT-7693",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
20 "TCGA-CS-6665",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
21 "TCGA-S9-A7J2",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
22 "TCGA-FG-A6J3"]))
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
23 '{"TCGA.LGG.sampleMap":["TCGA-CS-4938-01","TCGA-CS-6665-01","TCGA-FG-A6J3-01","TCGA-HT-7693-01","TCGA-S9-A7J2-01"]}'
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
24
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
25 >>> r = xena.post("https://genome-cancer.ucsc.edu/proj/public/xena",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
26 xena.find_sample_by_field_query("TCGA.LGG.sampleMap",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
27 "_PATIENT",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
28 ["TCGA-CS-4938",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
29 "TCGA-HT-7693",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
30 "TCGA-CS-6665",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
31 "TCGA-S9-A7J2",
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
32 "TCGA-FG-A6J3"]))
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
33 '{"TCGA.LGG.sampleMap":["TCGA-CS-4938-01","TCGA-CS-6665-01","TCGA-FG-A6J3-01","TCGA-HT-7693-01","TCGA-S9-A7J2-01"]}'
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
34 >>> import json
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
35 >>> json.loads(r)
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
36 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']}
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
37 """
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
38
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
39 import urllib2, json
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
40 import re
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
41
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def compose1(f, g):
    """Compose two callables.

    The returned callable forwards all of its positional and keyword
    arguments to ``g`` and then passes g's result, as a single argument,
    to ``f``: ``compose1(f, g)(a, ...) == f(g(a, ...))``.
    """
    def composed(*args, **kwargs):
        inner_result = g(*args, **kwargs)
        return f(inner_result)

    return composed
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
46
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
47 # functional composition, e.g.
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
48 # compose(f, g)(a, ...) == f(g(a, ...))
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def compose(*funcs):
    """Compose any number of callables, rightmost applied first.

    ``compose(f, g, h)(a, ...)`` is ``f(g(h(a, ...)))``.  The functions are
    folded pairwise with compose1 via reduce.
    """
    return reduce(compose1, funcs)
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
50
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def quote(s):
    """Return ``s`` wrapped in double quotes for use as a query string literal.

    Backslashes and double quotes inside ``s`` are backslash-escaped so that
    a value containing ``"`` cannot terminate the literal early and alter the
    surrounding query.  Strings without those characters are returned exactly
    as before (just wrapped in quotes).

    NOTE(review): assumes the server-side reader honours ``\\"`` and ``\\\\``
    escapes inside string literals (as Clojure/EDN readers do) -- confirm.
    """
    # Escape backslashes first so the backslashes added for quotes are not
    # themselves doubled.
    escaped = s.replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
53
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def array_fmt(l):
    """Format an iterable of strings as a bracketed list of quoted items.

    Each item is passed through quote(); the results are joined with ', '
    and enclosed in square brackets, e.g. ``["a", "b"]``.
    """
    quoted_items = [quote(item) for item in l]
    return '[' + ', '.join(quoted_items) + ']'
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
56
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
57 # The strategy here is
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
58 # o Do table scan on code to find codes matching field values
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
59 # o Do IN query on unpack(field, x) to find rows matching codes
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
60 # o Project to unpack(sample, x) to get sampleID code
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
61 # o Join with code to get sampleID values
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
62 #
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
63 # Note the :limit on the table scan. This makes the table scan exit after we've
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
64 # found enough values, rather than continuing to the end. We can do this because
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
65 # enumerated values are unique. An alternative would be to index all the enumerated
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
66 # values in the db.
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
67
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
# Query selecting the distinct, non-nil :cohort values from the :dataset
# table.  It has no placeholders; find_cohorts() posts it unchanged.
cohort_query_str = """
(map :cohort (query {:select [:%distinct.cohort]
:from [:dataset]
:where [:not [:is nil :cohort]]}))
"""
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
73
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
# Query template: selects the :dataset rows whose :cohort equals the single
# %s placeholder (an already-quoted cohort name) and maps each row to :name.
datasets_list_in_cohort_str ="""
(map :name (query {:select [:name :type :datasubtype :probemap :text :status]
:from [:dataset]
:where [:= :cohort %s]}))
"""
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
79
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
# Query template: selects :type from the :dataset rows whose :name equals the
# single %s placeholder (an already-quoted dataset name).
dataset_type_str = """
(map :type (query {:select [:type]
:from [:dataset]
:where [:= :name %s]}))
"""
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
85
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
# Query template: joins :dataset to :field on the dataset id and selects the
# field names for the dataset whose name equals the single %s placeholder
# (an already-quoted dataset name).
dataset_field_str = """
(map :name (query {:select [:field.name]
:from [:dataset]
:join [:field [:= :dataset.id :dataset_id]]
:where [:= :dataset.name %s]}))
"""
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
92
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
# Query template: joins :dataset -> :field -> :code and selects the :value
# entries of the field named "sampleID" for the dataset whose name equals the
# single %s placeholder (an already-quoted dataset name).
dataset_samples_str = """
(map :value (query {:select [:value]
:from [:dataset]
:join [:field [:= :dataset.id :dataset_id]
:code [:= :field.id :field_id]]
:where [:and
[:= :dataset.name %s]
[:= :field.name "sampleID"]]}))
"""
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
102
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
# Fetch template with three %s placeholders, in order: :table (quoted dataset
# name), :columns (formatted list of probes) and :samples (formatted list of
# samples).  dataset_probe_values() fills them in that order.
dataset_probe_str = """
(fetch [{:table %s
:columns %s
:samples %s}])
"""
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
108
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
109
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def find_sample_by_field_query(cohort, field, values):
    """Return a xena query which looks up sample ids for the given field=values.

    The cohort and field names are wrapped with quote() and the values are
    formatted with array_fmt() before being substituted, in that order, into
    sample_query_str.
    """
    # NOTE(review): sample_query_str is not defined in the visible part of
    # this module -- confirm it exists, otherwise this call raises NameError.
    template_args = (quote(cohort), quote(field), array_fmt(values))
    return sample_query_str % template_args
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
113
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def patient_to_sample_query(cohort, patients):
    """Return a xena query which looks up sample ids for the given patients.

    This is find_sample_by_field_query() with the field fixed to "_PATIENT".
    """
    patient_field = "_PATIENT"
    return find_sample_by_field_query(cohort, patient_field, patients)
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
117
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
# HTTP headers attached to the request built in post(): the query body is
# declared as plain text.
headers = { 'Content-Type' : "text/plain" }
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
119
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def post(url, query):
    """POST a xena data query to the given url.

    The query text is sent as the request body to ``url + '/data/'`` together
    with the module-level ``headers``.  The raw response body is returned
    without decoding or parsing.  No exceptions are caught here, so errors
    raised by urllib2 propagate to the caller.
    """
    req = urllib2.Request(url + '/data/', query, headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the underlying connection even when read() fails; the
        # original never closed the response.
        response.close()
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
126
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
def find_cohorts(url):
    """Return a list of cohorts on a host at a specific url.

    Posts cohort_query_str to the host and decodes the JSON reply.

    Return example: ["chinSF2007_public", "TCGA.BRCA.sampleMap", "cohort3"]
    """
    raw_reply = post(url, cohort_query_str)
    return json.loads(raw_reply)
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
131
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
def dataset_field(host, dataset):
    """Return the probes or features of a dataset.

    Fills dataset_field_str with the quoted dataset name, posts it to
    ``host`` and decodes the JSON reply.
    """
    query = dataset_field_str % (quote(dataset),)
    raw_reply = post(host, query)
    return json.loads(raw_reply)
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
135
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
def datasets_list_in_cohort(host, cohort):
    """Return the datasets in a cohort.

    Fills datasets_list_in_cohort_str with the quoted cohort name, posts it
    to ``host`` and decodes the JSON reply.
    """
    query = datasets_list_in_cohort_str % (quote(cohort),)
    raw_reply = post(host, query)
    return json.loads(raw_reply)
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
139
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
def dataset_samples(host, dataset):
    """Return the decoded JSON reply to the sample query for a dataset.

    Fills dataset_samples_str with the quoted dataset name and posts it to
    ``host``.
    """
    query = dataset_samples_str % (quote(dataset),)
    raw_reply = post(host, query)
    return json.loads(raw_reply)
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
142
41
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
def dataset_probe_values(host, dataset, samples, probes):
    """Return the decoded JSON reply to a fetch of probes x samples.

    dataset_probe_str is filled with the quoted dataset name as :table, the
    formatted ``probes`` as :columns and the formatted ``samples`` as
    :samples, then posted to ``host``.
    """
    table = quote(dataset)
    columns = array_fmt(probes)
    sample_list = array_fmt(samples)
    query = dataset_probe_str % (table, columns, sample_list)
    return json.loads(post(host, query))
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
145
02b0824c7d60 Download data from any hub in the federated xena platform
jingchunzhu <jingchunzhu@gmail.com>
parents: 0
diff changeset
def dataset_type(host, dataset):
    """Return the decoded JSON reply to the type query for a dataset.

    Fills dataset_type_str with the quoted dataset name and posts it to
    ``host``.
    """
    query = dataset_type_str % (quote(dataset),)
    raw_reply = post(host, query)
    return json.loads(raw_reply)
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
148
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def strip_first_url_dir(path):
    """Drop everything in ``path`` that precedes its first '/'.

    The slash itself is kept, so 'a/b/c' becomes '/b/c' and a path that
    already starts with '/' is returned unchanged.  A path containing no
    '/' at all yields the empty string.
    """
    _first_dir, slash, remainder = path.partition('/')
    return slash + remainder
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
151
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
152 # proj/<proj>/xena/<proj>/<path>
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
153 # download/<proj>/xena/<path>
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def name_to_url(base_url, name):
    """Build a download URL from ``base_url`` and ``name``.

    Every '/proj/' in ``base_url`` is replaced by '/download/', and ``name``
    is appended after its leading path component has been removed with
    strip_first_url_dir().
    """
    download_base = base_url.replace('/proj/', '/download/')
    relative_path = strip_first_url_dir(name)
    return download_base + relative_path