Mercurial > repos > melissacline > ucsc_xena_platform
comparison xena_query.py @ 41:02b0824c7d60
Download data from any hub in the federated xena platform
author | jingchunzhu <jingchunzhu@gmail.com> |
---|---|
date | Mon, 27 Jul 2015 10:05:22 -0700 |
parents | 8bb037f88ed2 |
children |
comparison
equal
deleted
inserted
replaced
40:fd24e220f240 | 41:02b0824c7d60 |
---|---|
34 >>> import json | 34 >>> import json |
35 >>> json.loads(r) | 35 >>> json.loads(r) |
36 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']} | 36 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']} |
37 """ | 37 """ |
38 | 38 |
39 import urllib2 | 39 import urllib2, json |
40 import re | 40 import re |
41 | 41 |
42 def compose1(f, g): | 42 def compose1(f, g): |
43 def composed(*args, **kwargs): | 43 def composed(*args, **kwargs): |
44 return f(g(*args, **kwargs)) | 44 return f(g(*args, **kwargs)) |
62 # | 62 # |
63 # Note the :limit on the table scan. This makes the table scan exit after we've | 63 # Note the :limit on the table scan. This makes the table scan exit after we've |
64 # found enough values, rather than continuing to the end. We can do this because | 64 # found enough values, rather than continuing to the end. We can do this because |
65 # enumerated values are unique. An alternative would be to index all the enumerated | 65 # enumerated values are unique. An alternative would be to index all the enumerated |
66 # values in the db. | 66 # values in the db. |
67 sample_query_str = """ | |
68 (let [cohort %s | |
69 field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]] | |
70 :from [:dataset] | |
71 :join [:field [:= :dataset_id :dataset.id]] | |
72 :where [:and [:= :cohort cohort] | |
73 [:= :field.name %s]]})) | |
74 values %s | |
75 field_id (:field_id field_id-dataset) | |
76 dataset (:dataset field_id-dataset) | |
77 sample (:id (car (query {:select [:field.id] | |
78 :from [:field] | |
79 :where [:and [:= :dataset_id dataset] | |
80 [:= :field.name "sampleID"]]}))) | |
81 N (- (:rows (car (query {:select [:rows] | |
82 :from [:dataset] | |
83 :where [:= :id dataset]}))) 1)] | |
84 {cohort (map :value (query {:select [:value] | |
85 :from [{:select [:x #sql/call [:unpack field_id, :x]] | |
86 :from [#sql/call [:system_range 0 N]] | |
87 :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering] | |
88 :from [:code] | |
89 :where [:and [:= :field_id field_id] | |
90 [:in :value values]] | |
91 :limit (count values)}]}] | |
92 :join [:code [:and [:= :field_id sample] | |
93 [:= :ordering #sql/call [:unpack sample :x]]]]}))}) | |
94 """ | |
95 | 67 |
96 cohort_query_str = """ | 68 cohort_query_str = """ |
97 (map :cohort (query {:select [:%distinct.cohort] | 69 (map :cohort (query {:select [:%distinct.cohort] |
98 :from [:dataset] | 70 :from [:dataset] |
99 :where [:not [:is nil :cohort]]})) | 71 :where [:not [:is nil :cohort]]})) |
100 """ | 72 """ |
101 | 73 |
102 datasets_list_in_cohort_query = """ | 74 datasets_list_in_cohort_str =""" |
103 (map :text (query {:select [:text] | 75 (map :name (query {:select [:name :type :datasubtype :probemap :text :status] |
104 :from [:dataset] | 76 :from [:dataset] |
105 :where [:= :cohort %s ]}) | 77 :where [:= :cohort %s]})) |
106 """ | 78 """ |
107 | 79 |
108 datasets_type_pattern_str = """ | 80 dataset_type_str = """ |
109 (map :name (query {:select [:name] | 81 (map :type (query {:select [:type] |
110 :from [:dataset] | 82 :from [:dataset] |
111 :where [:and [:= :type %s] | 83 :where [:= :name %s]})) |
112 [:like :name %s]]})) | |
113 """ | 84 """ |
85 | |
86 dataset_field_str = """ | |
87 (map :name (query {:select [:field.name] | |
88 :from [:dataset] | |
89 :join [:field [:= :dataset.id :dataset_id]] | |
90 :where [:= :dataset.name %s]})) | |
91 """ | |
92 | |
93 dataset_samples_str = """ | |
94 (map :value (query {:select [:value] | |
95 :from [:dataset] | |
96 :join [:field [:= :dataset.id :dataset_id] | |
97 :code [:= :field.id :field_id]] | |
98 :where [:and | |
99 [:= :dataset.name %s] | |
100 [:= :field.name "sampleID"]]})) | |
101 """ | |
102 | |
103 dataset_probe_str = """ | |
104 (fetch [{:table %s | |
105 :columns %s | |
106 :samples %s}]) | |
107 """ | |
108 | |
114 | 109 |
115 def find_sample_by_field_query(cohort, field, values): | 110 def find_sample_by_field_query(cohort, field, values): |
116 """Return a xena query which looks up sample ids for the given field=values.""" | 111 """Return a xena query which looks up sample ids for the given field=values.""" |
117 return sample_query_str % (quote(cohort), quote(field), array_fmt(values)) | 112 return sample_query_str % (quote(cohort), quote(field), array_fmt(values)) |
118 | 113 |
127 req = urllib2.Request(url + '/data/', query, headers) | 122 req = urllib2.Request(url + '/data/', query, headers) |
128 response = urllib2.urlopen(req) | 123 response = urllib2.urlopen(req) |
129 result = response.read() | 124 result = response.read() |
130 return result | 125 return result |
131 | 126 |
132 def find_cohorts(): | 127 def find_cohorts(url): |
133 """ Return a list of cohorts on a host at a specific url """ | 128 """ Return a list of cohorts on a host at a specific url """ |
134 """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """ | 129 """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """ |
135 return cohort_query_str | 130 return json.loads(post(url,cohort_query_str)) |
136 | 131 |
137 def find_datasets_in_cohort(url, cohort): | 132 def dataset_field(host, dataset): |
138 """ Return a list of datasets in a specific cohort on server=url. | 133 """return probes or features of a dataset""" |
139 Each dataset is a dictionary of the data's metadata. | 134 return json.loads(post(host, dataset_field_str % (quote(dataset)))) |
140 This should be refactored to be consistent with the other methods.""" | |
141 return map(json.loads, | |
142 json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort))))) | |
143 | 135 |
144 def find_datasets_type_pattern(type, pattern): | 136 def datasets_list_in_cohort (host, cohort): |
145 """Return a xena query which returns a list of datasets | 137 """return datasets in a cohort""" |
146 filtered by a pattern on the dataset name. The pattern is sql: | 138 return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort)))) |
147 % is wildcard.""" | |
148 return datasets_type_pattern_str % (quote(type), quote(pattern)) | |
149 | 139 |
140 def dataset_samples (host, dataset): | |
141 return json.loads(post(host, dataset_samples_str % (quote(dataset)))) | |
142 | |
143 def dataset_probe_values (host, dataset, samples, probes): | |
144 return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples)))) | |
145 | |
146 def dataset_type (host, dataset): | |
147 return json.loads(post(host, dataset_type_str % (quote(dataset)))) | |
150 | 148 |
151 def strip_first_url_dir(path): | 149 def strip_first_url_dir(path): |
152 return re.sub(r'^[^/]*', '', path) | 150 return re.sub(r'^[^/]*', '', path) |
153 | 151 |
154 # proj/<proj>/xena/<proj>/<path> | 152 # proj/<proj>/xena/<proj>/<path> |