comparison xena_query.py @ 41:02b0824c7d60

Download data from any hub in the federated xena platform
author jingchunzhu <jingchunzhu@gmail.com>
date Mon, 27 Jul 2015 10:05:22 -0700
parents 8bb037f88ed2
children
comparison
equal deleted inserted replaced
40:fd24e220f240 41:02b0824c7d60
34 >>> import json 34 >>> import json
35 >>> json.loads(r) 35 >>> json.loads(r)
36 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']} 36 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']}
37 """ 37 """
38 38
39 import urllib2 39 import urllib2, json
40 import re 40 import re
41 41
42 def compose1(f, g): 42 def compose1(f, g):
43 def composed(*args, **kwargs): 43 def composed(*args, **kwargs):
44 return f(g(*args, **kwargs)) 44 return f(g(*args, **kwargs))
62 # 62 #
63 # Note the :limit on the table scan. This makes the table scan exit after we've 63 # Note the :limit on the table scan. This makes the table scan exit after we've
64 # found enough values, rather than continuing to the end. We can do this because 64 # found enough values, rather than continuing to the end. We can do this because
65 # enumerated values are unique. An alternative would be to index all the enumerated 65 # enumerated values are unique. An alternative would be to index all the enumerated
66 # values in the db. 66 # values in the db.
67 sample_query_str = """
68 (let [cohort %s
69 field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]]
70 :from [:dataset]
71 :join [:field [:= :dataset_id :dataset.id]]
72 :where [:and [:= :cohort cohort]
73 [:= :field.name %s]]}))
74 values %s
75 field_id (:field_id field_id-dataset)
76 dataset (:dataset field_id-dataset)
77 sample (:id (car (query {:select [:field.id]
78 :from [:field]
79 :where [:and [:= :dataset_id dataset]
80 [:= :field.name "sampleID"]]})))
81 N (- (:rows (car (query {:select [:rows]
82 :from [:dataset]
83 :where [:= :id dataset]}))) 1)]
84 {cohort (map :value (query {:select [:value]
85 :from [{:select [:x #sql/call [:unpack field_id, :x]]
86 :from [#sql/call [:system_range 0 N]]
87 :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering]
88 :from [:code]
89 :where [:and [:= :field_id field_id]
90 [:in :value values]]
91 :limit (count values)}]}]
92 :join [:code [:and [:= :field_id sample]
93 [:= :ordering #sql/call [:unpack sample :x]]]]}))})
94 """
95 67
96 cohort_query_str = """ 68 cohort_query_str = """
97 (map :cohort (query {:select [:%distinct.cohort] 69 (map :cohort (query {:select [:%distinct.cohort]
98 :from [:dataset] 70 :from [:dataset]
99 :where [:not [:is nil :cohort]]})) 71 :where [:not [:is nil :cohort]]}))
100 """ 72 """
101 73
102 datasets_list_in_cohort_query = """ 74 datasets_list_in_cohort_str ="""
103 (map :text (query {:select [:text] 75 (map :name (query {:select [:name :type :datasubtype :probemap :text :status]
104 :from [:dataset] 76 :from [:dataset]
105 :where [:= :cohort %s ]}) 77 :where [:= :cohort %s]}))
106 """ 78 """
107 79
108 datasets_type_pattern_str = """ 80 dataset_type_str = """
109 (map :name (query {:select [:name] 81 (map :type (query {:select [:type]
110 :from [:dataset] 82 :from [:dataset]
111 :where [:and [:= :type %s] 83 :where [:= :name %s]}))
112 [:like :name %s]]}))
113 """ 84 """
85
86 dataset_field_str = """
87 (map :name (query {:select [:field.name]
88 :from [:dataset]
89 :join [:field [:= :dataset.id :dataset_id]]
90 :where [:= :dataset.name %s]}))
91 """
92
93 dataset_samples_str = """
94 (map :value (query {:select [:value]
95 :from [:dataset]
96 :join [:field [:= :dataset.id :dataset_id]
97 :code [:= :field.id :field_id]]
98 :where [:and
99 [:= :dataset.name %s]
100 [:= :field.name "sampleID"]]}))
101 """
102
103 dataset_probe_str = """
104 (fetch [{:table %s
105 :columns %s
106 :samples %s}])
107 """
108
114 109
115 def find_sample_by_field_query(cohort, field, values): 110 def find_sample_by_field_query(cohort, field, values):
116 """Return a xena query which looks up sample ids for the given field=values.""" 111 """Return a xena query which looks up sample ids for the given field=values."""
117 return sample_query_str % (quote(cohort), quote(field), array_fmt(values)) 112 return sample_query_str % (quote(cohort), quote(field), array_fmt(values))
118 113
127 req = urllib2.Request(url + '/data/', query, headers) 122 req = urllib2.Request(url + '/data/', query, headers)
128 response = urllib2.urlopen(req) 123 response = urllib2.urlopen(req)
129 result = response.read() 124 result = response.read()
130 return result 125 return result
131 126
132 def find_cohorts(): 127 def find_cohorts(url):
133 """ Return a list of cohorts on a host at a specific url """ 128 """ Return a list of cohorts on a host at a specific url """
134 """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """ 129 """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """
135 return cohort_query_str 130 return json.loads(post(url,cohort_query_str))
136 131
137 def find_datasets_in_cohort(url, cohort): 132 def dataset_field(host, dataset):
138 """ Return a list of datasets in a specific cohort on server=url. 133 """return probes or features of a dataset"""
139 Each dataset is a dictionary of the data's metadata. 134 return json.loads(post(host, dataset_field_str % (quote(dataset))))
140 This should be refactored to be consistent with the other methods."""
141 return map(json.loads,
142 json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort)))))
143 135
144 def find_datasets_type_pattern(type, pattern): 136 def datasets_list_in_cohort (host, cohort):
145 """Return a xena query which returns a list of datasets 137 """return datasets in a cohort"""
146 filtered by a pattern on the dataset name. The pattern is sql: 138 return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort))))
147 % is wildcard."""
148 return datasets_type_pattern_str % (quote(type), quote(pattern))
149 139
140 def dataset_samples (host, dataset):
141 return json.loads(post(host, dataset_samples_str % (quote(dataset))))
142
143 def dataset_probe_values (host, dataset, samples, probes):
144 return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples))))
145
146 def dataset_type (host, dataset):
147 return json.loads(post(host, dataset_type_str % (quote(dataset))))
150 148
151 def strip_first_url_dir(path): 149 def strip_first_url_dir(path):
152 return re.sub(r'^[^/]*', '', path) 150 return re.sub(r'^[^/]*', '', path)
153 151
154 # proj/<proj>/xena/<proj>/<path> 152 # proj/<proj>/xena/<proj>/<path>