ucsc_xena_platform: xena_query.py comparison

comparison xena_query.py @ 41:02b0824c7d60

Download data from any hub in the federated xena platform

author	jingchunzhu <jingchunzhu@gmail.com>
date	Mon, 27 Jul 2015 10:05:22 -0700
parents	8bb037f88ed2
children

comparison

Legend: equal | deleted | inserted | replaced

-:fd24e220f240
+:02b0824c7d60
 >>> import json
 >>> json.loads(r)
 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']}
 """
-import urllib2
+import urllib2, json
 import re
 def compose1(f, g):
 def composed(*args, **kwargs):
 return f(g(*args, **kwargs))
 #
 # Note the :limit on the table scan. This makes the table scan exit after we've
 # found enough values, rather than continuing to the end. We can do this because
 # enumerated values are unique. An alternative would be to index all the enumerated
 # values in the db.
-sample_query_str = """
-(let [cohort %s
-field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]]
-:from [:dataset]
-:join [:field [:= :dataset_id :dataset.id]]
-:where [:and [:= :cohort cohort]
-[:= :field.name %s]]}))
-values %s
-field_id (:field_id field_id-dataset)
-dataset (:dataset field_id-dataset)
-sample (:id (car (query {:select [:field.id]
-:from [:field]
-:where [:and [:= :dataset_id dataset]
-[:= :field.name "sampleID"]]})))
-N (- (:rows (car (query {:select [:rows]
-:from [:dataset]
-:where [:= :id dataset]}))) 1)]
-{cohort (map :value (query {:select [:value]
-:from [{:select [:x #sql/call [:unpack field_id, :x]]
-:from [#sql/call [:system_range 0 N]]
-:where [:in #sql/call [:unpack field_id, :x] {:select [:ordering]
-:from [:code]
-:where [:and [:= :field_id field_id]
-[:in :value values]]
-:limit (count values)}]}]
-:join [:code [:and [:= :field_id sample]
-[:= :ordering #sql/call [:unpack sample :x]]]]}))})
-"""
 cohort_query_str = """
 (map :cohort (query {:select [:%distinct.cohort]
 :from [:dataset]
 :where [:not [:is nil :cohort]]}))
 """
-datasets_list_in_cohort_query = """
+datasets_list_in_cohort_str ="""
-(map :text (query {:select [:text]
+(map :name (query {:select [:name :type :datasubtype :probemap :text :status]
 :from [:dataset]
-:where [:= :cohort %s ]})
+:where [:= :cohort %s]}))
 """
-datasets_type_pattern_str = """
+dataset_type_str = """
-(map :name (query {:select [:name]
+(map :type (query {:select [:type]
 :from [:dataset]
-:where [:and [:= :type %s]
+:where [:= :name %s]}))
-[:like :name %s]]}))
 """
+dataset_field_str = """
+(map :name (query {:select [:field.name]
+:from [:dataset]
+:join [:field [:= :dataset.id :dataset_id]]
+:where [:= :dataset.name %s]}))
+"""
+dataset_samples_str = """
+(map :value (query {:select [:value]
+:from [:dataset]
+:join [:field [:= :dataset.id :dataset_id]
+:code [:= :field.id :field_id]]
+:where [:and
+[:= :dataset.name %s]
+[:= :field.name "sampleID"]]}))
+"""
+dataset_probe_str = """
+(fetch [{:table %s
+:columns %s
+:samples %s}])
+"""
 def find_sample_by_field_query(cohort, field, values):
 """Return a xena query which looks up sample ids for the given field=values."""
 return sample_query_str % (quote(cohort), quote(field), array_fmt(values))
 req = urllib2.Request(url + '/data/', query, headers)
 response = urllib2.urlopen(req)
 result = response.read()
 return result
-def find_cohorts():
+def find_cohorts(url):
 """ Return a list of cohorts on a host at a specific url """
 """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """
-return cohort_query_str
+return json.loads(post(url,cohort_query_str))
-def find_datasets_in_cohort(url, cohort):
+def dataset_field(host, dataset):
-""" Return a list of datasets in a specific cohort on server=url.
+"""return probes or features of a dataset"""
-Each dataset is a dictionary of the data's metadata.
+return json.loads(post(host, dataset_field_str % (quote(dataset))))
-This should be refactored to be consistent with the other methods."""
-return map(json.loads,
-json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort)))))
-def find_datasets_type_pattern(type, pattern):
+def datasets_list_in_cohort (host, cohort):
-"""Return a xena query which returns a list of datasets
+"""return datasets in a cohort"""
-filtered by a pattern on the dataset name. The pattern is sql:
+return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort))))
-% is wildcard."""
-return datasets_type_pattern_str % (quote(type), quote(pattern))
+def dataset_samples (host, dataset):
+return json.loads(post(host, dataset_samples_str % (quote(dataset))))
+def dataset_probe_values (host, dataset, samples, probes):
+return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples))))
+def dataset_type (host, dataset):
+return json.loads(post(host, dataset_type_str % (quote(dataset))))
 def strip_first_url_dir(path):
 return re.sub(r'^[^/]*', '', path)
 # proj/<proj>/xena/<proj>/<path>

Mercurial > repos > melissacline > ucsc_xena_platform

comparison xena_query.py @ 41:02b0824c7d60