Mercurial > repos > melissacline > ucsc_xena_platform
diff xena_query.py @ 41:02b0824c7d60
Download data from any hub in the federated xena platform
| | |
|---|---|
| author | jingchunzhu <jingchunzhu@gmail.com> |
| date | Mon, 27 Jul 2015 10:05:22 -0700 |
| parents | 8bb037f88ed2 |
| children | |
line wrap: on
line diff
--- a/xena_query.py Mon Jul 27 00:59:02 2015 -0700 +++ b/xena_query.py Mon Jul 27 10:05:22 2015 -0700 @@ -36,7 +36,7 @@ {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']} """ -import urllib2 +import urllib2, json import re def compose1(f, g): @@ -64,34 +64,6 @@ # found enough values, rather than continuing to the end. We can do this because # enumerated values are unique. An alternative would be to index all the enumerated # values in the db. -sample_query_str = """ -(let [cohort %s - field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]] - :from [:dataset] - :join [:field [:= :dataset_id :dataset.id]] - :where [:and [:= :cohort cohort] - [:= :field.name %s]]})) - values %s - field_id (:field_id field_id-dataset) - dataset (:dataset field_id-dataset) - sample (:id (car (query {:select [:field.id] - :from [:field] - :where [:and [:= :dataset_id dataset] - [:= :field.name "sampleID"]]}))) - N (- (:rows (car (query {:select [:rows] - :from [:dataset] - :where [:= :id dataset]}))) 1)] - {cohort (map :value (query {:select [:value] - :from [{:select [:x #sql/call [:unpack field_id, :x]] - :from [#sql/call [:system_range 0 N]] - :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering] - :from [:code] - :where [:and [:= :field_id field_id] - [:in :value values]] - :limit (count values)}]}] - :join [:code [:and [:= :field_id sample] - [:= :ordering #sql/call [:unpack sample :x]]]]}))}) -""" cohort_query_str = """ (map :cohort (query {:select [:%distinct.cohort] @@ -99,19 +71,42 @@ :where [:not [:is nil :cohort]]})) """ -datasets_list_in_cohort_query = """ -(map :text (query {:select [:text] +datasets_list_in_cohort_str =""" +(map :name (query {:select [:name :type :datasubtype :probemap :text :status] + :from [:dataset] + :where [:= :cohort %s]})) +""" + +dataset_type_str = """ +(map :type (query {:select [:type] :from [:dataset] - :where [:= :cohort %s ]}) + 
:where [:= :name %s]})) """ -datasets_type_pattern_str = """ -(map :name (query {:select [:name] - :from [:dataset] - :where [:and [:= :type %s] - [:like :name %s]]})) +dataset_field_str = """ +(map :name (query {:select [:field.name] + :from [:dataset] + :join [:field [:= :dataset.id :dataset_id]] + :where [:= :dataset.name %s]})) """ +dataset_samples_str = """ +(map :value (query {:select [:value] + :from [:dataset] + :join [:field [:= :dataset.id :dataset_id] + :code [:= :field.id :field_id]] + :where [:and + [:= :dataset.name %s] + [:= :field.name "sampleID"]]})) +""" + +dataset_probe_str = """ + (fetch [{:table %s + :columns %s + :samples %s}]) +""" + + def find_sample_by_field_query(cohort, field, values): """Return a xena query which looks up sample ids for the given field=values.""" return sample_query_str % (quote(cohort), quote(field), array_fmt(values)) @@ -129,24 +124,27 @@ result = response.read() return result -def find_cohorts(): +def find_cohorts(url): """ Return a list of cohorts on a host at a specific url """ """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """ - return cohort_query_str + return json.loads(post(url,cohort_query_str)) + +def dataset_field(host, dataset): + """return probes or features of a dataset""" + return json.loads(post(host, dataset_field_str % (quote(dataset)))) -def find_datasets_in_cohort(url, cohort): - """ Return a list of datasets in a specific cohort on server=url. - Each dataset is a dictionary of the data's metadata. - This should be refactored to be consistent with the other methods.""" - return map(json.loads, - json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort))))) +def datasets_list_in_cohort (host, cohort): + """return datasets in a cohort""" + return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort)))) -def find_datasets_type_pattern(type, pattern): - """Return a xena query which returns a list of datasets - filtered by a pattern on the dataset name. 
The pattern is sql: - % is wildcard.""" - return datasets_type_pattern_str % (quote(type), quote(pattern)) +def dataset_samples (host, dataset): + return json.loads(post(host, dataset_samples_str % (quote(dataset)))) +def dataset_probe_values (host, dataset, samples, probes): + return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples)))) + +def dataset_type (host, dataset): + return json.loads(post(host, dataset_type_str % (quote(dataset)))) def strip_first_url_dir(path): return re.sub(r'^[^/]*', '', path)