diff xena_query.py @ 41:02b0824c7d60

Download data from any hub in the federated xena platform
author jingchunzhu <jingchunzhu@gmail.com>
date Mon, 27 Jul 2015 10:05:22 -0700
parents 8bb037f88ed2
children
line wrap: on
line diff
--- a/xena_query.py	Mon Jul 27 00:59:02 2015 -0700
+++ b/xena_query.py	Mon Jul 27 10:05:22 2015 -0700
@@ -36,7 +36,7 @@
 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']}
 """
 
-import urllib2
+import urllib2, json
 import re
 
 def compose1(f, g):
@@ -64,34 +64,6 @@
 # found enough values, rather than continuing to the end. We can do this because
 # enumerated values are unique. An alternative would be to index all the enumerated
 # values in the db.
-sample_query_str = """
-(let [cohort %s
-      field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]]
-                                    :from [:dataset]
-                                    :join [:field [:= :dataset_id :dataset.id]]
-                                    :where [:and [:= :cohort cohort]
-                                                 [:= :field.name %s]]}))
-      values %s
-      field_id (:field_id field_id-dataset)
-      dataset (:dataset field_id-dataset)
-      sample (:id (car (query {:select [:field.id]
-                               :from [:field]
-                               :where [:and [:= :dataset_id dataset]
-                                            [:= :field.name "sampleID"]]})))
-      N (- (:rows (car (query {:select [:rows]
-                               :from [:dataset]
-                               :where [:= :id dataset]}))) 1)]
-  {cohort (map :value (query {:select [:value]
-                              :from [{:select [:x #sql/call [:unpack field_id, :x]]
-                                      :from [#sql/call [:system_range 0 N]]
-                                      :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering]
-                                                                                             :from [:code]
-                                                                                             :where [:and [:= :field_id field_id]
-                                                                                                          [:in :value values]]
-                                                                                             :limit (count values)}]}]
-                              :join [:code [:and [:= :field_id sample]
-                                                 [:= :ordering #sql/call [:unpack sample :x]]]]}))})
-"""
 
 cohort_query_str = """
 (map :cohort (query {:select [:%distinct.cohort]
@@ -99,19 +71,42 @@
                      :where [:not [:is nil :cohort]]}))
 """
 
-datasets_list_in_cohort_query = """
-(map :text (query {:select [:text]
+datasets_list_in_cohort_str ="""
+(map :name (query {:select [:name :type :datasubtype :probemap :text :status]
+      :from [:dataset]
+      :where [:= :cohort %s]}))
+"""
+
+dataset_type_str = """
+(map :type (query {:select [:type]
                    :from [:dataset]
-                   :where [:= :cohort %s ]})
+                   :where [:= :name %s]}))
 """
 
-datasets_type_pattern_str = """
-(map :name (query {:select [:name]
-                   :from [:dataset]
-                   :where [:and [:= :type %s]
-                                [:like :name %s]]}))
+dataset_field_str = """
+(map :name (query {:select [:field.name]
+             :from [:dataset]
+             :join [:field [:= :dataset.id :dataset_id]]
+             :where [:= :dataset.name %s]}))
 """
 
+dataset_samples_str = """
+(map :value (query {:select [:value]
+            :from [:dataset]
+            :join [:field [:= :dataset.id :dataset_id]
+            :code [:= :field.id :field_id]]
+            :where [:and
+            [:= :dataset.name %s]
+            [:= :field.name "sampleID"]]}))
+"""
+
+dataset_probe_str = """
+    (fetch [{:table %s
+          :columns %s
+          :samples %s}])
+"""
+
+
 def find_sample_by_field_query(cohort, field, values):
     """Return a xena query which looks up sample ids for the given field=values."""
     return sample_query_str % (quote(cohort), quote(field), array_fmt(values))
@@ -129,24 +124,27 @@
     result = response.read()
     return result
 
-def find_cohorts():
+def find_cohorts(url):
     """ Return a list of cohorts on a host at a specific url """
     """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """
-    return cohort_query_str
+    return json.loads(post(url,cohort_query_str))
+
+def dataset_field(host, dataset):
+    """Return the field names (probes or features) of a dataset."""
+    return json.loads(post(host, dataset_field_str % (quote(dataset))))
 
-def find_datasets_in_cohort(url, cohort):
-    """ Return a list of datasets in a specific cohort on server=url.
-    Each dataset is a dictionary of the data's metadata.
-    This should be refactored to be consistent with the other methods."""
-    return map(json.loads,
-            json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort)))))
+def datasets_list_in_cohort (host, cohort):
+    """Return the names of the datasets in a cohort."""
+    return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort))))
 
-def find_datasets_type_pattern(type, pattern):
-    """Return a xena query which returns a list of datasets
-    filtered by a pattern on the dataset name. The pattern is sql:
-    % is wildcard."""
-    return datasets_type_pattern_str % (quote(type), quote(pattern))
+def dataset_samples (host, dataset):
+    return json.loads(post(host, dataset_samples_str % (quote(dataset))))
 
+def dataset_probe_values (host, dataset, samples, probes):
+    return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples))))
+
+def dataset_type (host, dataset):
+    return json.loads(post(host, dataset_type_str % (quote(dataset))))
 
 def strip_first_url_dir(path):
     return re.sub(r'^[^/]*', '', path)