# HG changeset patch
# User melissacline
# Date 1410313733 25200
# Node ID 75c7d80df9c1b63c591b76e7e6d69fa8a732bdc6
# Parent 16c3fad9bac58bb19767d15c9c211197ced3e1c6
Adding the xena_query python API to the install bundle
diff -r 16c3fad9bac5 -r 75c7d80df9c1 tool_dependencies.xml
--- a/tool_dependencies.xml Mon Sep 08 13:08:04 2014 -0700
+++ b/tool_dependencies.xml Tue Sep 09 18:48:53 2014 -0700
@@ -9,10 +9,17 @@
$INSTALL_DIR/xena
+
+ $INSTALL_DIR
+
${INSTALL_DIR}
+
+
+ ${INSTALL_DIR}
+
diff -r 16c3fad9bac5 -r 75c7d80df9c1 xena_query.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xena_query.py Tue Sep 09 18:48:53 2014 -0700
@@ -0,0 +1,157 @@
+"""
+Utilities for xena queries.
+
+A basic query example.
+Queries are scheme expressions.
+
+>>> import xena_query as xena
+>>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", "(+ 1 2)")
+'3.0'
+
+>>> xena.post("https://genome-cancer.ucsc.edu/proj/public/xena", "(let [x 2 y (+ x 3)] (* x y))")
+'10.0'
+
+Looking up sample ids for the TCGA LGG cohort.
+
+>>> r = xena.post("https://genome-cancer.ucsc.edu/proj/public/xena",
+ xena.patient_to_sample_query("TCGA.LGG.sampleMap",
+ ["TCGA-CS-4938",
+ "TCGA-HT-7693",
+ "TCGA-CS-6665",
+ "TCGA-S9-A7J2",
+ "TCGA-FG-A6J3"]))
+'{"TCGA.LGG.sampleMap":["TCGA-CS-4938-01","TCGA-CS-6665-01","TCGA-FG-A6J3-01","TCGA-HT-7693-01","TCGA-S9-A7J2-01"]}'
+
+>>> r = xena.post("https://genome-cancer.ucsc.edu/proj/public/xena",
+ xena.find_sample_by_field_query("TCGA.LGG.sampleMap",
+ "_PATIENT",
+ ["TCGA-CS-4938",
+ "TCGA-HT-7693",
+ "TCGA-CS-6665",
+ "TCGA-S9-A7J2",
+ "TCGA-FG-A6J3"]))
+'{"TCGA.LGG.sampleMap":["TCGA-CS-4938-01","TCGA-CS-6665-01","TCGA-FG-A6J3-01","TCGA-HT-7693-01","TCGA-S9-A7J2-01"]}'
+>>> import json
+>>> json.loads(r)
+{u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']}
+"""
+
+import urllib2
+import re
+
+def compose1(f, g):
+ def composed(*args, **kwargs):
+ return f(g(*args, **kwargs))
+ return composed
+
+# funcitonal composition, e.g.
+# compose(f, g)(a, ...) == f(g(a, ...))
+compose = lambda *funcs: reduce(compose1, funcs)
+
+def quote(s):
+ return '"' + s + '"'
+
+def array_fmt(l):
+ return '[' + ', '.join((quote(s) for s in l)) + ']'
+
+# The strategy here is
+# o Do table scan on code to find codes matching field values
+# o Do IN query on unpack(field, x) to find rows matching codes
+# o Project to unpack(sample, x) to get sampleID code
+# o Join with code to get sampleID values
+#
+# Note the :limit on the table scan. This makes the table scan exit after we've
+# found enough values, rather than continuing to the end. We can do this because
+# enumerated values are unique. An alternative would be to index all the enumerated
+# values in the db.
+sample_query_str = """
+(let [cohort %s
+ field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]]
+ :from [:dataset]
+ :join [:field [:= :dataset_id :dataset.id]]
+ :where [:and [:= :cohort cohort]
+ [:= :field.name %s]]}))
+ values %s
+ field_id (:field_id field_id-dataset)
+ dataset (:dataset field_id-dataset)
+ sample (:id (car (query {:select [:field.id]
+ :from [:field]
+ :where [:and [:= :dataset_id dataset]
+ [:= :field.name "sampleID"]]})))
+ N (- (:rows (car (query {:select [:rows]
+ :from [:dataset]
+ :where [:= :id dataset]}))) 1)]
+ {cohort (map :value (query {:select [:value]
+ :from [{:select [:x #sql/call [:unpack field_id, :x]]
+ :from [#sql/call [:system_range 0 N]]
+ :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering]
+ :from [:code]
+ :where [:and [:= :field_id field_id]
+ [:in :value values]]
+ :limit (count values)}]}]
+ :join [:code [:and [:= :field_id sample]
+ [:= :ordering #sql/call [:unpack sample :x]]]]}))})
+"""
+
+cohort_query_str = """
+(map :cohort (query {:select [:%distinct.cohort]
+ :from [:dataset]
+ :where [:not [:is nil :cohort]]}))
+"""
+
+datasets_list_in_cohort_query = """
+(map :text (query {:select [:text]
+ :from [:dataset]
+ :where [:= :cohort %s ]})
+"""
+
+datasets_type_pattern_str = """
+(map :name (query {:select [:name]
+ :from [:dataset]
+ :where [:and [:= :type %s]
+ [:like :name %s]]}))
+"""
+
+def find_sample_by_field_query(cohort, field, values):
+ """Return a xena query which looks up sample ids for the given field=values."""
+ return sample_query_str % (quote(cohort), quote(field), array_fmt(values))
+
+def patient_to_sample_query(cohort, patients):
+ """Return a xena query which looks up sample ids for the given patients."""
+ return find_sample_by_field_query(cohort, "_PATIENT", patients)
+
+headers = { 'Content-Type' : "text/plain" }
+
+def post(url, query):
+ """POST a xena data query to the given url."""
+ req = urllib2.Request(url + '/data/', query, headers)
+ response = urllib2.urlopen(req)
+ result = response.read()
+ return result
+
+def find_cohorts():
+ """ Return a list of cohorts on a host at a specific url """
+ """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """
+ return cohort_query_str
+
+def find_datasets_in_cohort(url, cohort):
+ """ Return a list of datasets in a specific cohort on server=url.
+ Each dataset is a dictionary of the data's metadata.
+ This should be refactored to be consistent with the other methods."""
+ return map(json.loads,
+ json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort)))))
+
+def find_datasets_type_pattern(type, pattern):
+ """Return a xena query which returns a list of datasets
+ filtered by a pattern on the dataset name. The pattern is sql:
+ % is wildcard."""
+ return datasets_type_pattern_str % (quote(type), quote(pattern))
+
+
+def strip_first_url_dir(path):
+ return re.sub(r'^[^/]*', '', path)
+
+# proj//xena//
+# download//xena/
+def name_to_url(base_url, name):
+ return base_url.replace('/proj/', '/download/') + strip_first_url_dir(name)