changeset 41:02b0824c7d60

Download data from any hub in the federated xena platform
author jingchunzhu <jingchunzhu@gmail.com>
date Mon, 27 Jul 2015 10:05:22 -0700
parents fd24e220f240
children bc9784300015
files getXenaData.py xenaGetDataset.py xenaGetDataset.xml xena_import.xml xena_query.py
diffstat 5 files changed, 116 insertions(+), 59 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getXenaData.py	Mon Jul 27 10:05:22 2015 -0700
@@ -0,0 +1,44 @@
+# getXenaData.py
+import os, sys, string, json, csv
+import xena_query as xena
+
+if len(sys.argv[:])!=4:
+  print "python getXenaData.py hub datasetId outputfile\n"
+  sys.exit(1)
+
+url = sys.argv[1]
+dataset = sys.argv[2]
+output = sys.argv[3]
+
+if string.find(url,"galaxyxena") !=-1 and string.find(url,"ucsc.edu")!=-1:
+  url = "https://galaxyxena.soe.ucsc.edu:443/xena"
+
+samples = xena.dataset_samples (url, dataset)
+if not samples:
+  print "dataset does not exists"
+  sys.exit(1)
+
+type = xena.dataset_type(url, dataset)
+if type[0] not in ["genomicMatrix", "clinicalMatrix"]:
+  print "the current data type is not supported"
+  sys.exit(1)
+
+fout = open(output,'w')
+writer = csv.writer(fout, delimiter='\t')
+writer.writerow(["sample"]+samples)
+
+probes = xena.dataset_field(url, dataset)
+start=0
+size =100
+N= len(probes)
+for i in range (start, N,size):
+  results = xena.dataset_probe_values (url, dataset, samples, probes[i:i+size])
+  print "..."
+  for j in range (0, size):
+    if i+j == N:
+      break
+    writer.writerow([probes[i+j]]+results[j])
+
+fout.close()
+print "done"
+sys.exit(0)
--- a/xenaGetDataset.py	Mon Jul 27 00:59:02 2015 -0700
+++ b/xenaGetDataset.py	Mon Jul 27 10:05:22 2015 -0700
@@ -9,7 +9,7 @@
     parser = argparse.ArgumentParser()
     parser.add_argument("dataHub", type=str)
     parser.add_argument("datasetId", type=str)
-    parser.add_argument("metadatafile", type=str)
+    #parser.add_argument("metadatafile", type=str)
     parser.add_argument("datafile", type=str)
     args = parser.parse_args()
 
@@ -17,11 +17,13 @@
     datasetIdTokens = re.split("/", args.datasetId)
     datasetUrl = datasetUrlHost + "/" + "/".join(datasetIdTokens[1:])
     print datasetUrl
+    """
     metadataUrl = datasetUrl + ".json"
     mm = urllib2.urlopen(metadataUrl)
     with open(args.metadatafile, "w") as metadata:
         metadata.write(mm.read())
     mm.close()
+    """
     dd = urllib2.urlopen(datasetUrl)
     with open(args.datafile, "w") as data:
         data.write(dd.read())
--- a/xenaGetDataset.xml	Mon Jul 27 00:59:02 2015 -0700
+++ b/xenaGetDataset.xml	Mon Jul 27 10:05:22 2015 -0700
@@ -1,24 +1,37 @@
 <tool id="xenaGetDataset"  name="Get Data from Xena" version="0.0.2">
   <description>
-    Get a specified dataset and its associated metadata from the federated Xena platfrom
+    Download an individual dataset from the federated Xena platform
   </description>
   <requirements>
     <requirement type="package" version="1.0">installXena</requirement>
   </requirements>
   <command interpreter="python">
-    xenaGetDataset.py $dataHub $dataset $metadataFile $dataFile
+    #if $hub.dataHub == "https://genome-cancer.ucsc.edu/proj/public/xena":
+        xenaGetDataset.py $hub.dataHub $dataset $dataFile
+    #elif $hub.customDataHub:
+        getXenaData.py $hub.customDataHub $dataset $dataFile
+    #else
+        getXenaData.py $hub.dataHub $dataset $dataFile
+    #end if
   </command>
   <inputs>
-    <param type="select" name="dataHub" label="Data Hub" optional="false">
-      <option value="https://genome-cancer.ucsc.edu/proj/public/xena"/>
-    </param>
+    <conditional name="hub">
+      <param type="select" name="dataHub" label="Data Hub" optional="false">
+	<option value="https://genome-cancer.ucsc.edu/proj/public/xena"/>
+	<option value="datahub">Enter a different hub</option>
+      </param>
+      <when value ="https://genome-cancer.ucsc.edu/proj/public/xena"/>
+      <when value ="datahub">
+	<param type="text" name ="customDataHub" label="Hub url" optional="false"/>
+      </when>
+    </conditional>  
     <param type="text" name="dataset" label="Dataset ID" optional="false"/>
   </inputs>
   <outputs>
-    <data format="txt" name="metadataFile" label="${dataset}.json"/>
+    <!-- <data format="txt" name="metadataFile" label="${dataset}.json"/> -->
     <data format="tabular" name="dataFile" label="${dataset}" />
   </outputs>
   <help>
-    Given the data hub name and the dataset id, download the data into a Galaxy dataset. Xena dataset id can be obtained through the Explore Data in Xena tool.  The accompanying metadata will be downloaded automatically along with the data, and will be stored in a second Galaxy dataset.
+    Given the data hub name and the dataset id, download the dataset into this Galaxy. Xena dataset id can be obtained through the Explore Data in Xena tool.
   </help>
 </tool>
--- a/xena_import.xml	Mon Jul 27 00:59:02 2015 -0700
+++ b/xena_import.xml	Mon Jul 27 10:05:22 2015 -0700
@@ -1,5 +1,5 @@
 <tool id="xenaImport" name="XENA Import" version="0.0.1">
-  <description>Import a Galaxy dataset into the Xena Server running on this galaxy instance</description>
+  <description>Load a Galaxy dataset into the Xena Server running on this galaxy instance</description>
   <requirements>
     <requirement type="package" version="1.0">installXena</requirement>
   </requirements>
--- a/xena_query.py	Mon Jul 27 00:59:02 2015 -0700
+++ b/xena_query.py	Mon Jul 27 10:05:22 2015 -0700
@@ -36,7 +36,7 @@
 {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']}
 """
 
-import urllib2
+import urllib2, json
 import re
 
 def compose1(f, g):
@@ -64,34 +64,6 @@
 # found enough values, rather than continuing to the end. We can do this because
 # enumerated values are unique. An alternative would be to index all the enumerated
 # values in the db.
-sample_query_str = """
-(let [cohort %s
-      field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]]
-                                    :from [:dataset]
-                                    :join [:field [:= :dataset_id :dataset.id]]
-                                    :where [:and [:= :cohort cohort]
-                                                 [:= :field.name %s]]}))
-      values %s
-      field_id (:field_id field_id-dataset)
-      dataset (:dataset field_id-dataset)
-      sample (:id (car (query {:select [:field.id]
-                               :from [:field]
-                               :where [:and [:= :dataset_id dataset]
-                                            [:= :field.name "sampleID"]]})))
-      N (- (:rows (car (query {:select [:rows]
-                               :from [:dataset]
-                               :where [:= :id dataset]}))) 1)]
-  {cohort (map :value (query {:select [:value]
-                              :from [{:select [:x #sql/call [:unpack field_id, :x]]
-                                      :from [#sql/call [:system_range 0 N]]
-                                      :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering]
-                                                                                             :from [:code]
-                                                                                             :where [:and [:= :field_id field_id]
-                                                                                                          [:in :value values]]
-                                                                                             :limit (count values)}]}]
-                              :join [:code [:and [:= :field_id sample]
-                                                 [:= :ordering #sql/call [:unpack sample :x]]]]}))})
-"""
 
 cohort_query_str = """
 (map :cohort (query {:select [:%distinct.cohort]
@@ -99,19 +71,42 @@
                      :where [:not [:is nil :cohort]]}))
 """
 
-datasets_list_in_cohort_query = """
-(map :text (query {:select [:text]
+# Query template: datasets whose cohort equals the %s placeholder. Several
+# columns are selected, but only :name is mapped into the result.
+datasets_list_in_cohort_str ="""
+(map :name (query {:select [:name :type :datasubtype :probemap :text :status]
+      :from [:dataset]
+      :where [:= :cohort %s]}))
+"""
+
+# Query template: the :type column of the dataset whose name equals the %s
+# placeholder.
+dataset_type_str = """
+(map :type (query {:select [:type]
                    :from [:dataset]
-                   :where [:= :cohort %s ]})
+                   :where [:= :name %s]}))
 """
 
-datasets_type_pattern_str = """
-(map :name (query {:select [:name]
-                   :from [:dataset]
-                   :where [:and [:= :type %s]
-                                [:like :name %s]]}))
+# Query template: names of the fields joined to the dataset whose name equals
+# the %s placeholder.
+dataset_field_str = """
+(map :name (query {:select [:field.name]
+             :from [:dataset]
+             :join [:field [:= :dataset.id :dataset_id]]
+             :where [:= :dataset.name %s]}))
 """
 
+# Query template: the :value entries reached by joining dataset -> field ->
+# code, restricted to the "sampleID" field of the dataset named by the %s
+# placeholder.
+dataset_samples_str = """
+(map :value (query {:select [:value]
+            :from [:dataset]
+            :join [:field [:= :dataset.id :dataset_id]
+            :code [:= :field.id :field_id]]
+            :where [:and
+            [:= :dataset.name %s]
+            [:= :field.name "sampleID"]]}))
+"""
+
+# Fetch request template with three %s placeholders, in order: :table,
+# :columns and :samples.
+dataset_probe_str = """
+    (fetch [{:table %s
+          :columns %s
+          :samples %s}])
+"""
+
+
 def find_sample_by_field_query(cohort, field, values):
     """Return a xena query which looks up sample ids for the given field=values."""
+    # NOTE(review): this changeset deletes the sample_query_str definition, so
+    # unless it is still defined elsewhere in the file this line raises
+    # NameError when called -- confirm, then restore the template or drop
+    # this function.
     return sample_query_str % (quote(cohort), quote(field), array_fmt(values))
@@ -129,24 +124,27 @@
     result = response.read()
     return result
 
-def find_cohorts():
+def find_cohorts(url):
     """ Return a list of cohorts on a host at a specific url """
     """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """
-    return cohort_query_str
+    # Post the cohort query to the hub at url and decode its JSON reply.
+    return json.loads(post(url,cohort_query_str))
+
+def dataset_field(host, dataset):
+    """return probes or features of a dataset"""
+    return json.loads(post(host, dataset_field_str % (quote(dataset))))
 
-def find_datasets_in_cohort(url, cohort):
-    """ Return a list of datasets in a specific cohort on server=url.
-    Each dataset is a dictionary of the data's metadata.
-    This should be refactored to be consistent with the other methods."""
-    return map(json.loads,
-            json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort)))))
+def datasets_list_in_cohort (host, cohort):
+    """return datasets in a cohort"""
+    return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort))))
 
-def find_datasets_type_pattern(type, pattern):
-    """Return a xena query which returns a list of datasets
-    filtered by a pattern on the dataset name. The pattern is sql:
-    % is wildcard."""
-    return datasets_type_pattern_str % (quote(type), quote(pattern))
+def dataset_samples (host, dataset):
+    return json.loads(post(host, dataset_samples_str % (quote(dataset))))
 
+def dataset_probe_values (host, dataset, samples, probes):
+    return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples))))
+
+def dataset_type (host, dataset):
+    return json.loads(post(host, dataset_type_str % (quote(dataset))))
 
 def strip_first_url_dir(path):
     return re.sub(r'^[^/]*', '', path)