# HG changeset patch # User jingchunzhu # Date 1438016722 25200 # Node ID 02b0824c7d60af1d90a8f5e5b0ac119ae9e136cc # Parent fd24e220f240ef1b3fa7ac3f6e53e610e53079eb Download data from any hub in the federated xena platform diff -r fd24e220f240 -r 02b0824c7d60 getXenaData.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getXenaData.py Mon Jul 27 10:05:22 2015 -0700 @@ -0,0 +1,44 @@ +# getXenaData.py +import os, sys, string, json, csv +import xena_query as xena + +if len(sys.argv[:])!=4: + print "python getXenaData.py hub datasetId outputfile\n" + sys.exit(1) + +url = sys.argv[1] +dataset = sys.argv[2] +output = sys.argv[3] + +if string.find(url,"galaxyxena") !=-1 and string.find(url,"ucsc.edu")!=-1: + url = "https://galaxyxena.soe.ucsc.edu:443/xena" + +samples = xena.dataset_samples (url, dataset) +if not samples: + print "dataset does not exist" + sys.exit(1) + +type = xena.dataset_type(url, dataset) +if type[0] not in ["genomicMatrix", "clinicalMatrix"]: + print "the current data type is not supported" + sys.exit(1) + +fout = open(output,'w') +writer = csv.writer(fout, delimiter='\t') +writer.writerow(["sample"]+samples) + +probes = xena.dataset_field(url, dataset) +start=0 +size =100 +N= len(probes) +for i in range (start, N,size): + results = xena.dataset_probe_values (url, dataset, samples, probes[i:i+size]) + print "..." 
+ for j in range (0, size): + if i+j == N: + break + writer.writerow([probes[i+j]]+results[j]) + +fout.close() +print "done" +sys.exit(0) diff -r fd24e220f240 -r 02b0824c7d60 xenaGetDataset.py --- a/xenaGetDataset.py Mon Jul 27 00:59:02 2015 -0700 +++ b/xenaGetDataset.py Mon Jul 27 10:05:22 2015 -0700 @@ -9,7 +9,7 @@ parser = argparse.ArgumentParser() parser.add_argument("dataHub", type=str) parser.add_argument("datasetId", type=str) - parser.add_argument("metadatafile", type=str) + #parser.add_argument("metadatafile", type=str) parser.add_argument("datafile", type=str) args = parser.parse_args() @@ -17,11 +17,13 @@ datasetIdTokens = re.split("/", args.datasetId) datasetUrl = datasetUrlHost + "/" + "/".join(datasetIdTokens[1:]) print datasetUrl + """ metadataUrl = datasetUrl + ".json" mm = urllib2.urlopen(metadataUrl) with open(args.metadatafile, "w") as metadata: metadata.write(mm.read()) mm.close() + """ dd = urllib2.urlopen(datasetUrl) with open(args.datafile, "w") as data: data.write(dd.read()) diff -r fd24e220f240 -r 02b0824c7d60 xenaGetDataset.xml --- a/xenaGetDataset.xml Mon Jul 27 00:59:02 2015 -0700 +++ b/xenaGetDataset.xml Mon Jul 27 10:05:22 2015 -0700 @@ -1,24 +1,37 @@ - Get a specified dataset and its associated metadata from the federated Xena platfrom + Download individual dataset from the federated Xena platform installXena - xenaGetDataset.py $dataHub $dataset $metadataFile $dataFile + #if $hub.dataHub == "https://genome-cancer.ucsc.edu/proj/public/xena": + xenaGetDataset.py $hub.dataHub $dataset $dataFile + #elif $hub.customDataHub: + getXenaData.py $hub.customDataHub $dataset $dataFile + #else + getXenaData.py $hub.dataHub $dataset $dataFile + #end if - - - + - Given the data hub name and the dataset id, download the data into a Galaxy dataset. Xena dataset id can be obtained through the Explore Data in Xena tool. The accompanying metadata will be downloaded automatically along with the data, and will be stored in a second Galaxy dataset. 
+ Given the data hub name and the dataset id, download the dataset into this Galaxy. Xena dataset id can be obtained through the Explore Data in Xena tool. diff -r fd24e220f240 -r 02b0824c7d60 xena_import.xml --- a/xena_import.xml Mon Jul 27 00:59:02 2015 -0700 +++ b/xena_import.xml Mon Jul 27 10:05:22 2015 -0700 @@ -1,5 +1,5 @@ - Import a Galaxy dataset into the Xena Server running on this galaxy instance + Load a Galaxy dataset into the Xena Server running on this galaxy instance installXena diff -r fd24e220f240 -r 02b0824c7d60 xena_query.py --- a/xena_query.py Mon Jul 27 00:59:02 2015 -0700 +++ b/xena_query.py Mon Jul 27 10:05:22 2015 -0700 @@ -36,7 +36,7 @@ {u'TCGA.LGG.sampleMap': [u'TCGA-CS-4938-01', u'TCGA-CS-6665-01', u'TCGA-FG-A6J3-01', u'TCGA-HT-7693-01', u'TCGA-S9-A7J2-01']} """ -import urllib2 +import urllib2, json import re def compose1(f, g): @@ -64,34 +64,6 @@ # found enough values, rather than continuing to the end. We can do this because # enumerated values are unique. An alternative would be to index all the enumerated # values in the db. 
-sample_query_str = """ -(let [cohort %s - field_id-dataset (car (query {:select [[:field.id :field_id] [:dataset.id :dataset]] - :from [:dataset] - :join [:field [:= :dataset_id :dataset.id]] - :where [:and [:= :cohort cohort] - [:= :field.name %s]]})) - values %s - field_id (:field_id field_id-dataset) - dataset (:dataset field_id-dataset) - sample (:id (car (query {:select [:field.id] - :from [:field] - :where [:and [:= :dataset_id dataset] - [:= :field.name "sampleID"]]}))) - N (- (:rows (car (query {:select [:rows] - :from [:dataset] - :where [:= :id dataset]}))) 1)] - {cohort (map :value (query {:select [:value] - :from [{:select [:x #sql/call [:unpack field_id, :x]] - :from [#sql/call [:system_range 0 N]] - :where [:in #sql/call [:unpack field_id, :x] {:select [:ordering] - :from [:code] - :where [:and [:= :field_id field_id] - [:in :value values]] - :limit (count values)}]}] - :join [:code [:and [:= :field_id sample] - [:= :ordering #sql/call [:unpack sample :x]]]]}))}) -""" cohort_query_str = """ (map :cohort (query {:select [:%distinct.cohort] @@ -99,19 +71,42 @@ :where [:not [:is nil :cohort]]})) """ -datasets_list_in_cohort_query = """ -(map :text (query {:select [:text] +datasets_list_in_cohort_str =""" +(map :name (query {:select [:name :type :datasubtype :probemap :text :status] + :from [:dataset] + :where [:= :cohort %s]})) +""" + +dataset_type_str = """ +(map :type (query {:select [:type] :from [:dataset] - :where [:= :cohort %s ]}) + :where [:= :name %s]})) """ -datasets_type_pattern_str = """ -(map :name (query {:select [:name] - :from [:dataset] - :where [:and [:= :type %s] - [:like :name %s]]})) +dataset_field_str = """ +(map :name (query {:select [:field.name] + :from [:dataset] + :join [:field [:= :dataset.id :dataset_id]] + :where [:= :dataset.name %s]})) """ +dataset_samples_str = """ +(map :value (query {:select [:value] + :from [:dataset] + :join [:field [:= :dataset.id :dataset_id] + :code [:= :field.id :field_id]] + :where [:and + [:= 
:dataset.name %s] + [:= :field.name "sampleID"]]})) +""" + +dataset_probe_str = """ + (fetch [{:table %s + :columns %s + :samples %s}]) +""" + + def find_sample_by_field_query(cohort, field, values): """Return a xena query which looks up sample ids for the given field=values.""" return sample_query_str % (quote(cohort), quote(field), array_fmt(values)) @@ -129,24 +124,27 @@ result = response.read() return result -def find_cohorts(): +def find_cohorts(url): """ Return a list of cohorts on a host at a specific url """ """ return example: ["chinSF2007_public","TCGA.BRCA.sampleMap","cohort3"] """ - return cohort_query_str + return json.loads(post(url,cohort_query_str)) + +def dataset_field(host, dataset): + """return probes or features of a dataset""" + return json.loads(post(host, dataset_field_str % (quote(dataset)))) -def find_datasets_in_cohort(url, cohort): - """ Return a list of datasets in a specific cohort on server=url. - Each dataset is a dictionary of the data's metadata. - This should be refactored to be consistent with the other methods.""" - return map(json.loads, - json.loads(post(url, datasets_list_in_cohort_query % (quote(cohort))))) +def datasets_list_in_cohort (host, cohort): + """return datasets in a cohort""" + return json.loads(post(host, datasets_list_in_cohort_str % (quote(cohort)))) -def find_datasets_type_pattern(type, pattern): - """Return a xena query which returns a list of datasets - filtered by a pattern on the dataset name. 
The pattern is sql: - % is wildcard.""" - return datasets_type_pattern_str % (quote(type), quote(pattern)) +def dataset_samples (host, dataset): + return json.loads(post(host, dataset_samples_str % (quote(dataset)))) +def dataset_probe_values (host, dataset, samples, probes): + return json.loads(post(host, dataset_probe_str % (quote(dataset), array_fmt(probes), array_fmt(samples)))) + +def dataset_type (host, dataset): + return json.loads(post(host, dataset_type_str % (quote(dataset)))) def strip_first_url_dir(path): return re.sub(r'^[^/]*', '', path)