# HG changeset patch # User jingchunzhu # Date 1438931593 25200 # Node ID eb5acf81e6096b4b570ea3ad8b47c23e3f281432 # Parent 03b7b1cf78ce416a9af855919c8422c8e2e8dd89 improve messages diff -r 03b7b1cf78ce -r eb5acf81e609 mergeGenomicFiles.xml --- a/mergeGenomicFiles.xml Thu Aug 06 00:30:49 2015 -0700 +++ b/mergeGenomicFiles.xml Fri Aug 07 00:13:13 2015 -0700 @@ -17,9 +17,9 @@ - + - + @@ -28,6 +28,11 @@ ***Merge Genomic Datasets*** + Output Genomic Matrix is of format Rows (Identifiers) by Columns (Samples), ready to be imported into a Xena Hub. + + Output Data Source is of format Rows (Samples) by Columns (Identifiers), ready to be imported into a Xena Hub. + + Given two genomic datasets, merge them to produce a third dataset that is the union of the first two. The new dataset will contain all column labels from either dataset, and all row labels from either dataset. If a row label appears in both datasets, the output dataset will contain, for that row, all values for the first set of columns, plus all values for the second set of columns. If a row label appears in the first dataset only, the output dataset will contain the values for the columns of the first dataset, and blanks (indicating missing values) for the columns of the second dataset. To maintain provenance, this script also outputs a second matrix, with one row for each column in the output dataset, and two columns per row indicating which input dataset that column came from. By default, the input dataset name is used to indicate which input file each column came from. Optionally, the user can specify descriptive labels to be used in place of the filenames. This all assumes that each column exists in only one input dataset. 
diff -r 03b7b1cf78ce -r eb5acf81e609 mergeGenomicMatrixFiles.py --- a/mergeGenomicMatrixFiles.py Thu Aug 06 00:30:49 2015 -0700 +++ b/mergeGenomicMatrixFiles.py Fri Aug 07 00:13:13 2015 -0700 @@ -1,7 +1,7 @@ #!/usr/bin/env python import argparse -import string,os,sys +import string,os,sys,json def header (samples, sourceFiles, infile, labelThisFile): if labelThisFile == None: @@ -81,6 +81,13 @@ fout.close() return +def outputMergedMatrixJson(output): + fout = open(output,'w') + j={} + j["type"]="genomicMatrix" + json.dump(j, fout) + fout.close() + if __name__ == '__main__' : # # The input files to this script are two or more matrices, in which @@ -117,3 +124,4 @@ outputSourceMatrix(sourceFiles, args.outSourceMatrix) outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData) + diff -r 03b7b1cf78ce -r eb5acf81e609 mergeMutationDatasets.xml --- a/mergeMutationDatasets.xml Thu Aug 06 00:30:49 2015 -0700 +++ b/mergeMutationDatasets.xml Fri Aug 07 00:13:13 2015 -0700 @@ -14,9 +14,9 @@ - + - + @@ -26,6 +26,10 @@ ***Merge Xena Positional Mutation Datasets*** + Output Mutation by Position datafile is ready to be imported into a Xena Hub. + + Output Data Source is of format Rows (Samples) by Columns (identifiers), ready to be imported into a Xena Hub. + Given two datasets of mutation data as formatted for the UCSC Xena Browser, merge them to produce a third dataset that is the union of the first two. The new dataset will contain all mutations from either dataset. To maintain provenance, this script also outputs a second matrix, with one row for each sample ID that appears in the output dataset, and two columns per row indicating which input dataset(s) contained some mutation data for that sample. By default, the input dataset name is used to indicate which input file each column came from. Optionally, the user can specify descriptive labels to be used in place of the dataset names.