Mercurial > repos > melissacline > ucsc_cancer_utilities

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mergeMutationDatasets.xml	Fri Mar 20 16:38:46 2015 -0700
@@ -0,0 +1,31 @@
+<tool id="mergeMutationDatasets" description="Merge two Xena positional mutation datasets into a new dataset" name="Merge Positional Mutation Data" version="0.0.1">
+  <description>
+    Given two mutation datasets, merge them to create a larger dataset with the mutations from both datasets. Output this larger dataset, along with a 2-column matrix indicating the source of each mutation
+  </description>
+  <command interpreter="python">
+      mergeXenaMutation.py $outputC $outputSourceMatrix $errorLog  $inputA $inputB
+      #if $labelForDatasetA
+          --aLabel "${labelForDatasetA}"
+      #end if
+      #if $labelForDatasetB
+          --bLabel "${labelForDatasetB}"
+      #end if
+  </command>
+  <inputs>
+    <param name="inputA" format="tabular" type="data" label="Mutation Dataset A"/>
+    <param type="text" name="labelForDatasetA"  label="Dataset A Label (optional)" optional="true"/>
+    <param name="inputB" format="tabular" type="data" label="Mutation Dataset B"/>
+    <param type="text" name="labelForDatasetB"  label="Dataset B Label (optional)" optional="true"/>
+ </inputs>
+  <outputs>
+    <data name="errorLog" format="data" label="Execution Log"/>
+    <data name="outputSourceMatrix" format="tabular" label="Mutation Data Sources"/>
+    <data name="outputC" format="tabular" label="Merged Mutation Data"/>
+  </outputs>
+  <help>
+    ***Merge Xena Positional Mutation Datasets***
+
+    Given two datasets of mutation data as formatted for the UCSC Xena Browser, merge them to produce a third dataset that is the union of the first two.  The new dataset will contain all mutations from either dataset.
+
+    To maintain provenance, this script also outputs a second matrix, with one row for each sample ID that appears in the output dataset, and two columns per row indicating which input dataset(s) contained some mutation data for that sample.  By default, the input dataset name is used to indicate which input file each column came from.  Optionally, the user can specify descriptive labels to be used in place of the dataset names.   </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mergeXenaMutation.py	Fri Mar 20 16:38:46 2015 -0700
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+import argparse
+import string, os, sys
+
+requiredCOLs = ["chr", "start","end","reference","alt","gene","effect"]
+
+def headerError(filename, column, ferror):
+  ferror.write(filename +" does not have column " + column+"\n")
+  ferror.close()
+  sys.exit(1)
+
+def findAnyValueInList (values, dataList):
+  for value in values:
+    for i in range(0,len(dataList)):
+      if value == dataList[i]:
+        return i
+  return -1
+
+def header (infile, ferror):
+  fin= open(infile,'U')
+
+  columnDic ={}
+  #header
+  line = fin.readline()
+  fin.close()
+  if line [0]=="#":
+    line = line[1:-1]
+  data = string.split(line,"\t")
+
+  columnDic["chr"]= findAnyValueInList (["chr","chrom"], data)
+  if columnDic["chr"] ==-1:
+    headerError(infile, "chr", ferror)
+
+  columnDic["start"]= findAnyValueInList (["start","chrStart"], data)
+  if columnDic["start"] == -1:
+    headerError(infile, "start", ferror)
+
+  columnDic["end"]= findAnyValueInList (["end","chrEnd"], data)
+  if columnDic["end"] == -1:
+    headerError(infile, "end", ferror)
+
+  columnDic["alt"]= findAnyValueInList (["alt"], data)
+  if columnDic["alt"] == -1:
+    headerError(infile, "alt", ferror)
+
+  columnDic["reference"]= findAnyValueInList (["reference","ref"], data)
+  if columnDic["reference"] == -1:
+    headerError(infile, "reference", ferror)
+
+  columnDic["gene"]= findAnyValueInList (["gene"], data)
+  if columnDic["gene"] == -1:
+    headerError(infile, "gene", ferror)
+
+  columnDic["effect"]= findAnyValueInList (["effect"], data)
+  if columnDic["effect"] == -1:
+    headerError(infile, "effect", ferror)
+
+  requiredCols = columnDic.keys()
+  requiredColsPos = columnDic.values()
+  for i in range(1,len(data)):
+    if i not in requiredColsPos:
+      columnDic [data[i]]=i
+
+  return columnDic
+
+def summarizeColumns(inFiles, fileColumn, allCols, ferror):
+  for infile in inFiles:
+    columnDic = header (infile, ferror)
+    fileColumn [infile] = columnDic
+    for col in columnDic:
+      if col not in allCols:
+        allCols.append(col)
+  return
+
+def outputHeader (requiredCOLs,allCols,fout):
+  fout.write("#sample")
+  for col in requiredCOLs:
+    fout.write("\t"+col)
+  for col in allCols:
+    if col not in requiredCOLs:
+      fout.write("\t"+col)
+  fout.write("\n")
+  fout.close()
+  return
+
+def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout):
+  fin = open(infile,'U')
+  fin.readline()
+  while 1:
+    line = fin.readline()[:-1]
+    if line =="":
+      break
+    data = string.split(line,'\t')
+    fout.write(data[0])
+    for col in requiredCOLs:
+      pos = columnDic[col]
+      fout.write("\t"+ data[pos])
+    for col in allCols:
+      if col not in requiredCOLs:
+        if col in columnDic:
+          pos = columnDic[col]
+          fout.write("\t"+ data[pos])
+        else:
+          fout.write("\t")
+    fout.write("\n")
+  fin.close()
+  return
+
+def collectSource(inFile, label, sampleDic):
+  fin = open(inFile,'U')
+  fin.readline()
+  while 1:
+    line = fin.readline()[:-1]
+    if line =="":
+      break
+    sample = string.split(line,'\t')[0]
+    if sample not in sampleDic:
+      sampleDic[sample]=[]
+    if inFile not in sampleDic[sample]:
+      sampleDic[sample].append(label)
+  fin.close()
+  return
+
+def outputSampleDic (sampleDic, outPhenotypeFile):
+  fout = open(outPhenotypeFile,'w')
+  fout.write("sample\tsource\n")
+  for sample in sampleDic:
+    source = sampleDic[sample]
+    source.sort()
+    fout.write(sample+"\t"+string.join(source,", ")+"\n")
+  fout.close()
+  return
+
+if __name__ == '__main__' :
+  if len(sys.argv[:]) <6:
+    print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)"
+    print "this is merging data A+B=C for mutation by position type of data\n"
+    sys.exit(1)
+
+  #
+  # The input files to this script are two or more matrices, in which
+  # columns represent samples and rows represent genes or measurements.
+  # There are two output files: outMergedData contains the input data merged
+  # into a single matrix, and outSourceMatrix is a two-column matrix
+  # indicating which file each sample (or column label) came from.  This
+  # assumes that each sample came from at most one file.
+  #
+  parser = argparse.ArgumentParser()
+  parser.add_argument("outMergedData", type=str,
+                      help="Filename for the merged dataset")
+  parser.add_argument("outSourceMatrix", type=str,
+                      help="""Filename for a Nx2 matrix that indicates
+                                the source file of each column""")
+  parser.add_argument("errorLog", type=str,
+                      help="""Error log""")
+  parser.add_argument("inFileA", type=str, help="First input file")
+  parser.add_argument("inFileB", type=str, help="Second input file")
+  parser.add_argument("--aLabel", type=str, default=None,
+                      help="User-friendly label for the first input file")
+  parser.add_argument("--bLabel", type=str, default=None,
+                      help="User-friendly label for the second input file")
+  args = parser.parse_args()
+
+
+  #inFiles = sys.argv[4:]
+  inFiles = list()
+  inFiles.append(args.inFileA)
+  inFiles.append(args.inFileB)
+  errofile = args.errorLog
+  outfile = args.outMergedData
+  #print outfile
+  outPhenotypeFile = args.outSourceMatrix
+  #print outPhenotypeFile
+
+  ferror = open(errofile,'w')
+
+  #get all the columns, build fileColumn dictionary
+  fileColumn={}
+  allCols =[]
+  summarizeColumns(inFiles, fileColumn, allCols, ferror)
+  ferror.close()
+
+  #output header line
+  fout = open(outfile,'w')
+  outputHeader (requiredCOLs,allCols,fout)
+
+  #process and output combined mutationXena file
+  fout = open(outfile,'a')
+
+  columnDic = fileColumn[args.inFileA]
+  processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout)
+  columnDic = fileColumn[args.inFileB]
+  processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout)
+  fout.close()
+
+  #collect sample from source information
+  sampleDic ={}
+  if args.aLabel is None:
+    collectSource(args.inFileA, args.inFileA, sampleDic)
+  else:
+    collectSource(args.inFileA, args.aLabel, sampleDic)
+  if args.bLabel is None:
+    collectSource(args.inFileB, args.inFileB, sampleDic)
+  else:
+    collectSource(args.inFileB, args.bLabel, sampleDic)
+
+
+  #output sample source information as phenotype matrix
+  outputSampleDic (sampleDic, outPhenotypeFile)
+
+
+
+