view mergeXenaMutation.py @ 55:1093078e7976

merge mutation data conform to new mutationVector data standard
author jingchunzhu
date Fri, 18 Sep 2015 10:24:39 -0700
parents 9806198df91f
children
line wrap: on
line source

#!/usr/bin/env python

import argparse
import string, os, sys

requiredCOLs = ["chr", "start","end","reference","alt"]

def headerError(filename, column, ferror):
  ferror.write(filename +" does not have column " + column+"\n")
  ferror.close()
  sys.exit(1)

def findAnyValueInList (values, dataList):
  for value in values:
    for i in range(0,len(dataList)):
      if value == dataList[i]:
        return i
  return -1

def header (infile, ferror):
  fin= open(infile,'U')

  columnDic ={}
  #header
  line = fin.readline()
  fin.close()
  if line [0]=="#":
    line = line[1:-1]
  data = string.split(string.strip(line),"\t")

  columnDic["chr"]= findAnyValueInList (["chr","chrom", "Chr"], data)
  if columnDic["chr"] ==-1:
    headerError(infile, "chr", ferror)

  columnDic["start"]= findAnyValueInList (["start","chrStart","Start"], data)
  if columnDic["start"] == -1:
    headerError(infile, "start", ferror)

  columnDic["end"]= findAnyValueInList (["end","chrEnd", "End"], data)
  if columnDic["end"] == -1:
    headerError(infile, "end", ferror)

  columnDic["alt"]= findAnyValueInList (["alt","Alt"], data)
  if columnDic["alt"] == -1:
    headerError(infile, "alt", ferror)

  columnDic["reference"]= findAnyValueInList (["reference","ref","Reference","Ref"], data)
  if columnDic["reference"] == -1:
    headerError(infile, "reference", ferror)

  #columnDic["gene"]= findAnyValueInList (["gene","Gene"], data)
  #if columnDic["gene"] == -1:
  #  headerError(infile, "gene", ferror)

  #columnDic["effect"]= findAnyValueInList (["effect"], data)
  #if columnDic["effect"] == -1:
  #  headerError(infile, "effect", ferror)

  requiredCols = columnDic.keys()
  requiredColsPos = columnDic.values()
  for i in range(1,len(data)):
    if i not in requiredColsPos:
      columnDic [data[i]]=i

  return columnDic

def summarizeColumns(inFiles, fileColumn, allCols, ferror):
  for infile in inFiles:
    columnDic = header (infile, ferror)
    fileColumn [infile] = columnDic
    for col in columnDic:
      if col not in allCols:
        allCols.append(col)
  return

def outputHeader (requiredCOLs,allCols,fout):
  fout.write("sample")
  for col in requiredCOLs:
    fout.write("\t"+col)
  for col in allCols:
    if col not in requiredCOLs:
      fout.write("\t"+col)
  fout.write("\n")
  fout.close()
  return

def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout):
  fin = open(infile,'U')
  fin.readline()
  while 1:
    line = fin.readline()[:-1]
    if line =="":
      break
    data = string.split(line,'\t')
    fout.write(data[0])
    for col in requiredCOLs:
      pos = columnDic[col]
      fout.write("\t"+ data[pos])
    for col in allCols:
      if col not in requiredCOLs:
        if col in columnDic:
          pos = columnDic[col]
          fout.write("\t"+ data[pos])
        else:
          fout.write("\t")
    fout.write("\n")
  fin.close()
  return

def collectSource(inFile, label, sampleDic):
  fin = open(inFile,'U')
  fin.readline()
  while 1:
    line = fin.readline()[:-1]
    if line =="":
      break
    sample = string.split(line,'\t')[0]
    if sample not in sampleDic:
      sampleDic[sample]=[]
    if label not in sampleDic[sample]:
      sampleDic[sample].append(label)
  fin.close()
  return

def outputSampleDic (sampleDic, outPhenotypeFile):
  fout = open(outPhenotypeFile,'w')
  fout.write("sample\tsource\n")
  for sample in sampleDic:
    source = sampleDic[sample]
    source.sort()
    fout.write(sample+"\t"+string.join(source,", ")+"\n")
  fout.close()
  return

if __name__ == '__main__' :
  if len(sys.argv[:]) <6:
    print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)"
    print "this is merging data A+B=C for mutation by position type of data\n"
    sys.exit(1)

  #
  # The input files to this script are two or more matrices, in which
  # columns represent samples and rows represent genes or measurements.
  # There are two output files: outMergedData contains the input data merged
  # into a single matrix, and outSourceMatrix is a two-column matrix
  # indicating which file each sample (or column label) came from.  This
  # assumes that each sample came from at most one file.
  #
  parser = argparse.ArgumentParser()
  parser.add_argument("outMergedData", type=str,
                      help="Filename for the merged dataset")
  parser.add_argument("outSourceMatrix", type=str,
                      help="""Filename for a Nx2 matrix that indicates
                                the source file of each column""")
  parser.add_argument("errorLog", type=str,
                      help="""Error log""")
  parser.add_argument("inFileA", type=str, help="First input file")
  parser.add_argument("inFileB", type=str, help="Second input file")
  parser.add_argument("--aLabel", type=str, default=None,
                      help="User-friendly label for the first input file")
  parser.add_argument("--bLabel", type=str, default=None,
                      help="User-friendly label for the second input file")
  args = parser.parse_args()

                                    
  #inFiles = sys.argv[4:]
  inFiles = list()
  inFiles.append(args.inFileA)
  inFiles.append(args.inFileB)
  errofile = args.errorLog
  outfile = args.outMergedData
  #print outfile
  outPhenotypeFile = args.outSourceMatrix
  #print outPhenotypeFile
    
  ferror = open(errofile,'w')
    
  #get all the columns, build fileColumn dictionary
  fileColumn={}
  allCols =[]
  summarizeColumns(inFiles, fileColumn, allCols, ferror)
  ferror.close()

  #output header line
  fout = open(outfile,'w')
  outputHeader (requiredCOLs,allCols,fout)
  
  #process and output combined mutationXena file
  fout = open(outfile,'a')

  columnDic = fileColumn[args.inFileA]
  processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout)
  columnDic = fileColumn[args.inFileB]
  processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout)
  fout.close()
  
  #collect sample from source information
  sampleDic ={}
  if args.aLabel is None:
    collectSource(args.inFileA, args.inFileA, sampleDic)
  else:
    collectSource(args.inFileA, args.aLabel, sampleDic)
  if args.bLabel is None:
    collectSource(args.inFileB, args.inFileB, sampleDic)
  else:
    collectSource(args.inFileB, args.bLabel, sampleDic)


  #output sample source information as phenotype matrix
  outputSampleDic (sampleDic, outPhenotypeFile)