view mergeXenaMutation.py @ 37:e81019e3ac99

Updated synapseGetDataset to look at the filename rather than the (no longer existant) content type field to determine if the data is in zip format
author melissacline
date Mon, 27 Jul 2015 16:29:24 -0700
parents 914bc8ee6222
children 9806198df91f
line wrap: on
line source

#!/usr/bin/env python

import argparse
import string, os, sys

requiredCOLs = ["chr", "start","end","reference","alt","gene","effect"]

def headerError(filename, column, ferror):
  ferror.write(filename +" does not have column " + column+"\n")
  ferror.close()
  sys.exit(1)

def findAnyValueInList (values, dataList):
  for value in values:
    for i in range(0,len(dataList)):
      if value == dataList[i]:
        return i
  return -1

def header (infile, ferror):
  fin= open(infile,'U')

  columnDic ={}
  #header
  line = fin.readline()
  fin.close()
  if line [0]=="#":
    line = line[1:-1]
  data = string.split(line,"\t")

  columnDic["chr"]= findAnyValueInList (["chr","chrom"], data)
  if columnDic["chr"] ==-1:
    headerError(infile, "chr", ferror)

  columnDic["start"]= findAnyValueInList (["start","chrStart"], data)
  if columnDic["start"] == -1:
    headerError(infile, "start", ferror)

  columnDic["end"]= findAnyValueInList (["end","chrEnd"], data)
  if columnDic["end"] == -1:
    headerError(infile, "end", ferror)

  columnDic["alt"]= findAnyValueInList (["alt"], data)
  if columnDic["alt"] == -1:
    headerError(infile, "alt", ferror)

  columnDic["reference"]= findAnyValueInList (["reference","ref"], data)
  if columnDic["reference"] == -1:
    headerError(infile, "reference", ferror)

  columnDic["gene"]= findAnyValueInList (["gene"], data)
  if columnDic["gene"] == -1:
    headerError(infile, "gene", ferror)

  columnDic["effect"]= findAnyValueInList (["effect"], data)
  if columnDic["effect"] == -1:
    headerError(infile, "effect", ferror)

  requiredCols = columnDic.keys()
  requiredColsPos = columnDic.values()
  for i in range(1,len(data)):
    if i not in requiredColsPos:
      columnDic [data[i]]=i

  return columnDic

def summarizeColumns(inFiles, fileColumn, allCols, ferror):
  for infile in inFiles:
    columnDic = header (infile, ferror)
    fileColumn [infile] = columnDic
    for col in columnDic:
      if col not in allCols:
        allCols.append(col)
  return

def outputHeader (requiredCOLs,allCols,fout):
  fout.write("#sample")
  for col in requiredCOLs:
    fout.write("\t"+col)
  for col in allCols:
    if col not in requiredCOLs:
      fout.write("\t"+col)
  fout.write("\n")
  fout.close()
  return

def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout):
  fin = open(infile,'U')
  fin.readline()
  while 1:
    line = fin.readline()[:-1]
    if line =="":
      break
    data = string.split(line,'\t')
    fout.write(data[0])
    for col in requiredCOLs:
      pos = columnDic[col]
      fout.write("\t"+ data[pos])
    for col in allCols:
      if col not in requiredCOLs:
        if col in columnDic:
          pos = columnDic[col]
          fout.write("\t"+ data[pos])
        else:
          fout.write("\t")
    fout.write("\n")
  fin.close()
  return

def collectSource(inFile, label, sampleDic):
  fin = open(inFile,'U')
  fin.readline()
  while 1:
    line = fin.readline()[:-1]
    if line =="":
      break
    sample = string.split(line,'\t')[0]
    if sample not in sampleDic:
      sampleDic[sample]=[]
    if inFile not in sampleDic[sample]:
      sampleDic[sample].append(label)
  fin.close()
  return

def outputSampleDic (sampleDic, outPhenotypeFile):
  fout = open(outPhenotypeFile,'w')
  fout.write("sample\tsource\n")
  for sample in sampleDic:
    source = sampleDic[sample]
    source.sort()
    fout.write(sample+"\t"+string.join(source,", ")+"\n")
  fout.close()
  return

if __name__ == '__main__' :
  if len(sys.argv[:]) <6:
    print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)"
    print "this is merging data A+B=C for mutation by position type of data\n"
    sys.exit(1)

  #
  # The input files to this script are two or more matrices, in which
  # columns represent samples and rows represent genes or measurements.
  # There are two output files: outMergedData contains the input data merged
  # into a single matrix, and outSourceMatrix is a two-column matrix
  # indicating which file each sample (or column label) came from.  This
  # assumes that each sample came from at most one file.
  #
  parser = argparse.ArgumentParser()
  parser.add_argument("outMergedData", type=str,
                      help="Filename for the merged dataset")
  parser.add_argument("outSourceMatrix", type=str,
                      help="""Filename for a Nx2 matrix that indicates
                                the source file of each column""")
  parser.add_argument("errorLog", type=str,
                      help="""Error log""")
  parser.add_argument("inFileA", type=str, help="First input file")
  parser.add_argument("inFileB", type=str, help="Second input file")
  parser.add_argument("--aLabel", type=str, default=None,
                      help="User-friendly label for the first input file")
  parser.add_argument("--bLabel", type=str, default=None,
                      help="User-friendly label for the second input file")
  args = parser.parse_args()

                                    
  #inFiles = sys.argv[4:]
  inFiles = list()
  inFiles.append(args.inFileA)
  inFiles.append(args.inFileB)
  errofile = args.errorLog
  outfile = args.outMergedData
  #print outfile
  outPhenotypeFile = args.outSourceMatrix
  #print outPhenotypeFile
    
  ferror = open(errofile,'w')
    
  #get all the columns, build fileColumn dictionary
  fileColumn={}
  allCols =[]
  summarizeColumns(inFiles, fileColumn, allCols, ferror)
  ferror.close()

  #output header line
  fout = open(outfile,'w')
  outputHeader (requiredCOLs,allCols,fout)
  
  #process and output combined mutationXena file
  fout = open(outfile,'a')

  columnDic = fileColumn[args.inFileA]
  processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout)
  columnDic = fileColumn[args.inFileB]
  processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout)
  fout.close()
  
  #collect sample from source information
  sampleDic ={}
  if args.aLabel is None:
    collectSource(args.inFileA, args.inFileA, sampleDic)
  else:
    collectSource(args.inFileA, args.aLabel, sampleDic)
  if args.bLabel is None:
    collectSource(args.inFileB, args.inFileB, sampleDic)
  else:
    collectSource(args.inFileB, args.bLabel, sampleDic)


  #output sample source information as phenotype matrix
  outputSampleDic (sampleDic, outPhenotypeFile)