Mercurial > repos > melissacline > ucsc_cancer_utilities
view mergeXenaMutation.py @ 47:23d98125d20c
parse snpEff output
author | jingchunzhu |
---|---|
date | Thu, 13 Aug 2015 23:26:33 -0700 |
parents | 9806198df91f |
children | 1093078e7976 |
line wrap: on
line source
#!/usr/bin/env python import argparse import string, os, sys requiredCOLs = ["chr", "start","end","reference","alt","gene","effect"] def headerError(filename, column, ferror): ferror.write(filename +" does not have column " + column+"\n") ferror.close() sys.exit(1) def findAnyValueInList (values, dataList): for value in values: for i in range(0,len(dataList)): if value == dataList[i]: return i return -1 def header (infile, ferror): fin= open(infile,'U') columnDic ={} #header line = fin.readline() fin.close() if line [0]=="#": line = line[1:-1] data = string.split(line,"\t") columnDic["chr"]= findAnyValueInList (["chr","chrom"], data) if columnDic["chr"] ==-1: headerError(infile, "chr", ferror) columnDic["start"]= findAnyValueInList (["start","chrStart"], data) if columnDic["start"] == -1: headerError(infile, "start", ferror) columnDic["end"]= findAnyValueInList (["end","chrEnd"], data) if columnDic["end"] == -1: headerError(infile, "end", ferror) columnDic["alt"]= findAnyValueInList (["alt"], data) if columnDic["alt"] == -1: headerError(infile, "alt", ferror) columnDic["reference"]= findAnyValueInList (["reference","ref"], data) if columnDic["reference"] == -1: headerError(infile, "reference", ferror) columnDic["gene"]= findAnyValueInList (["gene"], data) if columnDic["gene"] == -1: headerError(infile, "gene", ferror) columnDic["effect"]= findAnyValueInList (["effect"], data) if columnDic["effect"] == -1: headerError(infile, "effect", ferror) requiredCols = columnDic.keys() requiredColsPos = columnDic.values() for i in range(1,len(data)): if i not in requiredColsPos: columnDic [data[i]]=i return columnDic def summarizeColumns(inFiles, fileColumn, allCols, ferror): for infile in inFiles: columnDic = header (infile, ferror) fileColumn [infile] = columnDic for col in columnDic: if col not in allCols: allCols.append(col) return def outputHeader (requiredCOLs,allCols,fout): fout.write("#sample") for col in requiredCOLs: fout.write("\t"+col) for col in allCols: if col not in requiredCOLs: fout.write("\t"+col) fout.write("\n") fout.close() return def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout): fin = open(infile,'U') fin.readline() while 1: line = fin.readline()[:-1] if line =="": break data = string.split(line,'\t') fout.write(data[0]) for col in requiredCOLs: pos = columnDic[col] fout.write("\t"+ data[pos]) for col in allCols: if col not in requiredCOLs: if col in columnDic: pos = columnDic[col] fout.write("\t"+ data[pos]) else: fout.write("\t") fout.write("\n") fin.close() return def collectSource(inFile, label, sampleDic): fin = open(inFile,'U') fin.readline() while 1: line = fin.readline()[:-1] if line =="": break sample = string.split(line,'\t')[0] if sample not in sampleDic: sampleDic[sample]=[] if label not in sampleDic[sample]: sampleDic[sample].append(label) fin.close() return def outputSampleDic (sampleDic, outPhenotypeFile): fout = open(outPhenotypeFile,'w') fout.write("sample\tsource\n") for sample in sampleDic: source = sampleDic[sample] source.sort() fout.write(sample+"\t"+string.join(source,", ")+"\n") fout.close() return if __name__ == '__main__' : if len(sys.argv[:]) <6: print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)" print "this is merging data A+B=C for mutation by position type of data\n" sys.exit(1) # # The input files to this script are two or more matrices, in which # columns represent samples and rows represent genes or measurements. # There are two output files: outMergedData contains the input data merged # into a single matrix, and outSourceMatrix is a two-column matrix # indicating which file each sample (or column label) came from. This # assumes that each sample came from at most one file. # parser = argparse.ArgumentParser() parser.add_argument("outMergedData", type=str, help="Filename for the merged dataset") parser.add_argument("outSourceMatrix", type=str, help="""Filename for a Nx2 matrix that indicates the source file of each column""") parser.add_argument("errorLog", type=str, help="""Error log""") parser.add_argument("inFileA", type=str, help="First input file") parser.add_argument("inFileB", type=str, help="Second input file") parser.add_argument("--aLabel", type=str, default=None, help="User-friendly label for the first input file") parser.add_argument("--bLabel", type=str, default=None, help="User-friendly label for the second input file") args = parser.parse_args() #inFiles = sys.argv[4:] inFiles = list() inFiles.append(args.inFileA) inFiles.append(args.inFileB) errofile = args.errorLog outfile = args.outMergedData #print outfile outPhenotypeFile = args.outSourceMatrix #print outPhenotypeFile ferror = open(errofile,'w') #get all the columns, build fileColumn dictionary fileColumn={} allCols =[] summarizeColumns(inFiles, fileColumn, allCols, ferror) ferror.close() #output header line fout = open(outfile,'w') outputHeader (requiredCOLs,allCols,fout) #process and output combined mutationXena file fout = open(outfile,'a') columnDic = fileColumn[args.inFileA] processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) columnDic = fileColumn[args.inFileB] processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) fout.close() #collect sample from source information sampleDic ={} if args.aLabel is None: collectSource(args.inFileA, args.inFileA, sampleDic) else: collectSource(args.inFileA, args.aLabel, sampleDic) if args.bLabel is None: collectSource(args.inFileB, args.inFileB, sampleDic) else: collectSource(args.inFileB, args.bLabel, sampleDic) #output sample source information as phenotype matrix outputSampleDic (sampleDic, outPhenotypeFile)