# HG changeset patch # User melissacline # Date 1426889334 14400 # Node ID 15cb5a49cdbcaf236c21b19996ff312052c6648c # Parent 2035405538b42edd2fdcf73a6765bfd8a0259bb2 Uploaded diff -r 2035405538b4 -r 15cb5a49cdbc mergeXenaMutation.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mergeXenaMutation.py Fri Mar 20 18:08:54 2015 -0400 @@ -0,0 +1,212 @@ +#!/usr/bin/env python + +import argparse +import string, os, sys + +requiredCOLs = ["chr", "start","end","reference","alt","gene","effect"] + +def headerError(filename, column, ferror): + ferror.write(filename +" does not have column " + column+"\n") + ferror.close() + sys.exit(1) + +def findAnyValueInList (values, dataList): + for value in values: + for i in range(0,len(dataList)): + if value == dataList[i]: + return i + return -1 + +def header (infile, ferror): + fin= open(infile,'U') + + columnDic ={} + #header + line = fin.readline() + fin.close() + if line [0]=="#": + line = line[1:-1] + data = string.split(line,"\t") + + columnDic["chr"]= findAnyValueInList (["chr","chrom"], data) + if columnDic["chr"] ==-1: + headerError(infile, "chr", ferror) + + columnDic["start"]= findAnyValueInList (["start","chrStart"], data) + if columnDic["start"] == -1: + headerError(infile, "start", ferror) + + columnDic["end"]= findAnyValueInList (["end","chrEnd"], data) + if columnDic["end"] == -1: + headerError(infile, "end", ferror) + + columnDic["alt"]= findAnyValueInList (["alt"], data) + if columnDic["alt"] == -1: + headerError(infile, "alt", ferror) + + columnDic["reference"]= findAnyValueInList (["reference","ref"], data) + if columnDic["reference"] == -1: + headerError(infile, "reference", ferror) + + columnDic["gene"]= findAnyValueInList (["gene"], data) + if columnDic["gene"] == -1: + headerError(infile, "gene", ferror) + + columnDic["effect"]= findAnyValueInList (["effect"], data) + if columnDic["effect"] == -1: + headerError(infile, "effect", ferror) + + requiredCols = columnDic.keys() + requiredColsPos = columnDic.values() + for i in range(1,len(data)): + if i not in requiredColsPos: + columnDic [data[i]]=i + + return columnDic + +def summarizeColumns(infiles, fileColumn, allCols, ferror): + for infile in inFiles: + columnDic = header (infile, ferror) + fileColumn [infile] = columnDic + for col in columnDic: + if col not in allCols: + allCols.append(col) + return + +def outputHeader (requiredCOLs,allCols,fout): + fout.write("#sample") + for col in requiredCOLs: + fout.write("\t"+col) + for col in allCols: + if col not in requiredCOLs: + fout.write("\t"+col) + fout.write("\n") + fout.close() + return + +def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout): + fin = open(infile,'U') + fin.readline() + while 1: + line = fin.readline()[:-1] + if line =="": + break + data = string.split(line,'\t') + fout.write(data[0]) + for col in requiredCOLs: + pos = columnDic[col] + fout.write("\t"+ data[pos]) + for col in allCols: + if col not in requiredCOLs: + if col in columnDic: + pos = columnDic[col] + fout.write("\t"+ data[pos]) + else: + fout.write("\t") + fout.write("\n") + fin.close() + return + +def collectSource(inFile, label, sampleDic): + fin = open(inFile,'U') + fin.readline() + while 1: + line = fin.readline()[:-1] + if line =="": + break + sample = string.split(line,'\t')[0] + if sample not in sampleDic: + sampleDic[sample]=[] + if inFile not in sampleDic[sample]: + sampleDic[sample].append(label) + fin.close() + return + +def outputSampleDic (sampleDic, outPhenotypeFile): + fout = open(outPhenotypeFile,'w') + fout.write("sample\tsource\n") + for sample in sampleDic: + source = sampleDic[sample] + source.sort() + fout.write(sample+"\t"+string.join(source,", ")+"\n") + fout.close() + return + +if __name__ == '__main__' : + if len(sys.argv[:]) <6: + print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)" + print "this is merging data A+B=C for mutation by position type of data\n" + sys.exit(1) + + # + # The input files to this script are two or more matrices, in which + # columns represent samples and rows represent genes or measurements. + # There are two output files: outMergedData contains the input data merged + # into a single matrix, and outSourceMatrix is a two-column matrix + # indicating which file each sample (or column label) came from. This + # assumes that each sample came from at most one file. + # + parser = argparse.ArgumentParser() + parser.add_argument("inFileA", type=str, help="First input file") + parser.add_argument("inFileB", type=str, help="Second input file") + parser.add_argument("outMergedData", type=str, + help="Filename for the merged dataset") + parser.add_argument("outSourceMatrix", type=str, + help="""Filename for a Nx2 matrix that indicates + the source file of each column""") + parser.add_argument("errorLog", type=str, + help="""Error log""") + parser.add_argument("--aLabel", type=str, default=None, + help="User-friendly label for the first input file") + parser.add_argument("--bLabel", type=str, default=None, + help="User-friendly label for the second input file") + args = parser.parse_args() + + + #inFiles = sys.argv[4:] + print inFiles + errofile = args.errorLog + outfile = args.outMergedData + print outfile + outPhenotypeFile = args.outSourceMatrix + print outPhenotypeFile + + ferror = open(errofile,'w') + + #get all the columns, build fileColumn dictionary + fileColumn={} + allCols =[] + summarizeColumns(inFiles, fileColumn, allCols, ferror) + ferror.close() + + #output header line + fout = open(outfile,'w') + outputHeader (requiredCOLs,allCols,fout) + + #process and output combined mutationXena file + fout = open(outfile,'a') + + columnDic = fileColumn[args.inFileA] + processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) + columnDic = fileColumn[args.inFileB] + processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) + fout.close() + + #collect sample from source information + sampleDic ={} + if args.aLabel is None: + collectSource(args.inFileA, args.inFileA, sampleDic) + else: + collectSource(args.inFileA, args.aLabel, sampleDic + if args.bLabel is None: + collectSource(args.inFileB, args.inFileB, sampleDic) + else: + collectSource(args.inFileB, args.bLabel, sampleDic + + + #output sample source information as phenotype matrix + outputSampleDic (sampleDic, outPhenotypeFile) + + + +