view mergeGenomicMatrixFiles.py @ 60:bf57076e27b9 default tip

change genomicSegment input data
author jingchunzhu@gmail.com
date Tue, 27 Oct 2015 16:07:09 -0700 (2015-10-27)
parents eb5acf81e609
children
line wrap: on
line source
#!/usr/bin/env python

import argparse
import string,os,sys,json

def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample columns named in the header row of a matrix file.

    Only the first line of `infile` is read.  It is split on tabs and the
    first field (the row-label column) is discarded; every remaining field
    is treated as a sample name.

    Parameters:
        samples: dict mapping sample name -> column index.  Updated in
            place: each sample not already present is assigned the next
            free index (the current size of the dict).
        sourceFiles: dict mapping sample name -> source label.  Updated in
            place for newly registered samples only, so a sample that is
            already known keeps the label of the file that introduced it.
        infile: path of the tab-delimited matrix file.
        labelThisFile: label to record as the source of new samples, or
            None to use `infile` itself.

    Returns:
        None.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    # Universal newlines are the default on Python 3, where the 'U' flag
    # was removed in 3.11; only Python 2 still needs it spelled out.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open(infile, mode) as fin:
        firstLine = fin.readline()

    # Trim the right-hand side only.  Stripping the left side as well would
    # swallow the tab after an empty first header cell and shift every
    # sample name one column to the left.
    newSamples = firstLine.rstrip().split('\t')[1:]
    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse

def process(genes, samples, dataMatrix, infile):
    """Merge the data rows of one tab-delimited matrix file into dataMatrix.

    The first line of `infile` is the header; its fields name the column
    that each value of a data row belongs to.  On every following line the
    first field is the gene (row label) and the remaining fields are values.

    Parameters:
        genes: dict mapping gene name -> row index in `dataMatrix`.
            Updated in place; an unseen gene gets the next free index.
        samples: dict mapping sample name -> column index.  It is only
            read here and must already contain every sample named in the
            header of `infile`, otherwise a KeyError is raised.
        dataMatrix: list of rows (lists of strings).  Updated in place; a
            new row is created with len(samples) empty-string cells.
        infile: path of the tab-delimited matrix file.

    Returns:
        None.

    Raises:
        IndexError: if a data row has more fields than the header row.
        KeyError: if a header sample name is missing from `samples`.

    Blank lines are skipped, and a final line without a trailing newline is
    read in full.
    """
    maxLength = len(samples)

    # Universal newlines are the default on Python 3, where the 'U' flag
    # was removed in 3.11; only Python 2 still needs it spelled out.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open(infile, mode) as fin:
        # Trim the right-hand side only, so an empty first header cell
        # keeps its position and the columns stay aligned with the data.
        newSamples = fin.readline().rstrip().split('\t')

        for rawLine in fin:
            # Remove just the line terminator; slicing off the last
            # character would damage a final line that has no newline.
            line = rawLine.rstrip('\r\n')
            if line == "":
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            # Field i of the row belongs to the sample named in header
            # field i; field 0 is the gene label and is skipped.
            for i, value in enumerate(data[1:], 1):
                row[samples[newSamples[i]]] = value


def outputSourceMatrix(sourceData, outputFileName):
    """Write the sample-to-source table as a two-column tab-delimited file.

    The output starts with the header line "Sample<TAB>Source", followed by
    one line per entry of `sourceData`, in the dict's iteration order.

    Parameters:
        sourceData: dict mapping sample name -> source label.
        outputFileName: path of the file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, source in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, source))


def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix as a tab-delimited file.

    The first line is "sample" followed by the sample names in column-index
    order.  Each following line is a gene name followed by that gene's
    values, one per sample column.  Genes are written in row-index order,
    so the output does not depend on dict iteration order.

    Parameters:
        dataMatrix: list of rows; dataMatrix[r][c] is the string value for
            the gene with row index r and the sample with column index c.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Invert the name -> index map into a list of names ordered by column.
    sList = [""] * len(samples)
    for sample, pos in samples.items():
        sList[pos] = sample

    # The context manager closes the file even if a lookup or write fails.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")

def outputMergedMatrixJson(output):
    """Write a JSON metadata file declaring the data type "genomicMatrix".

    The file contains the single JSON object {"type": "genomicMatrix"}.

    Parameters:
        output: path of the JSON file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even if serialization fails.
    with open(output, 'w') as fout:
        json.dump({"type": "genomicMatrix"}, fout)

if __name__ == '__main__':
    #
    # Command-line driver.  Takes exactly two input matrices (columns are
    # samples, rows are genes or measurements) and writes two files:
    #   outMergedData   - both inputs merged into a single matrix
    #   outSourceMatrix - a two-column table giving, for each sample
    #                     (column label), the input it was first seen in
    # A sample present in both inputs is attributed to the first one only.
    #
    parser = argparse.ArgumentParser()
    parser.add_argument("inFileA", type=str,
                        help="First input file")
    parser.add_argument("inFileB", type=str,
                        help="Second input file")
    parser.add_argument("outMergedData", type=str,
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix", type=str,
                        help="""Filename for a Nx2 matrix that indicates
                                the source file of each column""")
    parser.add_argument("--aLabel", type=str, default=None,
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel", type=str, default=None,
                        help="User-friendly label for the second input file")
    args = parser.parse_args()

    # Each input paired with its optional user-facing label.
    inputs = [(args.inFileA, args.aLabel), (args.inFileB, args.bLabel)]

    genes, samples, sourceFiles, dataMatrix = {}, {}, {}, []

    # First pass: register the samples of every input, so that the full
    # column count is known before any data row is allocated.
    for inFile, label in inputs:
        header(samples, sourceFiles, inFile, label)

    # Second pass: fill in the values, in the same input order.
    for inFile, _ in inputs:
        process(genes, samples, dataMatrix, inFile)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)