view mergeGenomicMatrixFiles.py @ 37:e81019e3ac99

Updated synapseGetDataset to look at the filename rather than the (no longer existent) content type field to determine if the data is in zip format
author melissacline
date Mon, 27 Jul 2015 16:29:24 -0700
parents 1d83dbbee373
children eb5acf81e609
line wrap: on
line source

#!/usr/bin/env python

import argparse
import string,os,sys

def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample (column) names found in the header row of infile.

    Only the first line of the file is read.  Its first tab-separated field
    is the label of the row-name column and is skipped; every remaining
    field is treated as a sample name.  A sample name that is not yet in
    ``samples`` is given the next free column index (``len(samples)``) and
    is recorded in ``sourceFiles``.  Names already present keep their
    existing index and source label.  Both dictionaries are updated in place.

    Args:
        samples: dict mapping sample name -> column index.
        sourceFiles: dict mapping sample name -> label of the file in which
            the sample was first seen.
        infile: path of a tab-delimited matrix file.
        labelThisFile: label to record for new samples; when None, the
            path ``infile`` itself is used as the label.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    # Python 2 needs 'U' for universal-newline handling; Python 3 text mode
    # does that by default and rejects 'U' from version 3.11 onwards.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open(infile, mode) as fin:
        # strip() drops the line terminator and surrounding whitespace;
        # process() parses the header row the same way.
        newSamples = fin.readline().strip().split('\t')[1:]

    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse

def process(genes, samples, dataMatrix, infile):
    """Copy the data rows of infile into dataMatrix.

    The first line of the file is the header: field ``i`` names the sample
    whose values appear in field ``i`` of every following row.  The first
    field of each data row is the gene (row) name.  A gene not yet in
    ``genes`` is given the next row index (``len(genes)``) and a new row of
    ``len(samples)`` empty strings is appended to ``dataMatrix``.  Each value
    is stored, as the string read from the file, at
    ``dataMatrix[genes[gene]][samples[sample]]``.

    Blank lines are skipped, and the last row is read in full whether or not
    the file ends with a newline.  A row with fewer fields than the header
    leaves its remaining cells unchanged.

    Args:
        genes: dict mapping gene name -> row index in dataMatrix (updated
            in place).
        samples: dict mapping sample name -> column index; it must already
            contain every sample named in this file's header.
        dataMatrix: list of rows (lists of strings), updated in place.
        infile: path of a tab-delimited matrix file.

    Raises:
        KeyError: if a column's sample name is missing from ``samples``.
        IndexError: if a data row has more fields than the header row.
    """
    maxLength = len(samples)

    # Python 2 needs 'U' for universal-newline handling; Python 3 text mode
    # does that by default and rejects 'U' from version 3.11 onwards.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open(infile, mode) as fin:
        # Parsed exactly as header() parses it, so the names match the keys
        # of ``samples``.
        newSamples = fin.readline().strip().split('\t')

        for rawLine in fin:
            # Remove only the line terminator.  Slicing off the last
            # character unconditionally would truncate the final value of a
            # file that does not end with a newline.
            line = rawLine.rstrip('\r\n')
            if not line:
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]


def outputSourceMatrix(sourceData, outputFileName):
    """Write the sample-to-source table as a two-column tab-delimited file.

    The file starts with the header line ``Sample<TAB>Source``, followed by
    one line per entry of ``sourceData`` in the dictionary's iteration order.

    Args:
        sourceData: dict mapping sample name -> source label.
        outputFileName: path of the file to create or overwrite.
    """
    # The with-block closes the file even if a write fails part-way through.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample in sourceData:
            fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample]))


def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix as a tab-delimited file.

    The first line is ``sample`` followed by the sample names in column
    index order.  Each following line holds a gene name and that gene's
    values in the same column order.  Genes are written in the order of
    their row index in ``dataMatrix``, so the output does not depend on
    dictionary iteration order.

    Args:
        dataMatrix: list of rows, each a list of string values.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index in dataMatrix.
        outfile: path of the file to create or overwrite.
    """
    # Invert the name -> index mapping into a list of names by position.
    sList = [""] * len(samples)
    for sample in samples:
        sList[samples[sample]] = sample

    # The with-block closes the file even if a write fails part-way through.
    with open(outfile, "w") as fout:
        fout.write("sample" + "".join("\t" + sample for sample in sList) + "\n")

        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            cells = "".join("\t" + row[samples[sample]] for sample in sList)
            fout.write(gene + cells + "\n")

if __name__ == '__main__' :
    #
    # Command-line entry point.  Takes two input matrices in which columns
    # are samples and rows are genes or measurements, and writes two files:
    #   outMergedData   - both inputs merged into a single matrix
    #   outSourceMatrix - a two-column table giving, for each sample (column
    #                     label), the file in which it was first seen
    # A sample is expected to come from at most one of the input files.
    #
    parser = argparse.ArgumentParser()
    for argName, argHelp in (("inFileA", "First input file"),
                             ("inFileB", "Second input file"),
                             ("outMergedData",
                              "Filename for the merged dataset")):
        parser.add_argument(argName, type=str, help=argHelp)
    parser.add_argument("outSourceMatrix", type=str,
                        help="""Filename for a Nx2 matrix that indicates
                                the source file of each column""")
    for flag, argHelp in (
            ("--aLabel", "User-friendly label for the first input file"),
            ("--bLabel", "User-friendly label for the second input file")):
        parser.add_argument(flag, type=str, default=None, help=argHelp)
    args = parser.parse_args()

    # Shared state filled in by header() and process().
    genes, samples, sourceFiles = {}, {}, {}
    dataMatrix = []

    inputs = ((args.inFileA, args.aLabel), (args.inFileB, args.bLabel))

    # First pass: collect every sample name so the final matrix width is
    # known before any data row is allocated.
    for inPath, inLabel in inputs:
        header(samples, sourceFiles, inPath, inLabel)

    # Second pass: load the values from each file into the merged matrix.
    for inPath, inLabel in inputs:
        process(genes, samples, dataMatrix, inPath)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)