view mergeGenomicMatrixFiles.py @ 7:1d150e860c4d

Expanded the merge genomic datasets tool to also generate an output dataset giving the file (or label) that each column came from.
author melissacline
date Mon, 09 Mar 2015 19:58:03 -0700
parents 2035405538b4
children 5d4538cb38db
line wrap: on
line source

#!/usr/bin/env python

import argparse
import string,os,sys

def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample (column) names found in one matrix file.

    Only the first line of ``infile`` is read.  It is treated as a
    tab-separated header: the first cell is skipped (it sits above the
    row-name column) and every remaining cell is a sample name.  Each
    name not already in ``samples`` is given the next free column index
    and attributed to this file in ``sourceFiles``; names that are
    already registered keep their existing index and source.

    Args:
        samples: dict of sample name -> column index; updated in place.
        sourceFiles: dict of sample name -> source label; updated in
            place.
        infile: path of the tab-separated matrix file.
        labelThisFile: label recorded as the source of newly seen
            samples, or None to record ``infile`` itself.

    Returns:
        None.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    with open(infile, 'r') as fin:
        # Trim trailing whitespace only.  Stripping the front as well
        # would swallow the separator after an empty first cell and
        # silently drop the first sample name.
        headerLine = fin.readline().rstrip()

    for sample in headerLine.split('\t')[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse

def process(genes, samples, dataMatrix, infile):
    """Merge the data rows of one matrix file into ``dataMatrix``.

    The first line of ``infile`` is the tab-separated header; every
    following non-empty line is a data row whose first cell is the gene
    (row) name and whose remaining cells line up with the header's
    sample names.  A gene seen for the first time is given the next free
    row index and a new row of empty strings, one per entry currently in
    ``samples``.  Each cell is then stored at
    ``dataMatrix[genes[gene]][samples[sample]]``, so cells for samples
    that a file does not supply stay ``""``.

    Args:
        genes: dict of gene name -> row index; updated in place.
        samples: dict of sample name -> column index.  It must already
            hold every sample name in this file's header, and must be
            complete before the first call because new rows are sized
            from its current length.
        dataMatrix: list of rows (lists of strings); updated in place.
        infile: path of the tab-separated matrix file.

    Returns:
        None.

    Raises:
        KeyError: a header cell used by a data row is not in ``samples``.
        IndexError: a data row has more cells than the header.
    """
    rowWidth = len(samples)

    with open(infile, 'r') as fin:
        # Trim only the tail of the header so an empty first cell keeps
        # the sample names aligned with the data columns.
        newSamples = next(fin, "").rstrip().split('\t')

        for rawLine in fin:
            # Drop just the line terminator: slicing off the last
            # character would eat real data on a final line that has no
            # newline, and would leave a stray "\r" on CRLF files.
            line = rawLine.rstrip('\r\n')
            if line == "":
                # Skip blank lines instead of treating the first one as
                # end of file, which silently discarded later rows.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * rowWidth)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]


def outputSourceMatrix(sourceData, outputFileName):
    """Write the sample-to-source table as a two-column tab-separated file.

    The file starts with the header line ``Sample<TAB>Source`` and then
    has one line per entry of ``sourceData``, in the dict's iteration
    order.

    Args:
        sourceData: dict of sample name -> source label.
        outputFileName: path of the file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even if a write fails.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, source in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, source))


def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix as a tab-separated file.

    The first line is ``sample`` followed by the sample names in column
    index order.  Each following line is a gene name followed by that
    gene's cell for every sample, in the same column order.  Genes are
    written in row index order.

    Args:
        dataMatrix: list of rows (lists of strings), indexed as
            ``dataMatrix[genes[gene]][samples[sample]]``.
        samples: dict of sample name -> column index.
        genes: dict of gene name -> row index.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Order both axes by their assigned index so the layout never
    # depends on dict iteration order.
    sList = sorted(samples, key=samples.get)
    geneList = sorted(genes, key=genes.get)

    # The context manager closes the file even if a write fails.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in geneList:
            row = dataMatrix[genes[gene]]
            cells = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + cells) + "\n")

if __name__ == '__main__':
    #
    # The input files to this script are two matrices, in which columns
    # represent samples and rows represent genes or measurements.
    # There are two output files: outMergedData contains the input data
    # merged into a single matrix, and outSourceMatrix is a two-column
    # matrix indicating which file each sample (or column label) came
    # from.  This assumes that each sample came from at most one file:
    # a sample present in both inputs is attributed to the first, and
    # its cells from the second overwrite those from the first.
    #
    parser = argparse.ArgumentParser()
    parser.add_argument("inFileA", type=str, help="First input file")
    parser.add_argument("inFileB", type=str, help="Second input file")
    parser.add_argument("outMergedData", type=str,
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix", type=str,
                        help="""Filename for a Nx2 matrix that indicates
                                the source file of each column""")
    parser.add_argument("--aLabel", type=str, default=None,
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel", type=str, default=None,
                        help="User-friendly label for the second input file")
    args = parser.parse_args()

    genes = {}
    samples = {}
    sourceFiles = {}
    dataMatrix = []

    inputs = [(args.inFileA, args.aLabel), (args.inFileB, args.bLabel)]

    # Every header must be registered before any data is read, because
    # process() sizes each new row from the number of known samples.
    for inFile, label in inputs:
        header(samples, sourceFiles, inFile, label)
    for inFile, _ in inputs:
        process(genes, samples, dataMatrix, inFile)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)