Mercurial > repos > yboursin > pre_ebseq_eba
changeset 2:340ada00d965 draft
Uploaded
| author | yboursin |
|---|---|
| date | Thu, 20 Oct 2016 08:47:06 -0400 |
| parents | 00231125f4fb |
| children | 954cd01a7c12 |
| files | pre_EBSeq.py |
| diffstat | 1 files changed, 54 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Author: rachel.legendre@pasteur.fr
"""Prepare EBSeq inputs from RSEM count tables.

Given one or more ``--inputs FILE CONDITION`` pairs, this script writes:

* ``--outtable``: a tab-separated expression table holding the two
  identifier columns (``gene_id`` and ``transcript_id(s)``) followed by one
  count column per input, named after its CONDITION;
* ``--outvector``: the IG vector, a single column of integers with one line
  per row of the expression table, giving the number of rows that share
  that row's ``gene_id``.
"""

from os.path import basename, join
from os import getcwd, system
import argparse
from shutil import copyfile
import tempfile
import csv
from collections import Counter
from functools import reduce  # built-in on Python 2 only; import for Python 3

import pandas as pd

# Identifier columns present in every count table; used as the merge key.
_KEY_COLUMNS = ['gene_id', 'transcript_id(s)']


def _build_expression_table(inputs):
    """Merge the count column of every (filename, condition) pair into one frame."""
    frames = []
    for filename, cond in inputs:
        # Read the table, making sure the two identifier columns stay strings.
        frame = pd.read_csv(filename, sep='\t', converters={0: str, 1: str})
        # Keep the two identifier columns plus the column at position 4
        # (assumed to be RSEM's "expected_count" -- see the rename below).
        frame = frame.iloc[:, [0, 1, 4]]
        # Name the count column after its condition so that columns from
        # different inputs do not collide when merged.
        frame = frame.rename(columns={"expected_count": cond})
        frames.append(frame)
    # Successive merges on the identifier columns. pandas' default is an
    # inner join, so only rows present in every input are kept.
    return reduce(
        lambda left, right: pd.merge(left, right, on=_KEY_COLUMNS),
        frames,
    )


def _write_ig_vector(gene_ids, path):
    """Write, for each gene id in order, how many times that id occurs in total."""
    gene_ids = list(gene_ids)
    counts = Counter(gene_ids)
    # Text mode: str can be written on both Python 2 and Python 3
    # (binary mode rejects str on Python 3).
    with open(path, 'w') as handle:
        handle.writelines("{0}\n".format(counts[gene]) for gene in gene_ids)


def __main__():
    """Parse the command line, then write the expression table and IG vector.

    Command-line options:
        --inputs FILE CONDITION  repeatable; a tab-separated count table and
                                 the column name to give its counts.
        --outtable PATH          destination of the merged expression table.
        --outvector PATH         destination of the IG vector.

    Exits through ``parser.error`` (status 2) when no ``--inputs`` is given
    or when an ``--inputs`` occurrence does not carry exactly two values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', action='append', nargs='*')
    parser.add_argument('--outvector')
    parser.add_argument('--outtable')
    args = parser.parse_args()

    inputs = args.inputs
    # Fail with a usage message instead of an opaque TypeError/ValueError.
    if not inputs:
        parser.error("at least one --inputs FILE CONDITION pair is required")
    for pair in inputs:
        if len(pair) != 2:
            parser.error(
                "each --inputs expects exactly two values (FILE CONDITION), "
                "got %d" % len(pair)
            )

    # Build the expression table from the count column of each input table.
    expression = _build_expression_table(inputs)
    expression.to_csv(args.outtable, index=False, sep="\t")

    # The IG vector is derived from the gene_id column of the expression
    # table; the in-memory frame is used rather than re-reading the file
    # that was just written.
    _write_ig_vector(expression['gene_id'], args.outvector)


if __name__ == "__main__":
    __main__()
