comparison pre_EBSeq.py @ 2:340ada00d965 draft

Uploaded
author yboursin
date Thu, 20 Oct 2016 08:47:06 -0400
parents
children
comparison
equal deleted inserted replaced
1:00231125f4fb 2:340ada00d965
1 #!/usr/bin/env python
2 # Author: rachel.legendre@pasteur.fr
3
4 from os.path import basename, join
5 from os import getcwd, system
6 import argparse
7 from shutil import copyfile
8 import tempfile
9 import csv
10 import pandas as pd
11 from collections import Counter
12
def __main__():
    """Build a merged expression table and its IG vector from RSEM count tables.

    Command-line arguments:
        --inputs FILE LABEL  A tab-separated count table and the column name
                             under which its counts appear in the merged
                             table. Repeat the option once per sample.
        --outtable PATH      Where the merged, tab-separated expression table
                             is written.
        --outvector PATH     Where the IG vector is written: one integer per
                             row of the merged table, giving the number of
                             rows that share that row's ``gene_id``.

    Each input must have a header row; columns 0, 1 and 4 (by position) are
    kept. The merge keys are the columns named ``gene_id`` and
    ``transcript_id(s)``, and the count column is expected to be named
    ``expected_count`` (it is renamed to LABEL).

    Exits through ``parser.error`` (status 2) when no ``--inputs`` is given or
    when an ``--inputs`` occurrence does not carry exactly two values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', action='append', nargs='*')
    parser.add_argument('--outvector')
    parser.add_argument('--outtable')
    args = parser.parse_args()

    if not args.inputs:
        parser.error("at least one '--inputs FILE LABEL' pair is required")

    key_columns = ['gene_id', 'transcript_id(s)']

    # Build the expression table from the "expected_count" column of each
    # RSEM count table.
    tables = []
    for entry in args.inputs:
        if len(entry) != 2:
            parser.error(
                "--inputs expects exactly two values (FILE LABEL), got %d"
                % len(entry))
        filename, cond = entry
        # Read the table, making sure the two identifier columns stay strings.
        table = pd.read_csv(filename, sep='\t', converters={0: str, 1: str})
        # Keep only the two identifier columns and the count column.
        table = table.iloc[:, [0, 1, 4]]
        # Name the count column after the sample so that columns from
        # different inputs do not collide when the tables are merged.
        table = table.rename(columns={"expected_count": cond})
        tables.append(table)

    # Join the per-sample tables on the identifier columns, left to right.
    # A plain loop is used instead of reduce(), which is not a builtin on
    # Python 3.
    df_final = tables[0]
    for table in tables[1:]:
        df_final = pd.merge(df_final, table, on=key_columns)
    df_final.to_csv(args.outtable, index=False, sep="\t")

    # IG vector: for every row of the expression table, in table order, the
    # number of rows carrying the same gene_id.
    gene_order = list(df_final['gene_id'])
    counts = Counter(gene_order)
    ig_text = "".join("%d\n" % counts[gene] for gene in gene_order)
    # The file is binary so the output always uses "\n" line endings; encode
    # explicitly so that the write also works on Python 3.
    with open(args.outvector, 'wb') as ig_file:
        ig_file.write(ig_text.encode("ascii"))
52
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    __main__()