annotate pre_EBSeq.py @ 2:340ada00d965 draft

Uploaded
author yboursin
date Thu, 20 Oct 2016 08:47:06 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
340ada00d965 Uploaded
yboursin
parents:
diff changeset
1 #!/usr/bin/env python
340ada00d965 Uploaded
yboursin
parents:
diff changeset
2 # Author: rachel.legendre@pasteur.fr
340ada00d965 Uploaded
yboursin
parents:
diff changeset
3
340ada00d965 Uploaded
yboursin
parents:
diff changeset
4 from os.path import basename, join
340ada00d965 Uploaded
yboursin
parents:
diff changeset
5 from os import getcwd, system
340ada00d965 Uploaded
yboursin
parents:
diff changeset
6 import argparse
340ada00d965 Uploaded
yboursin
parents:
diff changeset
7 from shutil import copyfile
340ada00d965 Uploaded
yboursin
parents:
diff changeset
8 import tempfile
340ada00d965 Uploaded
yboursin
parents:
diff changeset
9 import csv
340ada00d965 Uploaded
yboursin
parents:
diff changeset
10 import pandas as pd
340ada00d965 Uploaded
yboursin
parents:
diff changeset
11 from collections import Counter
340ada00d965 Uploaded
yboursin
parents:
diff changeset
12
340ada00d965 Uploaded
yboursin
parents:
diff changeset
def __main__():
    """Build an expression table and an IG vector from RSEM count tables.

    Command-line arguments (parsed from ``sys.argv``):
        --inputs FILE CONDITION
            A tab-separated RSEM count table and the column label to use
            for its counts in the output. Repeat the option once per table.
        --outtable PATH
            Where the merged tab-separated expression table is written.
        --outvector PATH
            Where the IG vector is written: one integer per line, in the
            row order of the expression table.

    For every input the first, second and fifth columns are kept; the
    "expected_count" column is renamed to the given condition label. The
    per-input tables are then inner-joined on 'gene_id' and
    'transcript_id(s)'.

    Each line of the IG vector is the number of rows of the merged table
    that share that row's 'gene_id'.

    Exits through ``parser.error`` (status 2) when a required option is
    missing or an ``--inputs`` entry is not a FILE/CONDITION pair.
    """
    # ``reduce`` is a builtin only on Python 2; importing it from functools
    # works on both Python 2.6+ and Python 3.
    from functools import reduce

    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', action='append', nargs='*')
    parser.add_argument('--outvector')
    parser.add_argument('--outtable')
    args = parser.parse_args()

    IGvector = args.outvector
    outtable = args.outtable
    inputs = args.inputs

    # Fail with a usage message instead of an opaque traceback further down.
    if not inputs:
        parser.error('at least one --inputs FILE CONDITION pair is required')
    for entry in inputs:
        if len(entry) != 2:
            parser.error(
                'each --inputs expects FILE CONDITION, got: %r' % (entry,))
    if not outtable:
        parser.error('--outtable is required')
    if not IGvector:
        parser.error('--outvector is required')

    # Build the expression table from the "expected_count" column of each
    # RSEM count table.
    dfs = []
    for filename, cond in inputs:
        # Read the table, forcing the two identifier columns to str so that
        # numeric-looking ids are not converted.
        df = pd.read_csv(filename, sep='\t', converters={0: str, 1: str})
        # Keep only the two identifier columns and the fifth column
        # (renamed below when it is called "expected_count").
        df = df.iloc[:, [0, 1, 4]]
        # Label the count column with the condition so that the columns of
        # different inputs do not collide when merged.
        df = df.rename(columns={"expected_count": cond})
        dfs.append(df)

    # Join all tables side by side on the identifier columns.
    df_final = reduce(
        lambda left, right: pd.merge(
            left, right, on=['gene_id', 'transcript_id(s)']),
        dfs)
    df_final.to_csv(outtable, index=False, sep="\t")

    # The IG vector is a single column of integers: for every row of the
    # expression table, how many rows carry the same gene_id. It is derived
    # from the in-memory table rather than re-reading the file just written.
    gene_order = list(df_final['gene_id'])
    counts = Counter(gene_order)
    # Text mode: the lines written are str, which a binary-mode file rejects
    # on Python 3.
    with open(IGvector, 'w') as IG:
        IG.writelines('%d\n' % counts[gene] for gene in gene_order)
340ada00d965 Uploaded
yboursin
parents:
diff changeset
52
340ada00d965 Uploaded
yboursin
parents:
diff changeset
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    __main__()