Mercurial > repos > yboursin > pre_ebseq_eba
comparison pre_EBSeq.py @ 2:340ada00d965 draft
Uploaded
| author | yboursin |
|---|---|
| date | Thu, 20 Oct 2016 08:47:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:00231125f4fb | 2:340ada00d965 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # Author: rachel.legendre@pasteur.fr | |
| 3 | |
| 4 from os.path import basename, join | |
| 5 from os import getcwd, system | |
| 6 import argparse | |
| 7 from shutil import copyfile | |
| 8 import tempfile | |
| 9 import csv | |
| 10 import pandas as pd | |
| 11 from collections import Counter | |
| 12 | |
def _build_expression_table(inputs):
    """Merge the expected counts of several RSEM result tables into one frame.

    inputs -- sequence of (filename, condition) pairs. Each file is a
              tab-separated table whose columns 0, 1 and 4 are expected to be
              'gene_id', 'transcript_id(s)' and 'expected_count'.

    Returns a DataFrame with the two identifier columns followed by one
    count column per input, named after its condition. Rows are combined
    with pandas' default (inner) merge on the identifier columns.
    """
    tables = []
    for filename, cond in inputs:
        # Read the two identifier columns as str so that numeric-looking
        # identifiers are not converted to numbers.
        table = pd.read_csv(filename, sep='\t', converters={0: str, 1: str})
        # Keep only the identifier columns (0, 1) and the count column (4).
        table = table.iloc[:, [0, 1, 4]]
        # Name the count column after the condition so that the columns of
        # different inputs do not collide when the tables are merged.
        table = table.rename(columns={"expected_count": cond})
        tables.append(table)

    # Merge pairwise from left to right; an explicit loop avoids relying on
    # the `reduce` builtin, which no longer exists on Python 3.
    merged = tables[0]
    for table in tables[1:]:
        merged = pd.merge(merged, table, on=['gene_id', 'transcript_id(s)'])
    return merged


def _write_ig_vector(gene_ids, path):
    """Write, for every entry of gene_ids, how often that id occurs in gene_ids.

    The output has one integer per line, in the same order as gene_ids.
    """
    counts = Counter(gene_ids)
    # Text mode: the values are str, which a binary-mode file rejects on
    # Python 3.
    with open(path, 'w') as handle:
        handle.writelines('%d\n' % counts[gene] for gene in gene_ids)


def __main__(argv=None):
    """Build the expression table and the IG vector from RSEM count tables.

    argv -- optional list of command-line arguments; when None (the default)
            argparse reads them from sys.argv.

    Command-line options:
      --inputs FILE CONDITION  RSEM table and the column name to give its
                               counts; repeat the option once per table.
      --outtable PATH          where the merged expression table is written
                               (tab-separated, no index column).
      --outvector PATH         where the IG vector is written (one integer
                               per row of the expression table).

    Exits through argparse (SystemExit) when an option is missing or when
    --inputs is not given exactly two values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', action='append', nargs=2,
                        metavar=('FILE', 'CONDITION'), required=True)
    parser.add_argument('--outvector', required=True)
    parser.add_argument('--outtable', required=True)
    args = parser.parse_args(argv)

    # Build the expression table from the "expected_count" column of every
    # RSEM count table and write it out.
    expression = _build_expression_table(args.inputs)
    expression.to_csv(args.outtable, index=False, sep="\t")

    # The IG vector is a single column of integers: for each row of the
    # expression table, the number of rows that share its gene_id.
    _write_ig_vector(list(expression['gene_id']), args.outvector)


if __name__ == "__main__":
    __main__()
