annotate degust.py @ 2:1e4307fd98f3 draft

Uploaded
author simon-gladman
date Mon, 24 Feb 2014 19:02:18 -0500
parents 773107d91822
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
1 #!/usr/bin/env python
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
2
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
3 import argparse, json, re, sys, csv, StringIO, math
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
4
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
def _js_literal(value):
    """Serialise *value* as a JavaScript literal safe to place inside <script>.

    json.dumps() leaves '<' untouched, so a value containing '</script>' (or
    '<!--') would terminate or corrupt the surrounding inline script element.
    Escaping '<' as \\u003c keeps the literal's value identical once the
    browser parses it, while making it inert to the HTML tokenizer.
    """
    return json.dumps(value).replace("<", "\\u003c")


def embed(csv, args):
    """Return a standalone Degust HTML page with the CSV data embedded in it.

    Parameters:
        csv  -- the full text of the CSV/TSV file (a string, not the module).
        args -- parsed command-line namespace.  The attributes read here are
                info, logFC and link_col (lists of column names), fdr, avg,
                primary, name, link_url (strings or None), and tab, notour
                (truthy/falsy switches).

    Returns:
        The HTML template with its ``window.settings = { };`` placeholder
        replaced by a ``var data=...`` declaration holding the CSV text and a
        populated ``window.settings`` object.
    """
    html = """
<html>
  <head profile="http://www.w3.org/2005/10/profile">
    <link rel="icon" type="image/png" href="images/favicon.png"/>

    <!-- Externals CSS -->
    <link rel="stylesheet" href='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/css/lib.css' />

    <link rel="stylesheet" href='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/css/common.css' type="text/css" />
    <link rel="stylesheet" href='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/css/compare.css' type="text/css "/>

    <script type="text/javascript" src='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/common.js'></script>
    <script type="text/javascript" src='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/slickgrid.js'></script>
    <script type="text/javascript" src='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/compare.js'></script>
  </head>
  <body>
    <div id="replace-me" class="container">
      <div class="jumbotron">
        <h1>Degust</h1>
        <p><a href='http://victorian-bioinformatics-consortium.github.io/degust/'>Degust</a> is preparing your data... prepare for degustation...</p>
        <img src='http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/images/front-loader.gif'>
      </div>
    </div>

    <script type="text/javascript">
      window.settings = { };
    </script>
  </body>
</html>

"""
    # Everything below ends up inside the inline <script>, so every value that
    # originates from the user or the data file goes through _js_literal().
    enc = _js_literal(csv)

    columns = (
        ["{idx:%s, name: %s, type:'info'}" % (_js_literal(c), _js_literal(c))
         for c in args.info]
        + ["{idx:%s, name: 'FDR', type: 'fdr'}" % _js_literal(args.fdr)]
        + ["{idx:%s, name: 'Average', type: 'avg'}" % _js_literal(args.avg)]
        + ["{idx:%s, name: %s, type: 'primary'}"
           % (_js_literal(args.primary), _js_literal(args.primary))]
        + ["{idx:%s, name: %s, type:'fc'}" % (_js_literal(c), _js_literal(c))
           for c in args.logFC]
        + ["{idx:%s, name: %s, type:'link'}" % (_js_literal(c), _js_literal(c))
           for c in args.link_col]
    )

    settings = [
        "html_version: '0.11.2'",
        "asset_base: 'http://victorian-bioinformatics-consortium.github.io/degust/dist/latest/'",
        # 'data' is the JavaScript variable declared just before the settings.
        "csv_data: data",
        "csv_format: %s" % ("false" if args.tab else "true"),
        "name: %s" % _js_literal(args.name),
        "columns:[%s]" % (",".join(columns)),
    ]
    if args.notour:
        settings.append("show_tour: false")
    if args.link_url:
        settings.append("link_url: %s" % _js_literal(args.link_url))

    window_settings = "window.settings = {%s};" % (",".join(settings))
    # Only the first occurrence is the template placeholder.
    return html.replace('window.settings = { };',
                        "var data=%s;\n\n%s" % (enc, window_settings), 1)
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
61
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
def check_args(args, csv_file):
    """Check that the column names given on the command line exist in the file.

    Parameters:
        args     -- parsed command-line namespace; reads tab, avg, fdr, logFC
                    and info (logFC/info are lists of column names or None).
        csv_file -- the full text of the CSV/TSV file.

    Returns:
        True if at least one problem was found (each is reported on stderr as
        an "ERROR: ..." line), False if every required column is present.
    """
    delim = "\t" if args.tab else ","
    reader = csv.reader(csv_file.split('\n'), delimiter=delim)
    # next() works on both Python 2 and 3; the default keeps an empty input
    # from raising StopIteration so it is reported as missing columns instead.
    headers = next(reader, [])

    problems = []

    if args.avg is None:
        problems.append("Column for average expression not defined (use --avg) necessary for the ma-plot")
    elif args.avg not in headers:
        problems.append("Column for average expression not found (%s)" % args.avg)

    if args.fdr not in headers:
        problems.append("Column for FDR not found (%s)" % args.fdr)

    if args.logFC is None:
        problems.append("No columns defined for log-fold-change, --logFC")
    else:
        problems.extend("Column for logFC not found, --logFC : (%s)" % f
                        for f in args.logFC if f not in headers)

    if args.info is None:
        problems.append("No columns defined for per-gene information, eg. gene IDs (use --info)")
    else:
        problems.extend("Column for info not found (%s)" % f
                        for f in args.info if f not in headers)

    for message in problems:
        sys.stderr.write("ERROR: %s\n" % message)
    return bool(problems)
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
97
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
98
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
def cuffdiff_avg(str, delim="\t"):
    """Append an 'Avg' column to cuffdiff output and drop untested rows.

    Given a string that is the output from cuffdiff, create a
    log2(average expression) column.  Actually, it is just the average of the
    log2() of the two FPKM values, but that should be enough for
    visualisation.  Values below 1 are clamped to 1 before taking the log, so
    the smallest possible 'Avg' is 0.

    Parameters:
        str   -- the full text of the cuffdiff file.  (The name shadows the
                 builtin; it is kept because it is part of the signature.)
        delim -- field delimiter for both reading and writing; defaults to
                 tab, which is what the --cuffdiff code path selects.

    Returns:
        The rewritten table as a string: the header row plus 'Avg', followed
        by only those rows whose 'status' column is 'OK'.

    Raises:
        ValueError -- if the header lacks 'value_1', 'value_2' or 'status',
                      or if an 'OK' row has a non-numeric value.
    """
    try:
        from StringIO import StringIO  # Python 2: csv writes byte strings
    except ImportError:
        from io import StringIO

    text = str
    reader = csv.reader(text.split('\n'), delimiter=delim)
    buf = StringIO()
    writer = csv.writer(buf, delimiter=delim)

    headers = next(reader, [])
    writer.writerow(headers + ['Avg'])
    idx1 = headers.index("value_1")
    idx2 = headers.index("value_2")
    status_idx = headers.index("status")
    # A row is usable only if it is long enough to hold every column we read.
    last_needed = max(idx1, idx2, status_idx)

    for row in reader:
        if len(row) > last_needed and row[status_idx] == 'OK':
            v1 = max(float(row[idx1]), 1)
            v2 = max(float(row[idx2]), 1)
            avg = 0.5 * (math.log(v1, 2) + math.log(v2, 2))
            writer.writerow(row + [avg])
    return buf.getvalue()
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
120
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
# ---------------------------------------------------------------------------
# Command-line entry point: read a CSV/TSV of differential gene expression
# results, validate the requested columns and write a standalone Degust page.
# 'args' and 'csv_file' are deliberately left as module-level names.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Produce a standalone Degust html file from a CSV file containing DGE.')
parser.add_argument('csvfile', type=argparse.FileType('r'),
                    nargs='?', default='-',
                    help="CSV file to process (default stdin)")
parser.add_argument('-o', '--out', type=argparse.FileType('w'),
                    default='-',
                    help="Output file (default stdout)")

parser.add_argument('--name', default='Unnamed',
                    help='Name for this DGE comparison')
# Documented as a switch, so allow it bare.  nargs='?' still accepts the
# historical "--notour VALUE" form, so existing invocations keep working.
parser.add_argument('--notour', nargs='?', const=True,
                    help='Do not show the tour on first load')
parser.add_argument('--primary', default='pri',
                    help='Name for the primary condition that the fold-changes are relative to')
parser.add_argument('--avg',
                    help='Name for average intensity column in CSV file')
parser.add_argument('--fdr', default='adj.P.Val',
                    help='Name for "FDR" column in CSV file (default "adj.P.Val")')
parser.add_argument('--logFC',
                    help='Comma separated names for "logFC" columns in CSV file')
parser.add_argument('--info',
                    help='Comma separated names for info columns in CSV file')
parser.add_argument('--link-col',
                    help='Name for column to use with "--link-url"')
parser.add_argument('--link-url',
                    help='Gene info URL.  Used when double-clicking the gene-table.  Any "%%s" will be replaced with the value from the specified "--link-col"')
parser.add_argument('--tab', action='store_true', default=False,
                    help='Specify that the csv file is actually tab delimited')
parser.add_argument('--cuffdiff', action='store_true', default=False,
                    help='Input file is from cuffdiff (gene_exp.diff).  This will set the columns automatically.  Note this is still experimental')

args = parser.parse_args()

# Normalise the column options into lists (None is kept so that check_args
# can report a missing --info / --logFC).
if args.info:
    args.info = args.info.split(",")
if args.logFC:
    args.logFC = args.logFC.split(",")
args.link_col = [args.link_col] if args.link_col else []

if args.csvfile == sys.stdin:
    sys.stderr.write("Reading from stdin...\n")

csv_file = args.csvfile.read()

if args.cuffdiff:
    # cuffdiff's gene_exp.diff has a fixed, tab-separated layout; override
    # whatever column options were given and synthesise the 'Avg' column.
    args.info = ['gene_id', 'gene']
    args.logFC = ['log2(fold_change)']
    args.tab = True
    args.fdr = 'q_value'
    args.avg = 'Avg'
    csv_file = cuffdiff_avg(csv_file)

err = check_args(args, csv_file)

if err:
    # check_args has already explained the problem on stderr; make the
    # failure visible to calling scripts and pipelines as well.
    sys.exit(1)

args.out.write(embed(csv_file, args))
773107d91822 Uploaded
simon-gladman
parents:
diff changeset
178