annotate cwpair2_util.py @ 7:d455f14530dc draft

Uploaded
author greg
date Tue, 24 Nov 2015 08:15:57 -0500
parents 2e0ddcc726f9
children 1fc26b8e618d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
507521bf124a Uploaded
greg
parents:
diff changeset
1 import bisect
507521bf124a Uploaded
greg
parents:
diff changeset
2 import csv
507521bf124a Uploaded
greg
parents:
diff changeset
3 import os
507521bf124a Uploaded
greg
parents:
diff changeset
4 import sys
507521bf124a Uploaded
greg
parents:
diff changeset
5 import traceback
1
4d86371aafa8 Uploaded
greg
parents: 0
diff changeset
6 import matplotlib
4d86371aafa8 Uploaded
greg
parents: 0
diff changeset
7 matplotlib.use('Agg')
0
507521bf124a Uploaded
greg
parents:
diff changeset
8 from matplotlib import pyplot
507521bf124a Uploaded
greg
parents:
diff changeset
9
507521bf124a Uploaded
greg
parents:
diff changeset
10 DETAILS = 'D'
507521bf124a Uploaded
greg
parents:
diff changeset
11 FINAL_PLOTS = 'F'
507521bf124a Uploaded
greg
parents:
diff changeset
12 ORPHANS = 'O'
507521bf124a Uploaded
greg
parents:
diff changeset
13 PREVIEW_PLOTS = 'P'
507521bf124a Uploaded
greg
parents:
diff changeset
14 SIMPLES = 'S'
507521bf124a Uploaded
greg
parents:
diff changeset
15 STATS_GRAPH = 'C'
507521bf124a Uploaded
greg
parents:
diff changeset
16 GFF_EXT = 'gff'
507521bf124a Uploaded
greg
parents:
diff changeset
17 TABULAR_EXT = 'tabular'
507521bf124a Uploaded
greg
parents:
diff changeset
18
507521bf124a Uploaded
greg
parents:
diff changeset
19 # Graph settings.
507521bf124a Uploaded
greg
parents:
diff changeset
20 COLORS = 'krg'
507521bf124a Uploaded
greg
parents:
diff changeset
21 Y_LABEL = 'Peak-pair counts'
507521bf124a Uploaded
greg
parents:
diff changeset
22 X_LABEL = 'Peak-pair distance (bp)'
507521bf124a Uploaded
greg
parents:
diff changeset
23 TICK_WIDTH = 3
507521bf124a Uploaded
greg
parents:
diff changeset
24 ADJUST = [0.140, 0.9, 0.9, 0.1]
2
279cdc63bcff Uploaded
greg
parents: 1
diff changeset
25 PLOT_FORMAT = 'pdf'
0
507521bf124a Uploaded
greg
parents:
diff changeset
26 pyplot.rc('xtick.major', size=10.00)
507521bf124a Uploaded
greg
parents:
diff changeset
27 pyplot.rc('ytick.major', size=10.00)
507521bf124a Uploaded
greg
parents:
diff changeset
28 pyplot.rc('lines', linewidth=4.00)
507521bf124a Uploaded
greg
parents:
diff changeset
29 pyplot.rc('axes', linewidth=3.00)
1
4d86371aafa8 Uploaded
greg
parents: 0
diff changeset
30 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
0
507521bf124a Uploaded
greg
parents:
diff changeset
31
507521bf124a Uploaded
greg
parents:
diff changeset
32
507521bf124a Uploaded
greg
parents:
diff changeset
33 class FrequencyDistribution(object):
507521bf124a Uploaded
greg
parents:
diff changeset
34
507521bf124a Uploaded
greg
parents:
diff changeset
35 def __init__(self, start, end, binsize=10, d=None):
507521bf124a Uploaded
greg
parents:
diff changeset
36 self.start = start
507521bf124a Uploaded
greg
parents:
diff changeset
37 self.end = end
507521bf124a Uploaded
greg
parents:
diff changeset
38 self.dist = d or {}
507521bf124a Uploaded
greg
parents:
diff changeset
39 self.binsize = binsize
507521bf124a Uploaded
greg
parents:
diff changeset
40
507521bf124a Uploaded
greg
parents:
diff changeset
41 def get_bin(self, x):
507521bf124a Uploaded
greg
parents:
diff changeset
42 """
507521bf124a Uploaded
greg
parents:
diff changeset
43 Returns the bin in which a data point falls
507521bf124a Uploaded
greg
parents:
diff changeset
44 """
507521bf124a Uploaded
greg
parents:
diff changeset
45 return self.start + (x-self.start) // self.binsize * self.binsize + self.binsize/2.0
507521bf124a Uploaded
greg
parents:
diff changeset
46
507521bf124a Uploaded
greg
parents:
diff changeset
47 def add(self, x):
507521bf124a Uploaded
greg
parents:
diff changeset
48 x = self.get_bin(x)
507521bf124a Uploaded
greg
parents:
diff changeset
49 self.dist[x] = self.dist.get(x, 0) + 1
507521bf124a Uploaded
greg
parents:
diff changeset
50
507521bf124a Uploaded
greg
parents:
diff changeset
51 def graph_series(self):
507521bf124a Uploaded
greg
parents:
diff changeset
52 x = []
507521bf124a Uploaded
greg
parents:
diff changeset
53 y = []
507521bf124a Uploaded
greg
parents:
diff changeset
54 for i in range(self.start, self.end, self.binsize):
507521bf124a Uploaded
greg
parents:
diff changeset
55 center = self.get_bin(i)
507521bf124a Uploaded
greg
parents:
diff changeset
56 x.append(center)
507521bf124a Uploaded
greg
parents:
diff changeset
57 y.append(self.dist.get(center, 0))
507521bf124a Uploaded
greg
parents:
diff changeset
58 return x, y
507521bf124a Uploaded
greg
parents:
diff changeset
59
507521bf124a Uploaded
greg
parents:
diff changeset
60 def mode(self):
507521bf124a Uploaded
greg
parents:
diff changeset
61 return max(self.dist.items(), key=lambda data: data[1])[0]
507521bf124a Uploaded
greg
parents:
diff changeset
62
507521bf124a Uploaded
greg
parents:
diff changeset
63 def size(self):
507521bf124a Uploaded
greg
parents:
diff changeset
64 return sum(self.dist.values())
507521bf124a Uploaded
greg
parents:
diff changeset
65
507521bf124a Uploaded
greg
parents:
diff changeset
66
507521bf124a Uploaded
greg
parents:
diff changeset
67 def stop_err(msg):
507521bf124a Uploaded
greg
parents:
diff changeset
68 sys.stderr.write(msg)
507521bf124a Uploaded
greg
parents:
diff changeset
69 sys.exit(1)
507521bf124a Uploaded
greg
parents:
diff changeset
70
507521bf124a Uploaded
greg
parents:
diff changeset
71
507521bf124a Uploaded
greg
parents:
diff changeset
72 def distance(peak1, peak2):
507521bf124a Uploaded
greg
parents:
diff changeset
73 return (peak2[1]+peak2[2])/2 - (peak1[1]+peak1[2])/2
507521bf124a Uploaded
greg
parents:
diff changeset
74
507521bf124a Uploaded
greg
parents:
diff changeset
75
507521bf124a Uploaded
greg
parents:
diff changeset
76 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
507521bf124a Uploaded
greg
parents:
diff changeset
77 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
507521bf124a Uploaded
greg
parents:
diff changeset
78
507521bf124a Uploaded
greg
parents:
diff changeset
79
507521bf124a Uploaded
greg
parents:
diff changeset
80 def gff_attrs(d):
507521bf124a Uploaded
greg
parents:
diff changeset
81 if not d:
507521bf124a Uploaded
greg
parents:
diff changeset
82 return '.'
507521bf124a Uploaded
greg
parents:
diff changeset
83 return ';'.join('%s=%s' % item for item in d.items())
507521bf124a Uploaded
greg
parents:
diff changeset
84
507521bf124a Uploaded
greg
parents:
diff changeset
85
507521bf124a Uploaded
greg
parents:
diff changeset
86 def parse_chromosomes(reader):
507521bf124a Uploaded
greg
parents:
diff changeset
87 # This version of cwpair2 accepts only gff format as input.
507521bf124a Uploaded
greg
parents:
diff changeset
88 chromosomes = {}
507521bf124a Uploaded
greg
parents:
diff changeset
89 reader.next()
507521bf124a Uploaded
greg
parents:
diff changeset
90 for line in reader:
507521bf124a Uploaded
greg
parents:
diff changeset
91 cname, junk, junk, start, end, value, strand, junk, junk = line
507521bf124a Uploaded
greg
parents:
diff changeset
92 start = int(start)
507521bf124a Uploaded
greg
parents:
diff changeset
93 end = int(end)
507521bf124a Uploaded
greg
parents:
diff changeset
94 value = float(value)
507521bf124a Uploaded
greg
parents:
diff changeset
95 if cname not in chromosomes:
507521bf124a Uploaded
greg
parents:
diff changeset
96 chromosomes[cname] = []
507521bf124a Uploaded
greg
parents:
diff changeset
97 peaks = chromosomes[cname]
507521bf124a Uploaded
greg
parents:
diff changeset
98 peaks.append((strand, start, end, value))
507521bf124a Uploaded
greg
parents:
diff changeset
99 return chromosomes
507521bf124a Uploaded
greg
parents:
diff changeset
100
507521bf124a Uploaded
greg
parents:
diff changeset
101
507521bf124a Uploaded
greg
parents:
diff changeset
102 def perc95(chromosomes):
507521bf124a Uploaded
greg
parents:
diff changeset
103 """
507521bf124a Uploaded
greg
parents:
diff changeset
104 Returns the 95th percentile value of the given chromosomes.
507521bf124a Uploaded
greg
parents:
diff changeset
105 """
507521bf124a Uploaded
greg
parents:
diff changeset
106 values = []
507521bf124a Uploaded
greg
parents:
diff changeset
107 for peaks in chromosomes.values():
507521bf124a Uploaded
greg
parents:
diff changeset
108 for peak in peaks:
507521bf124a Uploaded
greg
parents:
diff changeset
109 values.append(peak[3])
507521bf124a Uploaded
greg
parents:
diff changeset
110 values.sort()
507521bf124a Uploaded
greg
parents:
diff changeset
111 # Get 95% value
507521bf124a Uploaded
greg
parents:
diff changeset
112 return values[int(len(values)*0.95)]
507521bf124a Uploaded
greg
parents:
diff changeset
113
507521bf124a Uploaded
greg
parents:
diff changeset
114
507521bf124a Uploaded
greg
parents:
diff changeset
115 def filter(chromosomes, threshold=0.05):
507521bf124a Uploaded
greg
parents:
diff changeset
116 """
507521bf124a Uploaded
greg
parents:
diff changeset
117 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted
507521bf124a Uploaded
greg
parents:
diff changeset
118 as a proportion of the maximum, >=1.0 as an absolute value.
507521bf124a Uploaded
greg
parents:
diff changeset
119 """
507521bf124a Uploaded
greg
parents:
diff changeset
120 if threshold < 1:
507521bf124a Uploaded
greg
parents:
diff changeset
121 p95 = perc95(chromosomes)
507521bf124a Uploaded
greg
parents:
diff changeset
122 threshold = p95 * threshold
507521bf124a Uploaded
greg
parents:
diff changeset
123 # Make the threshold a proportion of the
507521bf124a Uploaded
greg
parents:
diff changeset
124 for cname, peaks in chromosomes.items():
507521bf124a Uploaded
greg
parents:
diff changeset
125 chromosomes[cname] = [peak for peak in peaks if peak[3] > threshold]
507521bf124a Uploaded
greg
parents:
diff changeset
126
507521bf124a Uploaded
greg
parents:
diff changeset
127
507521bf124a Uploaded
greg
parents:
diff changeset
128 def split_strands(chromosome):
507521bf124a Uploaded
greg
parents:
diff changeset
129 watson = [peak for peak in chromosome if peak[0] == '+']
507521bf124a Uploaded
greg
parents:
diff changeset
130 crick = [peak for peak in chromosome if peak[0] == '-']
507521bf124a Uploaded
greg
parents:
diff changeset
131 return watson, crick
507521bf124a Uploaded
greg
parents:
diff changeset
132
507521bf124a Uploaded
greg
parents:
diff changeset
133
507521bf124a Uploaded
greg
parents:
diff changeset
134 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize):
507521bf124a Uploaded
greg
parents:
diff changeset
135 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
507521bf124a Uploaded
greg
parents:
diff changeset
136 for cname, data in chromosomes.items():
507521bf124a Uploaded
greg
parents:
diff changeset
137 watson, crick = split_strands(data)
507521bf124a Uploaded
greg
parents:
diff changeset
138 crick.sort(key=lambda data: float(data[1]))
507521bf124a Uploaded
greg
parents:
diff changeset
139 keys = make_keys(crick)
507521bf124a Uploaded
greg
parents:
diff changeset
140 for peak in watson:
507521bf124a Uploaded
greg
parents:
diff changeset
141 for cpeak in get_window(crick, peak, up_distance, down_distance, keys):
507521bf124a Uploaded
greg
parents:
diff changeset
142 dist.add(distance(peak, cpeak))
507521bf124a Uploaded
greg
parents:
diff changeset
143 return dist
507521bf124a Uploaded
greg
parents:
diff changeset
144
507521bf124a Uploaded
greg
parents:
diff changeset
145
507521bf124a Uploaded
greg
parents:
diff changeset
146 def make_keys(crick):
507521bf124a Uploaded
greg
parents:
diff changeset
147 return [(data[1] + data[2])//2 for data in crick]
507521bf124a Uploaded
greg
parents:
diff changeset
148
507521bf124a Uploaded
greg
parents:
diff changeset
149
507521bf124a Uploaded
greg
parents:
diff changeset
150 def get_window(crick, peak, up_distance, down_distance, keys=None):
507521bf124a Uploaded
greg
parents:
diff changeset
151 """
507521bf124a Uploaded
greg
parents:
diff changeset
152 Returns a window of all crick peaks within a distance of a watson peak.
507521bf124a Uploaded
greg
parents:
diff changeset
153 crick strand MUST be sorted by distance
507521bf124a Uploaded
greg
parents:
diff changeset
154 """
507521bf124a Uploaded
greg
parents:
diff changeset
155 strand, start, end, value = peak
507521bf124a Uploaded
greg
parents:
diff changeset
156 midpoint = (start + end) // 2
507521bf124a Uploaded
greg
parents:
diff changeset
157 lower = midpoint - up_distance
507521bf124a Uploaded
greg
parents:
diff changeset
158 upper = midpoint + down_distance
507521bf124a Uploaded
greg
parents:
diff changeset
159 keys = keys or make_keys(crick)
507521bf124a Uploaded
greg
parents:
diff changeset
160 start_index = bisect.bisect_left(keys, lower)
507521bf124a Uploaded
greg
parents:
diff changeset
161 end_index = bisect.bisect_right(keys, upper)
507521bf124a Uploaded
greg
parents:
diff changeset
162 return [cpeak for cpeak in crick[start_index:end_index]]
507521bf124a Uploaded
greg
parents:
diff changeset
163
507521bf124a Uploaded
greg
parents:
diff changeset
164
507521bf124a Uploaded
greg
parents:
diff changeset
165 def match_largest(window, peak):
507521bf124a Uploaded
greg
parents:
diff changeset
166 if not window:
507521bf124a Uploaded
greg
parents:
diff changeset
167 return None
507521bf124a Uploaded
greg
parents:
diff changeset
168 return max(window, key=lambda cpeak: cpeak[3])
507521bf124a Uploaded
greg
parents:
diff changeset
169
507521bf124a Uploaded
greg
parents:
diff changeset
170
507521bf124a Uploaded
greg
parents:
diff changeset
171 def match_closest(window, peak):
507521bf124a Uploaded
greg
parents:
diff changeset
172 if not window:
507521bf124a Uploaded
greg
parents:
diff changeset
173 return None
507521bf124a Uploaded
greg
parents:
diff changeset
174
507521bf124a Uploaded
greg
parents:
diff changeset
175 def key(cpeak):
507521bf124a Uploaded
greg
parents:
diff changeset
176 d = distance(peak, cpeak)
507521bf124a Uploaded
greg
parents:
diff changeset
177 # Search negative distances last
507521bf124a Uploaded
greg
parents:
diff changeset
178 if d < 0:
507521bf124a Uploaded
greg
parents:
diff changeset
179 # And then prefer less negative distances
507521bf124a Uploaded
greg
parents:
diff changeset
180 d = 10000 - d
507521bf124a Uploaded
greg
parents:
diff changeset
181 return d
507521bf124a Uploaded
greg
parents:
diff changeset
182 return min(window, key=key)
507521bf124a Uploaded
greg
parents:
diff changeset
183
507521bf124a Uploaded
greg
parents:
diff changeset
184
507521bf124a Uploaded
greg
parents:
diff changeset
185 def match_mode(window, peak, mode):
507521bf124a Uploaded
greg
parents:
diff changeset
186 if not window:
507521bf124a Uploaded
greg
parents:
diff changeset
187 return None
507521bf124a Uploaded
greg
parents:
diff changeset
188 return min(window, key=lambda cpeak: abs(distance(peak, cpeak)-mode))
507521bf124a Uploaded
greg
parents:
diff changeset
189
507521bf124a Uploaded
greg
parents:
diff changeset
190 METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest}
507521bf124a Uploaded
greg
parents:
diff changeset
191
507521bf124a Uploaded
greg
parents:
diff changeset
192
507521bf124a Uploaded
greg
parents:
diff changeset
193 def frequency_plot(freqs, fname, labels=[], title=''):
507521bf124a Uploaded
greg
parents:
diff changeset
194 pyplot.clf()
507521bf124a Uploaded
greg
parents:
diff changeset
195 pyplot.figure(figsize=(10, 10))
507521bf124a Uploaded
greg
parents:
diff changeset
196 for i, freq in enumerate(freqs):
507521bf124a Uploaded
greg
parents:
diff changeset
197 x, y = freq.graph_series()
507521bf124a Uploaded
greg
parents:
diff changeset
198 pyplot.plot(x, y, '%s-' % COLORS[i])
507521bf124a Uploaded
greg
parents:
diff changeset
199 if len(freqs) > 1:
507521bf124a Uploaded
greg
parents:
diff changeset
200 pyplot.legend(labels)
507521bf124a Uploaded
greg
parents:
diff changeset
201 pyplot.xlim(freq.start, freq.end)
507521bf124a Uploaded
greg
parents:
diff changeset
202 pyplot.ylim(ymin=0)
507521bf124a Uploaded
greg
parents:
diff changeset
203 pyplot.ylabel(Y_LABEL)
507521bf124a Uploaded
greg
parents:
diff changeset
204 pyplot.xlabel(X_LABEL)
507521bf124a Uploaded
greg
parents:
diff changeset
205 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
507521bf124a Uploaded
greg
parents:
diff changeset
206 # Get the current axes
507521bf124a Uploaded
greg
parents:
diff changeset
207 ax = pyplot.gca()
507521bf124a Uploaded
greg
parents:
diff changeset
208 for l in ax.get_xticklines() + ax.get_yticklines():
507521bf124a Uploaded
greg
parents:
diff changeset
209 l.set_markeredgewidth(TICK_WIDTH)
507521bf124a Uploaded
greg
parents:
diff changeset
210 pyplot.savefig(fname)
507521bf124a Uploaded
greg
parents:
diff changeset
211
507521bf124a Uploaded
greg
parents:
diff changeset
212
507521bf124a Uploaded
greg
parents:
diff changeset
213 def create_directories(method):
507521bf124a Uploaded
greg
parents:
diff changeset
214 if method == 'all':
507521bf124a Uploaded
greg
parents:
diff changeset
215 match_methods = METHODS.keys()
507521bf124a Uploaded
greg
parents:
diff changeset
216 else:
507521bf124a Uploaded
greg
parents:
diff changeset
217 match_methods = [method]
507521bf124a Uploaded
greg
parents:
diff changeset
218 for match_method in match_methods:
507521bf124a Uploaded
greg
parents:
diff changeset
219 os.mkdir('%s_%s' % (match_method, DETAILS))
507521bf124a Uploaded
greg
parents:
diff changeset
220 os.mkdir('%s_%s' % (match_method, FINAL_PLOTS))
507521bf124a Uploaded
greg
parents:
diff changeset
221 os.mkdir('%s_%s' % (match_method, ORPHANS))
507521bf124a Uploaded
greg
parents:
diff changeset
222 os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS))
507521bf124a Uploaded
greg
parents:
diff changeset
223 os.mkdir('%s_%s' % (match_method, SIMPLES))
507521bf124a Uploaded
greg
parents:
diff changeset
224 os.mkdir('%s_%s' % (match_method, STATS_GRAPH))
507521bf124a Uploaded
greg
parents:
diff changeset
225
507521bf124a Uploaded
greg
parents:
diff changeset
226
2
279cdc63bcff Uploaded
greg
parents: 1
diff changeset
227 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance,
5
2e0ddcc726f9 Uploaded
greg
parents: 2
diff changeset
228 down_distance, binsize, output_files, sort_score):
0
507521bf124a Uploaded
greg
parents:
diff changeset
229 if method == 'all':
507521bf124a Uploaded
greg
parents:
diff changeset
230 match_methods = METHODS.keys()
507521bf124a Uploaded
greg
parents:
diff changeset
231 else:
507521bf124a Uploaded
greg
parents:
diff changeset
232 match_methods = [method]
507521bf124a Uploaded
greg
parents:
diff changeset
233 statistics = []
507521bf124a Uploaded
greg
parents:
diff changeset
234 for match_method in match_methods:
507521bf124a Uploaded
greg
parents:
diff changeset
235 stats = perform_process(dataset_path,
507521bf124a Uploaded
greg
parents:
diff changeset
236 galaxy_hid,
507521bf124a Uploaded
greg
parents:
diff changeset
237 match_method,
507521bf124a Uploaded
greg
parents:
diff changeset
238 threshold,
507521bf124a Uploaded
greg
parents:
diff changeset
239 up_distance,
507521bf124a Uploaded
greg
parents:
diff changeset
240 down_distance,
507521bf124a Uploaded
greg
parents:
diff changeset
241 binsize,
507521bf124a Uploaded
greg
parents:
diff changeset
242 output_files,
507521bf124a Uploaded
greg
parents:
diff changeset
243 sort_score)
507521bf124a Uploaded
greg
parents:
diff changeset
244 statistics.append(stats)
507521bf124a Uploaded
greg
parents:
diff changeset
245 if output_files == 'all' and method == 'all':
507521bf124a Uploaded
greg
parents:
diff changeset
246 frequency_plot([s['dist'] for s in statistics],
507521bf124a Uploaded
greg
parents:
diff changeset
247 statistics[0]['graph_path'],
507521bf124a Uploaded
greg
parents:
diff changeset
248 labels=METHODS.keys())
507521bf124a Uploaded
greg
parents:
diff changeset
249 return statistics
507521bf124a Uploaded
greg
parents:
diff changeset
250
507521bf124a Uploaded
greg
parents:
diff changeset
251
507521bf124a Uploaded
greg
parents:
diff changeset
252 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
5
2e0ddcc726f9 Uploaded
greg
parents: 2
diff changeset
253 down_distance, binsize, output_files, sort_score):
0
507521bf124a Uploaded
greg
parents:
diff changeset
254 output_details = output_files in ["all", "simple_orphan_detail"]
507521bf124a Uploaded
greg
parents:
diff changeset
255 output_plots = output_files in ["all"]
507521bf124a Uploaded
greg
parents:
diff changeset
256 output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"]
507521bf124a Uploaded
greg
parents:
diff changeset
257 # Keep track of statistics for the output file
507521bf124a Uploaded
greg
parents:
diff changeset
258 statistics = {}
507521bf124a Uploaded
greg
parents:
diff changeset
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
507521bf124a Uploaded
greg
parents:
diff changeset
260 fpath, fname = os.path.split(dataset_path)
507521bf124a Uploaded
greg
parents:
diff changeset
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
507521bf124a Uploaded
greg
parents:
diff changeset
262 statistics['dir'] = fpath
507521bf124a Uploaded
greg
parents:
diff changeset
263 if threshold >= 1:
507521bf124a Uploaded
greg
parents:
diff changeset
264 filter_string = 'fa%d' % threshold
507521bf124a Uploaded
greg
parents:
diff changeset
265 else:
507521bf124a Uploaded
greg
parents:
diff changeset
266 filter_string = 'f%d' % (threshold * 100)
507521bf124a Uploaded
greg
parents:
diff changeset
267 fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize)
507521bf124a Uploaded
greg
parents:
diff changeset
268
507521bf124a Uploaded
greg
parents:
diff changeset
269 def make_path(output_type, extension=TABULAR_EXT):
507521bf124a Uploaded
greg
parents:
diff changeset
270 # Returns the full path for a certain output.
507521bf124a Uploaded
greg
parents:
diff changeset
271 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))
507521bf124a Uploaded
greg
parents:
diff changeset
272
507521bf124a Uploaded
greg
parents:
diff changeset
273 def td_writer(output_type, extension=TABULAR_EXT):
507521bf124a Uploaded
greg
parents:
diff changeset
274 # Returns a tab-delimited writer for a specified output.
507521bf124a Uploaded
greg
parents:
diff changeset
275 output_file_path = make_path(output_type, extension)
507521bf124a Uploaded
greg
parents:
diff changeset
276 return csv.writer(open(output_file_path, 'wt'), delimiter='\t')
507521bf124a Uploaded
greg
parents:
diff changeset
277
507521bf124a Uploaded
greg
parents:
diff changeset
278 try:
507521bf124a Uploaded
greg
parents:
diff changeset
279 chromosomes = parse_chromosomes(input)
507521bf124a Uploaded
greg
parents:
diff changeset
280 except Exception:
507521bf124a Uploaded
greg
parents:
diff changeset
281 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
507521bf124a Uploaded
greg
parents:
diff changeset
282 if output_details:
507521bf124a Uploaded
greg
parents:
diff changeset
283 # Details
507521bf124a Uploaded
greg
parents:
diff changeset
284 detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT)
507521bf124a Uploaded
greg
parents:
diff changeset
285 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
507521bf124a Uploaded
greg
parents:
diff changeset
286 if output_plots:
507521bf124a Uploaded
greg
parents:
diff changeset
287 # Final Plot
2
279cdc63bcff Uploaded
greg
parents: 1
diff changeset
288 final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), PLOT_FORMAT)
0
507521bf124a Uploaded
greg
parents:
diff changeset
289 if output_orphans:
507521bf124a Uploaded
greg
parents:
diff changeset
290 # Orphans
507521bf124a Uploaded
greg
parents:
diff changeset
291 orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT)
507521bf124a Uploaded
greg
parents:
diff changeset
292 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value'))
507521bf124a Uploaded
greg
parents:
diff changeset
293 if output_plots:
507521bf124a Uploaded
greg
parents:
diff changeset
294 # Preview Plot
2
279cdc63bcff Uploaded
greg
parents: 1
diff changeset
295 preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), PLOT_FORMAT)
0
507521bf124a Uploaded
greg
parents:
diff changeset
296 # Simple
507521bf124a Uploaded
greg
parents:
diff changeset
297 simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT)
507521bf124a Uploaded
greg
parents:
diff changeset
298 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
507521bf124a Uploaded
greg
parents:
diff changeset
299 if output_plots:
2
279cdc63bcff Uploaded
greg
parents: 1
diff changeset
300 statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), PLOT_FORMAT)
0
507521bf124a Uploaded
greg
parents:
diff changeset
301 statistics['perc95'] = perc95(chromosomes)
507521bf124a Uploaded
greg
parents:
diff changeset
302 if threshold > 0:
507521bf124a Uploaded
greg
parents:
diff changeset
303 # Apply filter
507521bf124a Uploaded
greg
parents:
diff changeset
304 filter(chromosomes, threshold)
507521bf124a Uploaded
greg
parents:
diff changeset
305 if method == 'mode':
507521bf124a Uploaded
greg
parents:
diff changeset
306 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize)
507521bf124a Uploaded
greg
parents:
diff changeset
307 mode = freq.mode()
507521bf124a Uploaded
greg
parents:
diff changeset
308 statistics['preview_mode'] = mode
507521bf124a Uploaded
greg
parents:
diff changeset
309 if output_plots:
507521bf124a Uploaded
greg
parents:
diff changeset
310 frequency_plot([freq], preview_plot_path, title='Preview frequency plot')
507521bf124a Uploaded
greg
parents:
diff changeset
311 else:
507521bf124a Uploaded
greg
parents:
diff changeset
312 statistics['preview_mode'] = 'NA'
507521bf124a Uploaded
greg
parents:
diff changeset
313 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
507521bf124a Uploaded
greg
parents:
diff changeset
314 orphans = 0
507521bf124a Uploaded
greg
parents:
diff changeset
315 # x will be used to archive the summary dataset
507521bf124a Uploaded
greg
parents:
diff changeset
316 x = []
507521bf124a Uploaded
greg
parents:
diff changeset
317 for cname, chromosome in chromosomes.items():
507521bf124a Uploaded
greg
parents:
diff changeset
318 # Each peak is (strand, start, end, value)
507521bf124a Uploaded
greg
parents:
diff changeset
319 watson, crick = split_strands(chromosome)
507521bf124a Uploaded
greg
parents:
diff changeset
320 # Sort by value of each peak
507521bf124a Uploaded
greg
parents:
diff changeset
321 watson.sort(key=lambda data: -float(data[3]))
507521bf124a Uploaded
greg
parents:
diff changeset
322 # Sort by position to facilitate binary search
507521bf124a Uploaded
greg
parents:
diff changeset
323 crick.sort(key=lambda data: float(data[1]))
507521bf124a Uploaded
greg
parents:
diff changeset
324 keys = make_keys(crick)
507521bf124a Uploaded
greg
parents:
diff changeset
325 for peak in watson:
507521bf124a Uploaded
greg
parents:
diff changeset
326 window = get_window(crick, peak, up_distance, down_distance, keys)
507521bf124a Uploaded
greg
parents:
diff changeset
327 if method == 'mode':
507521bf124a Uploaded
greg
parents:
diff changeset
328 match = match_mode(window, peak, mode)
507521bf124a Uploaded
greg
parents:
diff changeset
329 else:
507521bf124a Uploaded
greg
parents:
diff changeset
330 match = METHODS[method](window, peak)
507521bf124a Uploaded
greg
parents:
diff changeset
331 if match:
507521bf124a Uploaded
greg
parents:
diff changeset
332 midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4
507521bf124a Uploaded
greg
parents:
diff changeset
333 d = distance(peak, match)
507521bf124a Uploaded
greg
parents:
diff changeset
334 dist.add(d)
507521bf124a Uploaded
greg
parents:
diff changeset
335 # Simple output in gff format.
507521bf124a Uploaded
greg
parents:
diff changeset
336 x.append(gff_row(cname,
507521bf124a Uploaded
greg
parents:
diff changeset
337 source='cwpair',
507521bf124a Uploaded
greg
parents:
diff changeset
338 start=midpoint,
507521bf124a Uploaded
greg
parents:
diff changeset
339 end=midpoint + 1,
507521bf124a Uploaded
greg
parents:
diff changeset
340 score=peak[3] + match[3],
507521bf124a Uploaded
greg
parents:
diff changeset
341 attrs={'cw_distance': d}))
507521bf124a Uploaded
greg
parents:
diff changeset
342 if output_details:
507521bf124a Uploaded
greg
parents:
diff changeset
343 detailed_output.writerow((cname,
507521bf124a Uploaded
greg
parents:
diff changeset
344 peak[1],
507521bf124a Uploaded
greg
parents:
diff changeset
345 peak[2],
507521bf124a Uploaded
greg
parents:
diff changeset
346 peak[3],
507521bf124a Uploaded
greg
parents:
diff changeset
347 '+',
507521bf124a Uploaded
greg
parents:
diff changeset
348 cname,
507521bf124a Uploaded
greg
parents:
diff changeset
349 match[1],
507521bf124a Uploaded
greg
parents:
diff changeset
350 match[2],
507521bf124a Uploaded
greg
parents:
diff changeset
351 match[3], '-',
507521bf124a Uploaded
greg
parents:
diff changeset
352 midpoint,
507521bf124a Uploaded
greg
parents:
diff changeset
353 peak[3]+match[3],
507521bf124a Uploaded
greg
parents:
diff changeset
354 d))
507521bf124a Uploaded
greg
parents:
diff changeset
355 i = bisect.bisect_left(keys, (match[1]+match[2])/2)
507521bf124a Uploaded
greg
parents:
diff changeset
356 del crick[i]
507521bf124a Uploaded
greg
parents:
diff changeset
357 del keys[i]
507521bf124a Uploaded
greg
parents:
diff changeset
358 else:
507521bf124a Uploaded
greg
parents:
diff changeset
359 if output_orphans:
507521bf124a Uploaded
greg
parents:
diff changeset
360 orphan_output.writerow((cname, peak[0], peak[1], peak[2], peak[3]))
507521bf124a Uploaded
greg
parents:
diff changeset
361 # Keep track of orphans for statistics.
507521bf124a Uploaded
greg
parents:
diff changeset
362 orphans += 1
507521bf124a Uploaded
greg
parents:
diff changeset
363 # Remaining crick peaks are orphans
507521bf124a Uploaded
greg
parents:
diff changeset
364 if output_orphans:
507521bf124a Uploaded
greg
parents:
diff changeset
365 for cpeak in crick:
507521bf124a Uploaded
greg
parents:
diff changeset
366 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3]))
507521bf124a Uploaded
greg
parents:
diff changeset
367 # Keep track of orphans for statistics.
507521bf124a Uploaded
greg
parents:
diff changeset
368 orphans += len(crick)
507521bf124a Uploaded
greg
parents:
diff changeset
369 # Sort output by score if specified.
507521bf124a Uploaded
greg
parents:
diff changeset
370 if sort_score == "desc":
507521bf124a Uploaded
greg
parents:
diff changeset
371 x.sort(key=lambda data: float(data[5]), reverse=True)
507521bf124a Uploaded
greg
parents:
diff changeset
372 elif sort_score == "asc":
507521bf124a Uploaded
greg
parents:
diff changeset
373 x.sort(key=lambda data: float(data[5]))
5
2e0ddcc726f9 Uploaded
greg
parents: 2
diff changeset
374 # Writing a summary to gff format file
0
507521bf124a Uploaded
greg
parents:
diff changeset
375 for row in x:
507521bf124a Uploaded
greg
parents:
diff changeset
376 row_tmp = list(row)
507521bf124a Uploaded
greg
parents:
diff changeset
377 # Dataset in tuple cannot be modified in Python, so row will
507521bf124a Uploaded
greg
parents:
diff changeset
378 # be converted to list format to add 'chr'.
507521bf124a Uploaded
greg
parents:
diff changeset
379 if row_tmp[0] == "999":
507521bf124a Uploaded
greg
parents:
diff changeset
380 row_tmp[0] = 'chrM'
507521bf124a Uploaded
greg
parents:
diff changeset
381 elif row_tmp[0] == "998":
507521bf124a Uploaded
greg
parents:
diff changeset
382 row_tmp[0] = 'chrY'
507521bf124a Uploaded
greg
parents:
diff changeset
383 elif row_tmp[0] == "997":
507521bf124a Uploaded
greg
parents:
diff changeset
384 row_tmp[0] = 'chrX'
507521bf124a Uploaded
greg
parents:
diff changeset
385 else:
507521bf124a Uploaded
greg
parents:
diff changeset
386 row_tmp[0] = row_tmp[0]
507521bf124a Uploaded
greg
parents:
diff changeset
387 # Print row_tmp.
507521bf124a Uploaded
greg
parents:
diff changeset
388 simple_output.writerow(row_tmp)
507521bf124a Uploaded
greg
parents:
diff changeset
389 statistics['paired'] = dist.size() * 2
507521bf124a Uploaded
greg
parents:
diff changeset
390 statistics['orphans'] = orphans
507521bf124a Uploaded
greg
parents:
diff changeset
391 statistics['final_mode'] = dist.mode()
507521bf124a Uploaded
greg
parents:
diff changeset
392 if output_plots:
507521bf124a Uploaded
greg
parents:
diff changeset
393 frequency_plot([dist], final_plot_path, title='Frequency distribution')
507521bf124a Uploaded
greg
parents:
diff changeset
394 statistics['dist'] = dist
507521bf124a Uploaded
greg
parents:
diff changeset
395 return statistics