Mercurial > repos > bgruening > repmatch_gff3
comparison repmatch_gff3_util.py @ 0:b26d50dd3226 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 0e04a4c237677c1f5be1950babcf8591097996a9
| author | bgruening |
|---|---|
| date | Wed, 23 Dec 2015 09:24:35 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b26d50dd3226 |
|---|---|
| 1 import bisect | |
| 2 import csv | |
| 3 import os | |
| 4 import shutil | |
| 5 import sys | |
| 6 import tempfile | |
| 7 import matplotlib | |
| 8 matplotlib.use('Agg') | |
| 9 from matplotlib import pyplot | |
| 10 | |
| 11 # Graph settings | |
| 12 Y_LABEL = 'Counts' | |
| 13 X_LABEL = 'Number of matched replicates' | |
| 14 TICK_WIDTH = 3 | |
| 15 # Amount to shift the graph to make labels fit, [left, right, top, bottom] | |
| 16 ADJUST = [0.180, 0.9, 0.9, 0.1] | |
| 17 # Length of tick marks, use TICK_WIDTH for width | |
| 18 pyplot.rc('xtick.major', size=10.00) | |
| 19 pyplot.rc('ytick.major', size=10.00) | |
| 20 pyplot.rc('lines', linewidth=4.00) | |
| 21 pyplot.rc('axes', linewidth=3.00) | |
| 22 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) | |
| 23 | |
| 24 COLORS = 'krb' | |
| 25 | |
| 26 | |
| 27 class Replicate(object): | |
| 28 | |
| 29 def __init__(self, id, dataset_path): | |
| 30 self.id = id | |
| 31 self.dataset_path = dataset_path | |
| 32 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) | |
| 33 | |
| 34 def parse(self, reader): | |
| 35 self.chromosomes = {} | |
| 36 for line in reader: | |
| 37 if line[0].startswith("#") or line[0].startswith('"'): | |
| 38 continue | |
| 39 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line | |
| 40 attrs = parse_gff_attrs(attrs) | |
| 41 distance = attrs['cw_distance'] | |
| 42 mid = int(mid) | |
| 43 midplus = int(midplus) | |
| 44 value = float(value) | |
| 45 distance = int(distance) | |
| 46 if cname not in self.chromosomes: | |
| 47 self.chromosomes[cname] = Chromosome(cname) | |
| 48 chrom = self.chromosomes[cname] | |
| 49 chrom.add_peak(Peak(cname, mid, value, distance, self)) | |
| 50 for chrom in self.chromosomes.values(): | |
| 51 chrom.sort_by_index() | |
| 52 | |
| 53 def filter(self, up_limit, low_limit): | |
| 54 for chrom in self.chromosomes.values(): | |
| 55 chrom.filter(up_limit, low_limit) | |
| 56 | |
| 57 def size(self): | |
| 58 return sum([len(c.peaks) for c in self.chromosomes.values()]) | |
| 59 | |
| 60 | |
| 61 class Chromosome(object): | |
| 62 | |
| 63 def __init__(self, name): | |
| 64 self.name = name | |
| 65 self.peaks = [] | |
| 66 | |
| 67 def add_peak(self, peak): | |
| 68 self.peaks.append(peak) | |
| 69 | |
| 70 def sort_by_index(self): | |
| 71 self.peaks.sort(key=lambda peak: peak.midpoint) | |
| 72 self.keys = make_keys(self.peaks) | |
| 73 | |
| 74 def remove_peak(self, peak): | |
| 75 i = bisect.bisect_left(self.keys, peak.midpoint) | |
| 76 # If the peak was actually found | |
| 77 if i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint: | |
| 78 del self.keys[i] | |
| 79 del self.peaks[i] | |
| 80 | |
| 81 def filter(self, up_limit, low_limit): | |
| 82 self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit] | |
| 83 self.keys = make_keys(self.peaks) | |
| 84 | |
| 85 | |
| 86 class Peak(object): | |
| 87 | |
| 88 def __init__(self, chrom, midpoint, value, distance, replicate): | |
| 89 self.chrom = chrom | |
| 90 self.value = value | |
| 91 self.midpoint = midpoint | |
| 92 self.distance = distance | |
| 93 self.replicate = replicate | |
| 94 | |
| 95 def normalized_value(self, med): | |
| 96 return self.value * med / self.replicate.median | |
| 97 | |
| 98 | |
| 99 class PeakGroup(object): | |
| 100 | |
| 101 def __init__(self): | |
| 102 self.peaks = {} | |
| 103 | |
| 104 def add_peak(self, repid, peak): | |
| 105 self.peaks[repid] = peak | |
| 106 | |
| 107 @property | |
| 108 def chrom(self): | |
| 109 return self.peaks.values()[0].chrom | |
| 110 | |
| 111 @property | |
| 112 def midpoint(self): | |
| 113 return median([peak.midpoint for peak in self.peaks.values()]) | |
| 114 | |
| 115 @property | |
| 116 def num_replicates(self): | |
| 117 return len(self.peaks) | |
| 118 | |
| 119 @property | |
| 120 def median_distance(self): | |
| 121 return median([peak.distance for peak in self.peaks.values()]) | |
| 122 | |
| 123 @property | |
| 124 def value_sum(self): | |
| 125 return sum([peak.value for peak in self.peaks.values()]) | |
| 126 | |
| 127 def normalized_value(self, med): | |
| 128 values = [] | |
| 129 for peak in self.peaks.values(): | |
| 130 values.append(peak.normalized_value(med)) | |
| 131 return median(values) | |
| 132 | |
| 133 @property | |
| 134 def peakpeak_distance(self): | |
| 135 keys = self.peaks.keys() | |
| 136 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) | |
| 137 | |
| 138 | |
| 139 class FrequencyDistribution(object): | |
| 140 | |
| 141 def __init__(self, d=None): | |
| 142 self.dist = d or {} | |
| 143 | |
| 144 def add(self, x): | |
| 145 self.dist[x] = self.dist.get(x, 0) + 1 | |
| 146 | |
| 147 def graph_series(self): | |
| 148 x = [] | |
| 149 y = [] | |
| 150 for key, val in self.dist.items(): | |
| 151 x.append(key) | |
| 152 y.append(val) | |
| 153 return x, y | |
| 154 | |
| 155 def mode(self): | |
| 156 return max(self.dist.items(), key=lambda data: data[1])[0] | |
| 157 | |
| 158 def size(self): | |
| 159 return sum(self.dist.values()) | |
| 160 | |
| 161 | |
| 162 def stop_err(msg): | |
| 163 sys.stderr.write(msg) | |
| 164 sys.exit(1) | |
| 165 | |
| 166 | |
| 167 def median(data): | |
| 168 """ | |
| 169 Find the integer median of the data set. | |
| 170 """ | |
| 171 if not data: | |
| 172 return 0 | |
| 173 sdata = sorted(data) | |
| 174 if len(data) % 2 == 0: | |
| 175 return (sdata[len(data)//2] + sdata[len(data)//2-1]) / 2 | |
| 176 else: | |
| 177 return sdata[len(data)//2] | |
| 178 | |
| 179 | |
| 180 def make_keys(peaks): | |
| 181 return [data.midpoint for data in peaks] | |
| 182 | |
| 183 | |
| 184 def get_window(chromosome, target_peaks, distance): | |
| 185 """ | |
| 186 Returns a window of all peaks from a replicate within a certain distance of | |
| 187 a peak from another replicate. | |
| 188 """ | |
| 189 lower = target_peaks[0].midpoint | |
| 190 upper = target_peaks[0].midpoint | |
| 191 for peak in target_peaks: | |
| 192 lower = min(lower, peak.midpoint - distance) | |
| 193 upper = max(upper, peak.midpoint + distance) | |
| 194 start_index = bisect.bisect_left(chromosome.keys, lower) | |
| 195 end_index = bisect.bisect_right(chromosome.keys, upper) | |
| 196 return (chromosome.peaks[start_index: end_index], chromosome.name) | |
| 197 | |
| 198 | |
| 199 def match_largest(window, peak, chrum): | |
| 200 if not window: | |
| 201 return None | |
| 202 if peak.chrom != chrum: | |
| 203 return None | |
| 204 return max(window, key=lambda cpeak: cpeak.value) | |
| 205 | |
| 206 | |
| 207 def match_closest(window, peak, chrum): | |
| 208 if not window: | |
| 209 return None | |
| 210 if peak.chrom != chrum: | |
| 211 return None | |
| 212 return min(window, key=lambda match: abs(match.midpoint - peak.midpoint)) | |
| 213 | |
| 214 | |
| 215 def frequency_histogram(freqs, dataset_path, labels=[], title=''): | |
| 216 pyplot.clf() | |
| 217 pyplot.figure(figsize=(10, 10)) | |
| 218 for i, freq in enumerate(freqs): | |
| 219 xvals, yvals = freq.graph_series() | |
| 220 # Go from high to low | |
| 221 xvals.reverse() | |
| 222 pyplot.bar([x-0.4 + 0.8/len(freqs)*i for x in xvals], yvals, width=0.8/len(freqs), color=COLORS[i]) | |
| 223 pyplot.xticks(range(min(xvals), max(xvals)+1), map(str, reversed(range(min(xvals), max(xvals)+1)))) | |
| 224 pyplot.xlabel(X_LABEL) | |
| 225 pyplot.ylabel(Y_LABEL) | |
| 226 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) | |
| 227 ax = pyplot.gca() | |
| 228 for l in ax.get_xticklines() + ax.get_yticklines(): | |
| 229 l.set_markeredgewidth(TICK_WIDTH) | |
| 230 pyplot.savefig(dataset_path) | |
| 231 | |
| 232 | |
| 233 METHODS = {'closest': match_closest, 'largest': match_largest} | |
| 234 | |
| 235 | |
| 236 def gff_attrs(d): | |
| 237 if not d: | |
| 238 return '.' | |
| 239 return ';'.join('%s=%s' % item for item in d.items()) | |
| 240 | |
| 241 | |
| 242 def parse_gff_attrs(s): | |
| 243 d = {} | |
| 244 if s == '.': | |
| 245 return d | |
| 246 for item in s.split(';'): | |
| 247 key, val = item.split('=') | |
| 248 d[key] = val | |
| 249 return d | |
| 250 | |
| 251 | |
| 252 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
| 253 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
| 254 | |
| 255 | |
| 256 def get_temporary_plot_path(): | |
| 257 """ | |
| 258 Return the path to a temporary file with a valid image format | |
| 259 file extension that can be used with bioformats. | |
| 260 """ | |
| 261 tmp_dir = tempfile.mkdtemp(prefix='tmp-repmatch-') | |
| 262 fd, name = tempfile.mkstemp(suffix='.pdf', dir=tmp_dir) | |
| 263 os.close(fd) | |
| 264 return name | |
| 265 | |
| 266 | |
| 267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, | |
| 268 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): | |
| 269 output_statistics_histogram_file = output_files in ["all"] and method in ["all"] | |
| 270 if len(dataset_paths) < 2: | |
| 271 return | |
| 272 if method == 'all': | |
| 273 match_methods = METHODS.keys() | |
| 274 else: | |
| 275 match_methods = [method] | |
| 276 for match_method in match_methods: | |
| 277 statistics = perform_process(dataset_paths, | |
| 278 galaxy_hids, | |
| 279 match_method, | |
| 280 distance, | |
| 281 step, | |
| 282 replicates, | |
| 283 up_limit, | |
| 284 low_limit, | |
| 285 output_files, | |
| 286 output_matched_peaks, | |
| 287 output_unmatched_peaks, | |
| 288 output_detail, | |
| 289 output_statistics_table, | |
| 290 output_statistics_histogram) | |
| 291 if output_statistics_histogram_file: | |
| 292 tmp_statistics_histogram_path = get_temporary_plot_path() | |
| 293 frequency_histogram([stat['distribution'] for stat in [statistics]], | |
| 294 tmp_statistics_histogram_path, | |
| 295 METHODS.keys()) | |
| 296 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) | |
| 297 | |
| 298 | |
| 299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, | |
| 300 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): | |
| 301 output_detail_file = output_files in ["all"] and output_detail is not None | |
| 302 output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None | |
| 303 output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None | |
| 304 output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None | |
| 305 replicates = [] | |
| 306 for i, dataset_path in enumerate(dataset_paths): | |
| 307 try: | |
| 308 galaxy_hid = galaxy_hids[i] | |
| 309 r = Replicate(galaxy_hid, dataset_path) | |
| 310 replicates.append(r) | |
| 311 except Exception, e: | |
| 312 stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e))) | |
| 313 attrs = 'd%sr%s' % (distance, num_required) | |
| 314 if up_limit != 1000: | |
| 315 attrs += 'u%d' % up_limit | |
| 316 if low_limit != -1000: | |
| 317 attrs += 'l%d' % low_limit | |
| 318 if step != 0: | |
| 319 attrs += 's%d' % step | |
| 320 | |
| 321 def td_writer(file_path): | |
| 322 # Returns a tab-delimited writer for a certain output | |
| 323 return csv.writer(open(file_path, 'wt'), delimiter='\t') | |
| 324 | |
| 325 labels = ('chrom', | |
| 326 'median midpoint', | |
| 327 'median midpoint+1', | |
| 328 'median normalized reads', | |
| 329 'replicates', | |
| 330 'median c-w distance', | |
| 331 'reads sum') | |
| 332 for replicate in replicates: | |
| 333 labels += ('chrom', | |
| 334 'median midpoint', | |
| 335 'median midpoint+1', | |
| 336 'c-w sum', | |
| 337 'c-w distance', | |
| 338 'replicate id') | |
| 339 matched_peaks_output = td_writer(output_matched_peaks) | |
| 340 if output_statistics_table_file: | |
| 341 statistics_table_output = td_writer(output_statistics_table) | |
| 342 statistics_table_output.writerow(('data', 'median read count')) | |
| 343 if output_detail_file: | |
| 344 detail_output = td_writer(output_detail) | |
| 345 detail_output.writerow(labels) | |
| 346 if output_unmatched_peaks_file: | |
| 347 unmatched_peaks_output = td_writer(output_unmatched_peaks) | |
| 348 unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) | |
| 349 # Perform filtering | |
| 350 if up_limit < 1000 or low_limit > -1000: | |
| 351 for replicate in replicates: | |
| 352 replicate.filter(up_limit, low_limit) | |
| 353 # Actually merge the peaks | |
| 354 peak_groups = [] | |
| 355 unmatched_peaks = [] | |
| 356 freq = FrequencyDistribution() | |
| 357 | |
| 358 def do_match(reps, distance): | |
| 359 # Copy list because we will mutate it, but keep replicate references. | |
| 360 reps = reps[:] | |
| 361 while len(reps) > 1: | |
| 362 # Iterate over each replicate as "main" | |
| 363 main = reps[0] | |
| 364 reps.remove(main) | |
| 365 for chromosome in main.chromosomes.values(): | |
| 366 peaks_by_value = chromosome.peaks[:] | |
| 367 # Sort main replicate by value | |
| 368 peaks_by_value.sort(key=lambda peak: -peak.value) | |
| 369 | |
| 370 def search_for_matches(group): | |
| 371 # Here we use multiple passes, expanding the window to be | |
| 372 # +- distance from any previously matched peak. | |
| 373 while True: | |
| 374 new_match = False | |
| 375 for replicate in reps: | |
| 376 if replicate.id in group.peaks: | |
| 377 # Stop if match already found for this replicate | |
| 378 continue | |
| 379 try: | |
| 380 # Lines changed to remove a major bug by Rohit Reja. | |
| 381 window, chrum = get_window(replicate.chromosomes[chromosome.name], | |
| 382 group.peaks.values(), | |
| 383 distance) | |
| 384 match = METHODS[method](window, peak, chrum) | |
| 385 except KeyError: | |
| 386 continue | |
| 387 if match: | |
| 388 group.add_peak(replicate.id, match) | |
| 389 new_match = True | |
| 390 if not new_match: | |
| 391 break | |
| 392 # Attempt to enlarge existing peak groups | |
| 393 for group in peak_groups: | |
| 394 old_peaks = group.peaks.values()[:] | |
| 395 search_for_matches(group) | |
| 396 for peak in group.peaks.values(): | |
| 397 if peak not in old_peaks: | |
| 398 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | |
| 399 # Attempt to find new peaks groups. For each peak in the | |
| 400 # main replicate, search for matches in the other replicates | |
| 401 for peak in peaks_by_value: | |
| 402 matches = PeakGroup() | |
| 403 matches.add_peak(main.id, peak) | |
| 404 search_for_matches(matches) | |
| 405 # Were enough replicates matched? | |
| 406 if matches.num_replicates >= num_required: | |
| 407 for peak in matches.peaks.values(): | |
| 408 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | |
| 409 peak_groups.append(matches) | |
| 410 # Zero or less = no stepping | |
| 411 if step <= 0: | |
| 412 do_match(replicates, distance) | |
| 413 else: | |
| 414 for d in range(0, distance, step): | |
| 415 do_match(replicates, d) | |
| 416 for group in peak_groups: | |
| 417 freq.add(group.num_replicates) | |
| 418 # Collect together the remaining unmatched_peaks | |
| 419 for replicate in replicates: | |
| 420 for chromosome in replicate.chromosomes.values(): | |
| 421 for peak in chromosome.peaks: | |
| 422 freq.add(1) | |
| 423 unmatched_peaks.append(peak) | |
| 424 # Average the unmatched_peaks count in the graph by # replicates | |
| 425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) | |
| 426 for replicate in replicates: | |
| 427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) | |
| 428 statistics_table_output.writerow((replicate.id, replicate.median)) | |
| 429 for group in peak_groups: | |
| 430 # Output matched_peaks (matched pairs). | |
| 431 matched_peaks_output.writerow(gff_row(cname=group.chrom, | |
| 432 start=group.midpoint, | |
| 433 end=group.midpoint+1, | |
| 434 source='repmatch', | |
| 435 score=group.normalized_value(med), | |
| 436 attrs={'median_distance': group.median_distance, | |
| 437 'replicates': group.num_replicates, | |
| 438 'value_sum': group.value_sum})) | |
| 439 if output_detail_file: | |
| 440 matched_peaks = (group.chrom, | |
| 441 group.midpoint, | |
| 442 group.midpoint+1, | |
| 443 group.normalized_value(med), | |
| 444 group.num_replicates, | |
| 445 group.median_distance, | |
| 446 group.value_sum) | |
| 447 for peak in group.peaks.values(): | |
| 448 matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) | |
| 449 detail_output.writerow(matched_peaks) | |
| 450 if output_unmatched_peaks_file: | |
| 451 for unmatched_peak in unmatched_peaks: | |
| 452 unmatched_peaks_output.writerow((unmatched_peak.chrom, | |
| 453 unmatched_peak.midpoint, | |
| 454 unmatched_peak.midpoint+1, | |
| 455 unmatched_peak.value, | |
| 456 unmatched_peak.distance, | |
| 457 unmatched_peak.replicate.id)) | |
| 458 if output_statistics_histogram_file: | |
| 459 tmp_statistics_histogram_path = get_temporary_plot_path() | |
| 460 frequency_histogram([freq], tmp_statistics_histogram_path) | |
| 461 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) | |
| 462 return {'distribution': freq} |
