repmatch_gff3: repmatch_gff3_util.py comparison

comparison repmatch_gff3_util.py @ 0:d33030c8e2cc draft

Uploaded

author	greg
date	Tue, 17 Nov 2015 14:26:08 -0500
parents
children	8159aaa7da4b

comparison

equal deleted inserted replaced

--1:000000000000
+:d33030c8e2cc
+import bisect
+import csv
+import os
+import shutil
+import sys
+import tempfile
+from matplotlib import pyplot
+# Graph settings
+Y_LABEL = 'Counts'
+X_LABEL = 'Number of matched replicates'
+TICK_WIDTH = 3
+# Amount to shift the graph to make labels fit, [left, right, top, bottom]
+ADJUST = [0.180, 0.9, 0.9, 0.1]
+# Length of tick marks, use TICK_WIDTH for width
+pyplot.rc('xtick.major', size=10.00)
+pyplot.rc('ytick.major', size=10.00)
+pyplot.rc('lines', linewidth=4.00)
+pyplot.rc('axes', linewidth=3.00)
+pyplot.rc('font', family='Arial', size=32.0)
+PLOT_FORMATS = ['png', 'pdf', 'svg']
+COLORS = 'krb'
+class Replicate(object):
+def __init__(self, id, dataset_path):
+self.id = id
+self.dataset_path = dataset_path
+self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t'))
+def parse(self, reader):
+self.chromosomes = {}
+for line in reader:
+if line[0].startswith("#") or line[0].startswith('"'):
+continue
+cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
+attrs = parse_gff_attrs(attrs)
+distance = attrs['cw_distance']
+mid = int(mid)
+midplus = int(midplus)
+value = float(value)
+distance = int(distance)
+if cname not in self.chromosomes:
+self.chromosomes[cname] = Chromosome(cname)
+chrom = self.chromosomes[cname]
+chrom.add_peak(Peak(cname, mid, value, distance, self))
+for chrom in self.chromosomes.values():
+chrom.sort_by_index()
+def filter(self, up_limit, low_limit):
+for chrom in self.chromosomes.values():
+chrom.filter(up_limit, low_limit)
+def size(self):
+return sum([len(c.peaks) for c in self.chromosomes.values()])
+class Chromosome(object):
+def __init__(self, name):
+self.name = name
+self.peaks = []
+def add_peak(self, peak):
+self.peaks.append(peak)
+def sort_by_index(self):
+self.peaks.sort(key=lambda peak: peak.midpoint)
+self.keys = make_keys(self.peaks)
+def remove_peak(self, peak):
+i = bisect.bisect_left(self.keys, peak.midpoint)
+# If the peak was actually found
+if i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint:
+del self.keys[i]
+del self.peaks[i]
+def filter(self, up_limit, low_limit):
+self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit]
+self.keys = make_keys(self.peaks)
+class Peak(object):
+def __init__(self, chrom, midpoint, value, distance, replicate):
+self.chrom = chrom
+self.value = value
+self.midpoint = midpoint
+self.distance = distance
+self.replicate = replicate
+def normalized_value(self, med):
+return self.value * med / self.replicate.median
+class PeakGroup(object):
+def __init__(self):
+self.peaks = {}
+def add_peak(self, repid, peak):
+self.peaks[repid] = peak
+@property
+def chrom(self):
+return self.peaks.values()[0].chrom
+@property
+def midpoint(self):
+return median([peak.midpoint for peak in self.peaks.values()])
+@property
+def num_replicates(self):
+return len(self.peaks)
+@property
+def median_distance(self):
+return median([peak.distance for peak in self.peaks.values()])
+@property
+def value_sum(self):
+return sum([peak.value for peak in self.peaks.values()])
+def normalized_value(self, med):
+values = []
+for peak in self.peaks.values():
+values.append(peak.normalized_value(med))
+return median(values)
+@property
+def peakpeak_distance(self):
+keys = self.peaks.keys()
+return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
+class FrequencyDistribution(object):
+def __init__(self, d=None):
+self.dist = d or {}
+def add(self, x):
+self.dist[x] = self.dist.get(x, 0) + 1
+def graph_series(self):
+x = []
+y = []
+for key, val in self.dist.items():
+x.append(key)
+y.append(val)
+return x, y
+def mode(self):
+return max(self.dist.items(), key=lambda data: data[1])[0]
+def size(self):
+return sum(self.dist.values())
+def stop_err(msg):
+sys.stderr.write(msg)
+sys.exit(1)
+def median(data):
+"""
+Find the integer median of the data set.
+"""
+if not data:
+return 0
+sdata = sorted(data)
+if len(data) % 2 == 0:
+return (sdata[len(data)//2] + sdata[len(data)//2-1]) / 2
+else:
+return sdata[len(data)//2]
+def make_keys(peaks):
+return [data.midpoint for data in peaks]
+def get_window(chromosome, target_peaks, distance):
+"""
+Returns a window of all peaks from a replicate within a certain distance of
+a peak from another replicate.
+"""
+lower = target_peaks[0].midpoint
+upper = target_peaks[0].midpoint
+for peak in target_peaks:
+lower = min(lower, peak.midpoint - distance)
+upper = max(upper, peak.midpoint + distance)
+start_index = bisect.bisect_left(chromosome.keys, lower)
+end_index = bisect.bisect_right(chromosome.keys, upper)
+return (chromosome.peaks[start_index: end_index], chromosome.name)
+def match_largest(window, peak, chrum):
+if not window:
+return None
+if peak.chrom != chrum:
+return None
+return max(window, key=lambda cpeak: cpeak.value)
+def match_closest(window, peak, chrum):
+if not window:
+return None
+if peak.chrom != chrum:
+return None
+return min(window, key=lambda match: abs(match.midpoint - peak.midpoint))
+def frequency_histogram(freqs, dataset_path, labels=[], title=''):
+pyplot.clf()
+pyplot.figure(figsize=(10, 10))
+for i, freq in enumerate(freqs):
+xvals, yvals = freq.graph_series()
+# Go from high to low
+xvals.reverse()
+pyplot.bar([x-0.4 + 0.8/len(freqs)*i for x in xvals], yvals, width=0.8/len(freqs), color=COLORS[i])
+pyplot.xticks(range(min(xvals), max(xvals)+1), map(str, reversed(range(min(xvals), max(xvals)+1))))
+pyplot.xlabel(X_LABEL)
+pyplot.ylabel(Y_LABEL)
+pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
+ax = pyplot.gca()
+for l in ax.get_xticklines() + ax.get_yticklines():
+l.set_markeredgewidth(TICK_WIDTH)
+pyplot.savefig(dataset_path)
+METHODS = {'closest': match_closest, 'largest': match_largest}
+def gff_attrs(d):
+if not d:
+return '.'
+return ';'.join('%s=%s' % item for item in d.items())
+def parse_gff_attrs(s):
+d = {}
+if s == '.':
+return d
+for item in s.split(';'):
+key, val = item.split('=')
+d[key] = val
+return d
+def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
+return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
+def get_temporary_plot_path(plot_format):
+"""
+Return the path to a temporary file with a valid image format
+file extension that can be used with bioformats.
+"""
+tmp_dir = tempfile.mkdtemp(prefix='tmp-repmatch-')
+fd, name = tempfile.mkstemp(suffix='.%s' % plot_format, dir=tmp_dir)
+os.close(fd)
+return name
+def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files,
+plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram):
+output_histogram_file = output_files in ["all"] and method in ["all"]
+if len(dataset_paths) < 2:
+return
+if method == 'all':
+match_methods = METHODS.keys()
+else:
+match_methods = [method]
+for match_method in match_methods:
+statistics = perform_process(dataset_paths,
+galaxy_hids,
+match_method,
+distance,
+step,
+replicates,
+up_limit,
+low_limit,
+output_files,
+plot_format,
+output_summary,
+output_orphan,
+output_detail,
+output_key,
+output_histogram)
+if output_histogram_file:
+tmp_histogram_path = get_temporary_plot_path(plot_format)
+frequency_histogram([stat['distribution'] for stat in [statistics]],
+tmp_histogram_path,
+METHODS.keys())
+shutil.move(tmp_histogram_path, output_histogram)
+def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files,
+plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram):
+output_detail_file = output_files in ["all"] and output_detail is not None
+output_key_file = output_files in ["all"] and output_key is not None
+output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None
+output_histogram_file = output_files in ["all"] and output_histogram is not None
+replicates = []
+for i, dataset_path in enumerate(dataset_paths):
+try:
+galaxy_hid = galaxy_hids[i]
+r = Replicate(galaxy_hid, dataset_path)
+replicates.append(r)
+except Exception, e:
+stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e)))
+attrs = 'd%sr%s' % (distance, num_required)
+if up_limit != 1000:
+attrs += 'u%d' % up_limit
+if low_limit != -1000:
+attrs += 'l%d' % low_limit
+if step != 0:
+attrs += 's%d' % step
+def td_writer(file_path):
+# Returns a tab-delimited writer for a certain output
+return csv.writer(open(file_path, 'wt'), delimiter='\t')
+labels = ('chrom',
+'median midpoint',
+'median midpoint+1',
+'median normalized reads',
+'replicates',
+'median c-w distance',
+'reads sum')
+for replicate in replicates:
+labels += ('chrom',
+'median midpoint',
+'median midpoint+1',
+'c-w sum',
+'c-w distance',
+'replicate id')
+summary_output = td_writer(output_summary)
+if output_key_file:
+key_output = td_writer(output_key)
+key_output.writerow(('data', 'median read count'))
+if output_detail_file:
+detail_output = td_writer(output_detail)
+detail_output.writerow(labels)
+if output_orphan_file:
+orphan_output = td_writer(output_orphan)
+orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
+# Perform filtering
+if up_limit < 1000 or low_limit > -1000:
+for replicate in replicates:
+replicate.filter(up_limit, low_limit)
+# Actually merge the peaks
+peak_groups = []
+orphans = []
+freq = FrequencyDistribution()
+def do_match(reps, distance):
+# Copy list because we will mutate it, but keep replicate references.
+reps = reps[:]
+while len(reps) > 1:
+# Iterate over each replicate as "main"
+main = reps[0]
+reps.remove(main)
+for chromosome in main.chromosomes.values():
+peaks_by_value = chromosome.peaks[:]
+# Sort main replicate by value
+peaks_by_value.sort(key=lambda peak: -peak.value)
+def search_for_matches(group):
+# Here we use multiple passes, expanding the window to be
+#  +- distance from any previously matched peak.
+while True:
+new_match = False
+for replicate in reps:
+if replicate.id in group.peaks:
+# Stop if match already found for this replicate
+continue
+try:
+# Lines changed to remove a major bug by Rohit Reja.
+window, chrum = get_window(replicate.chromosomes[chromosome.name],
+group.peaks.values(),
+distance)
+match = METHODS[method](window, peak, chrum)
+except KeyError:
+continue
+if match:
+group.add_peak(replicate.id, match)
+new_match = True
+if not new_match:
+break
+# Attempt to enlarge existing peak groups
+for group in peak_groups:
+old_peaks = group.peaks.values()[:]
+search_for_matches(group)
+for peak in group.peaks.values():
+if peak not in old_peaks:
+peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
+# Attempt to find new peaks groups.  For each peak in the
+# main replicate, search for matches in the other replicates
+for peak in peaks_by_value:
+matches = PeakGroup()
+matches.add_peak(main.id, peak)
+search_for_matches(matches)
+# Were enough replicates matched?
+if matches.num_replicates >= num_required:
+for peak in matches.peaks.values():
+peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
+peak_groups.append(matches)
+# Zero or less = no stepping
+if step <= 0:
+do_match(replicates, distance)
+else:
+for d in range(0, distance, step):
+do_match(replicates, d)
+for group in peak_groups:
+freq.add(group.num_replicates)
+# Collect together the remaining orphans
+for replicate in replicates:
+for chromosome in replicate.chromosomes.values():
+for peak in chromosome.peaks:
+freq.add(1)
+orphans.append(peak)
+# Average the orphan count in the graph by # replicates
+med = median([peak.value for group in peak_groups for peak in group.peaks.values()])
+for replicate in replicates:
+replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate])
+key_output.writerow((replicate.id, replicate.median))
+for group in peak_groups:
+# Output summary (matched pairs).
+summary_output.writerow(gff_row(cname=group.chrom,
+start=group.midpoint,
+end=group.midpoint+1,
+source='repmatch',
+score=group.normalized_value(med),
+attrs={'median_distance': group.median_distance,
+'replicates': group.num_replicates,
+'value_sum': group.value_sum}))
+if output_detail_file:
+summary = (group.chrom,
+group.midpoint,
+group.midpoint+1,
+group.normalized_value(med),
+group.num_replicates,
+group.median_distance,
+group.value_sum)
+for peak in group.peaks.values():
+summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
+detail_output.writerow(summary)
+if output_orphan_file:
+for orphan in orphans:
+orphan_output.writerow((orphan.chrom,
+orphan.midpoint,
+orphan.midpoint+1,
+orphan.value,
+orphan.distance,
+orphan.replicate.id))
+if output_histogram_file:
+tmp_histogram_path = get_temporary_plot_path(plot_format)
+frequency_histogram([freq], tmp_histogram_path)
+shutil.move(tmp_histogram_path, output_histogram)
+return {'distribution': freq}

Mercurial > repos > greg > repmatch_gff3

comparison repmatch_gff3_util.py @ 0:d33030c8e2cc draft