Mercurial > repos > bgruening > join_files_on_column_fuzzy
view join_files_on_column_fuzzy.py @ 3:6544babbdb8b draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit 3419a5a5e19a93369c8c20a39babe5636a309292
author | bgruening |
---|---|
date | Tue, 29 May 2018 07:49:14 -0400 |
parents | 5c31b0ea0734 |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Join two delimited files on a numeric column, allowing a tolerance.

Every data row of file 1 is paired with each row of file 2 whose join value
lies within ``--distance`` of its own, measured either as an absolute
difference or in parts per million (ppm) relative to the file 1 value.
"""

import os
import argparse
import sys
from collections import deque


def _ppm_distance(value1, value2):
    """Return the distance between the two values in ppm, relative to value1.

    A relative distance to zero is mathematically undefined; instead of
    raising ZeroDivisionError, identical values are reported as distance 0
    and any other value as infinitely far away (i.e. never a match).
    """
    if value1 == 0:
        return 0.0 if value2 == 0 else float('inf')
    return abs((value1 - value2) / value1 * 1000000)


def main(args):
    """Write the fuzzy join of ``args.f1`` and ``args.f2`` to ``args.outfile``.

    ``args`` is a namespace with the attributes ``f1``, ``f2``, ``outfile``
    (file paths), ``c1``, ``c2`` (1-based join columns), ``sep`` (input
    separator), ``header``, ``closest``, ``add_distance`` (flags),
    ``distance`` (float tolerance) and ``unit`` ('absolute' or 'ppm').

    Each output row is the columns of a file 1 row followed by the columns of
    a matching file 2 row, joined with tabs; with ``add_distance`` the
    distance is appended as a last column.  With ``closest`` only the match
    with the smallest distance is written per file 1 row (the first one on
    ties).  With ``header`` the first line of each file is treated as a header
    and both are combined into one output header line.

    In 'absolute' mode file 2 rows whose value falls below the current
    tolerance window are discarded and never compared again, so this mode
    presumably expects both files to be sorted ascending by the join column
    (NOTE(review): confirm with the tool wrapper).  In 'ppm' mode every file 2
    row is kept and compared against every file 1 row.

    NOTE: lines are cleaned with ``str.strip()``, which also removes leading
    and trailing separator characters such as tabs, so empty columns at the
    edges of a row are dropped before splitting.

    Raises:
        ValueError: if a join column value cannot be converted to float.
        IndexError: if a row has fewer columns than the join column index.
        StopIteration: if ``header`` is set, file 1 has a first line and
            file 2 has no lines at all.
    """
    has_header = bool(args.header)

    def _readline(header=False):
        """Yield the stripped header line (if any), then (columns, value)."""
        with open(args.f2) as handle2:
            for line in handle2:
                line = line.strip()
                if header:
                    # The first line is passed through verbatim, even if empty.
                    header = False
                    yield line
                    continue
                if not line:
                    continue
                columns = line.split(args.sep)
                yield columns, float(columns[args.c2 - 1])

    def fill_cache():
        """Move the next file 2 record, if there is one, into the cache."""
        try:
            cache.append(next(it))
        except StopIteration:
            pass

    # Window of file 2 records still to be compared; a deque makes taking
    # records from the front O(1) (list.pop(0) is O(n)).
    cache = deque()
    it = _readline(header=has_header)
    pending_header = has_header

    try:
        # The output file is opened first so it is created/truncated even when
        # file 1 cannot be opened; the context manager closes it on errors too.
        with open(args.outfile, 'w+') as out, open(args.f1) as handle1:
            for line in handle1:
                line = line.strip()
                if pending_header:
                    pending_header = False
                    second_header = next(it)
                    if args.add_distance:
                        # The distance column is titled with the unit name.
                        out.write('%s\t%s\t%s\n' % (line, second_header, args.unit))
                    else:
                        out.write('%s\t%s\n' % (line, second_header))
                    continue
                if not line:
                    continue

                columns = line.split(args.sep)
                value1 = float(columns[args.c1 - 1])
                lower_bound = value1 - args.distance
                upper_bound = value1 + args.distance

                matches = []       # (distance, file 2 columns) for this row
                carried = deque()  # file 2 records kept for the next row
                fill_cache()
                while cache:
                    entry = cache.popleft()
                    columns2, value2 = entry
                    if args.unit == 'absolute':
                        if lower_bound <= value2 <= upper_bound:
                            matches.append((abs(value1 - value2), columns2))
                            # A match may also match the next file 1 row.
                            carried.append(entry)
                            fill_cache()
                        elif value2 > upper_bound:
                            # Larger than the current window: keep the record
                            # for the next round and stop reading ahead.
                            carried.append(entry)
                        elif value2 < upper_bound:
                            # Only reachable below the lower bound: drop the
                            # record and look at the next one of file 2.
                            fill_cache()
                    elif args.unit == 'ppm':
                        ppm_dist = _ppm_distance(value1, value2)
                        if ppm_dist <= args.distance:
                            matches.append((ppm_dist, columns2))
                            carried.append(entry)
                            fill_cache()
                        elif ppm_dist > args.distance:
                            # No ordering is assumed in ppm mode: keep every
                            # record and keep reading file 2.
                            carried.append(entry)
                            fill_cache()

                if args.closest and matches:
                    # min() returns the first of equally close matches, like
                    # a stable sort followed by taking the first element.
                    matches = [min(matches, key=lambda match: match[0])]
                for dist, columns2 in matches:
                    fields = columns + columns2
                    if args.add_distance:
                        fields = fields + [str(dist)]
                    out.write('\t'.join(fields) + '\n')

                cache = carried
    finally:
        # Closing the generator releases the file 2 handle even when file 2
        # was not read to the end.
        it.close()


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Merge two files on a common column the fuzzy way.')
    parser.add_argument('--f1', required=True)
    parser.add_argument('--f2', required=True)
    parser.add_argument('--c1', type=int, required=True, help="Column in file 1 to be merged on.")
    parser.add_argument('--c2', type=int, required=True, help="Column in file 2 to be merged on.")
    parser.add_argument('--outfile', required=True)
    parser.add_argument('--header', action='store_true', help="The files have a header line at the beginning.")
    parser.add_argument('--closest', action='store_true', help="Only report the closest match.")
    parser.add_argument('--add_distance', action='store_true', help="Add addional column with the distance between the two values.")
    parser.add_argument('--sep', type=str, default="\t", help="Files are separated by this separator.")
    parser.add_argument('--distance', type=float, default=0.2, help="Maximal allowed distance.")
    parser.add_argument('--unit', choices=['ppm', 'absolute'], default='absolute')
    args = parser.parse_args()

    main(args)