Mercurial > repos > bgruening > join_files_on_column_fuzzy
comparison join_files_on_column_fuzzy.py @ 0:5b667b17923a draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit fd2bc86108994c9eda731b305ca6a8c71554cfaa
| author | bgruening |
|---|---|
| date | Sun, 26 Nov 2017 16:12:46 -0500 |
| parents | |
| children | 5c31b0ea0734 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5b667b17923a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import os | |
| 4 import argparse | |
| 5 import sys | |
| 6 | |
def main(args):
    """Join two delimited files on a numeric column, allowing a tolerance.

    Each non-empty line of ``args.f1`` is compared against the records of
    ``args.f2``.  A pair matches when the value in column ``args.c1`` of
    file 1 and the value in column ``args.c2`` of file 2 (both 1-based) are
    at most ``args.distance`` apart, measured either directly
    (``args.unit == 'absolute'``) or in parts per million relative to the
    file 1 value (``args.unit == 'ppm'``).  For every match the columns of
    both lines are written tab-joined to ``args.outfile``.

    File 2 is streamed only once through a small look-ahead buffer, so the
    algorithm assumes both files are sorted ascending on their join column
    (NOTE(review): this is implied by the sweep below; confirm with the
    tool documentation).

    Attributes read from ``args``:
        f1, f2: paths of the two input files.
        c1, c2: 1-based join column index in file 1 / file 2.
        outfile: path of the output file (overwritten).
        header: if true, the first line of each file is a header; the two
            headers are joined and written first.
        closest: if true, only the match with the smallest distance is
            written for each line of file 1.
        add_distance: if true, the distance is appended as an extra column
            (and ``args.unit`` is appended to the header line).
        sep: column separator used to split the input lines.
        distance: maximal allowed distance.
        unit: ``'absolute'`` or ``'ppm'``.

    Raises:
        ValueError: if a join column cannot be parsed as a float.
        IndexError: if a line has fewer columns than the join column index.
    """
    from collections import deque

    def _second_file_records(header=False):
        """Yield file 2's header line (if requested), then (columns, value)."""
        with open(args.f2) as handle2:
            for text in handle2:
                text = text.strip()
                if header:
                    header = False
                    yield text
                    continue
                if not text:
                    continue
                fields = text.split(args.sep)
                yield fields, float(fields[args.c2 - 1])

    def _ppm_distance(value1, value2):
        """Return the distance in ppm relative to value1 (safe for 0)."""
        if value1 == 0:
            # A relative distance to zero is undefined; only an exact zero
            # can be considered a match.
            return 0.0 if value2 == 0 else float('inf')
        return abs((value1 - value2) / value1 * 1000000)

    records = _second_file_records(header=bool(args.header))
    # Look-ahead buffer of file 2 records that may still match a later line
    # of file 1.  A deque gives O(1) removal from the front.
    pending = deque()

    def _fetch_next():
        """Move the next file 2 record into the buffer, if there is one."""
        try:
            pending.append(next(records))
        except StopIteration:
            pass

    expect_header = bool(args.header)

    # The output file is opened first (as before) and is now closed even
    # when parsing one of the inputs fails.
    with open(args.outfile, 'w') as out, open(args.f1) as handle1:
        for line in handle1:
            line = line.strip()

            if expect_header:
                expect_header = False
                # An empty second file has no header; use an empty string
                # instead of leaking StopIteration to the caller.
                second_header = next(records, '')
                if args.add_distance:
                    out.write('%s\t%s\t%s\n' % (line, second_header, args.unit))
                else:
                    out.write('%s\t%s\n' % (line, second_header))
                continue

            if not line:
                continue

            columns = line.split(args.sep)
            value1 = float(columns[args.c1 - 1])
            lower_bound = value1 - args.distance
            upper_bound = value1 + args.distance

            matches = []  # (distance, output line) for the current line
            carried = []  # file 2 records kept for the next line of file 1

            _fetch_next()
            while pending:
                columns2, value2 = pending.popleft()

                if args.unit == 'absolute':
                    dist = abs(value1 - value2)
                    in_range = lower_bound <= value2 <= upper_bound
                    ahead = value2 > upper_bound
                elif args.unit == 'ppm':
                    dist = _ppm_distance(value1, value2)
                    in_range = dist <= args.distance
                    # The ppm distance is unsigned, so the direction has to
                    # be taken from the raw values: only records *above* the
                    # current value can match a later line of file 1.
                    ahead = value2 > value1
                else:
                    # Unknown unit: the record is dropped without a match.
                    continue

                if in_range:
                    joined = '\t'.join(columns + columns2)
                    if args.add_distance:
                        joined += '\t' + str(dist)
                    matches.append((dist, joined + '\n'))
                    # A matching record may also match the next line.
                    carried.append((columns2, value2))
                    _fetch_next()
                elif ahead:
                    # Too large for this line; keep it for the next round.
                    carried.append((columns2, value2))
                else:
                    # Too small for this and every later line; discard it
                    # and look at the next record of file 2.
                    _fetch_next()

            if args.closest and matches:
                # min() returns the first of several equally close matches.
                out.write(min(matches, key=lambda match: match[0])[1])
            else:
                out.writelines(text for _dist, text in matches)

            pending.extend(carried)
| 99 | |
| 100 | |
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the fuzzy join.
    parser = argparse.ArgumentParser(description='Merge two files on a common column the fuzzy way.')
    parser.add_argument('--f1', required=True)
    parser.add_argument('--f2', required=True)
    parser.add_argument('--c1', type=int, required=True, help="Column in file 1 to be merged on.")
    parser.add_argument('--c2', type=int, required=True, help="Column in file 2 to be merged on.")
    parser.add_argument('--outfile', required=True)
    parser.add_argument('--header', action='store_true', help="The files have a header line at the beginning.")
    parser.add_argument('--closest', action='store_true', help="Only report the closest match.")
    parser.add_argument('--add_distance', action='store_true', help="Add additional column with the distance between the two values.")
    parser.add_argument('--sep', type=str, default="\t", help="Files are separated by this separator.")
    # The default is given as a real float so it agrees with type=float.
    parser.add_argument('--distance', type=float, default=0.2, help="Maximal allowed distance.")
    parser.add_argument('--unit', choices=['ppm', 'absolute'], default='absolute')
    args = parser.parse_args()

    main(args)
| 118 | |
| 119 |
