diff join_files_on_column_fuzzy.py @ 0:5b667b17923a draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit fd2bc86108994c9eda731b305ca6a8c71554cfaa
author bgruening
date Sun, 26 Nov 2017 16:12:46 -0500
parents
children 5c31b0ea0734
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/join_files_on_column_fuzzy.py	Sun Nov 26 16:12:46 2017 -0500
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+
+import os
+import argparse
+import sys
+
+def main(args):
+    """Fuzzy-join two delimited files on a numeric column.
+
+    Every data line of ``args.f1`` is compared with lines of ``args.f2``. A
+    pair is reported when the join values (1-based columns ``args.c1`` and
+    ``args.c2``, parsed as floats) differ by at most ``args.distance``,
+    measured either as an absolute difference or in parts per million
+    relative to the file-1 value, depending on ``args.unit``. Matching pairs
+    are written tab-joined to ``args.outfile``.
+
+    File 2 is read in a single forward pass: candidates below the current
+    window are discarded for good and candidates above it are carried over to
+    the next file-1 line. Matches are therefore only found reliably when both
+    inputs are sorted in ascending order of their join column.
+
+    Args:
+        args: Parsed command-line namespace providing ``f1``, ``f2``, ``c1``,
+            ``c2``, ``outfile``, ``header``, ``closest``, ``add_distance``,
+            ``sep``, ``distance`` and ``unit``.
+
+    Raises:
+        ValueError: If a join value cannot be converted to ``float``.
+        IndexError: If a line has fewer columns than the join column index.
+    """
+    has_header = bool(args.header)
+
+    # File-2 records that may still match the current or a later file-1 value.
+    cache = list()
+
+    def _readline(header=False):
+        """Yield the records of file 2 one at a time.
+
+        If ``header`` is true the first line is yielded verbatim as a string;
+        every other non-empty line is yielded as ``(columns, join_value)``.
+        """
+        with open(args.f2) as handle2:
+            for line in handle2:
+                line = line.strip()
+                if header:
+                    header = False
+                    yield line
+                    continue
+                if not line:
+                    continue
+                columns = line.split(args.sep)
+                yield columns, float(columns[args.c2 - 1])
+
+    def fill_cache():
+        """Append the next file-2 record to the cache, if there is one."""
+        try:
+            cache.append(next(it))
+        except StopIteration:
+            pass
+
+    def _ppm_distance(value1, value2):
+        """Return the distance of ``value2`` from ``value1`` in ppm of ``value1``."""
+        if value1 == 0:
+            # A relative distance to zero is undefined; only an exact zero
+            # counts as "no distance", everything else is infinitely far.
+            return 0.0 if value2 == 0 else float('inf')
+        return abs((value1 - value2) / value1 * 1000000)
+
+    it = _readline(header=has_header)
+    header_pending = has_header
+
+    # The context manager guarantees the output is flushed and closed even
+    # when a malformed line raises half-way through.
+    with open(args.outfile, 'w+') as out, open(args.f1) as handle1:
+        for line in handle1:
+            line = line.strip()
+            if header_pending:
+                header_pending = False
+                # An empty file 2 has no header; fall back to an empty string
+                # instead of leaking StopIteration to the caller.
+                second_header = next(it, '')
+                if args.add_distance:
+                    out.write('%s\t%s\t%s\n' % (line, second_header, args.unit))
+                else:
+                    out.write('%s\t%s\n' % (line, second_header))
+                continue
+            if not line:
+                continue
+
+            columns = line.split(args.sep)
+            value1 = float(columns[args.c1 - 1])
+            lower_bound = value1 - args.distance
+            upper_bound = value1 + args.distance
+
+            matches = list()      # [distance, output line] for this file-1 line
+            carry_over = list()   # file-2 records kept for the next file-1 line
+            fill_cache()
+            while cache:
+                _c, value2 = cache.pop(0)
+                if args.unit == 'absolute':
+                    dist = abs(value1 - value2)
+                    is_match = lower_bound <= value2 <= upper_bound
+                    is_ahead = value2 > upper_bound
+                    is_behind = value2 < lower_bound
+                elif args.unit == 'ppm':
+                    dist = _ppm_distance(value1, value2)
+                    is_match = dist <= args.distance
+                    # Outside the window the side decides what happens next:
+                    # larger values wait for a later file-1 line, smaller ones
+                    # can never match again and must be skipped.
+                    is_ahead = value2 > value1
+                    is_behind = value2 < value1
+                else:
+                    continue
+
+                if is_match:
+                    row = '\t'.join(columns + _c)
+                    if args.add_distance:
+                        row = row + '\t' + str(dist)
+                    matches.append([dist, row + '\n'])
+                    # A matching record may also match the next file-1 value.
+                    carry_over.append([_c, value2])
+                    fill_cache()
+                elif is_ahead:
+                    # The value from file 2 is bigger than the current window,
+                    # so it is taken into the next round.
+                    carry_over.append([_c, value2])
+                elif is_behind:
+                    # The value from file 2 is smaller than the current
+                    # window: drop it and check the next one of file 2.
+                    fill_cache()
+
+            if args.closest and matches:
+                # min() returns the first of equally close matches, like a
+                # stable sort followed by taking the first element.
+                out.write(min(matches, key=lambda x: x[0])[1])
+            else:
+                for _dist, row in matches:
+                    out.write(row)
+
+            # The loop above drained the cache; refill it with the survivors.
+            cache.extend(carry_over)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: declare the options in a table (the order is
+    # the order shown in --help), build the parser from it and run the join.
+    option_table = [
+        ('--f1', dict(required=True)),
+        ('--f2', dict(required=True)),
+        ('--c1', dict(type=int, required=True,
+                      help="Column in file 1 to be merged on.")),
+        ('--c2', dict(type=int, required=True,
+                      help="Column in file 2 to be merged on.")),
+        ('--outfile', dict(required=True)),
+        ('--header', dict(action='store_true',
+                          help="The files have a header line at the beginning.")),
+        ('--closest', dict(action='store_true',
+                           help="Only report the closest match.")),
+        ('--add_distance', dict(action='store_true',
+                                help="Add addional column with the distance between the two values.")),
+        ('--sep', dict(type=str, default="\t",
+                       help="Files are separated by this separator.")),
+        # The string default is converted by argparse through ``type``.
+        ('--distance', dict(type=float, default="0.2",
+                            help="Maximal allowed distance.")),
+        ('--unit', dict(choices=['ppm', 'absolute'], default='absolute')),
+    ]
+
+    parser = argparse.ArgumentParser(
+        description='Merge two files on a common column the fuzzy way.')
+    for flag, settings in option_table:
+        parser.add_argument(flag, **settings)
+    args = parser.parse_args()
+
+    main(args)
+
+