# HG changeset patch
# User rijst
# Date 1355311935 18000
# Node ID 493433860c0840a9434cd27a6d4cabcd3e5674b8

Uploaded

diff -r 000000000000 -r 493433860c08 snpsplit.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpsplit.py	Wed Dec 12 06:32:15 2012 -0500
@@ -0,0 +1,45 @@
+'''This script takes a tab-delimited file containing position, ref base, mut base and splits any multi-character ref or mut base entries into separate lines, calculating the new position for each.'''
+
+import sys
+
+# Usage: snpsplit.py <input file> <output file>
+if len(sys.argv) != 3:
+    sys.exit("snpsplit takes exactly two arguments (input and output file), no more and no less")
+input_name, output_name = sys.argv[1:3]
+
+# Open both files before any processing. sys.exit is used rather than the
+# site-provided exit() builtin, which is not guaranteed to exist (e.g. python -S).
+try:
+    in_file = open(input_name)
+except IOError as e:
+    sys.exit("Error trying to open '{0}': {1}".format(input_name, e.strerror))
+try:
+    out_file = open(output_name, 'w')
+except IOError as e:
+    sys.exit("Error trying to open '{0}': {1}".format(output_name, e.strerror))
+
+def splitter(cells):  # cells: [position, ref bases, mut bases]; ref and mut are compared index by index
+    global out_lines  # module-level count of rows written
+    for offset, ref_base in enumerate(cells[1]):
+        if ref_base != cells[2][offset]:  # only differing bases are written out
+            out_file.write('%d\t%s\t%s\n' % (int(cells[0]) + offset, ref_base, cells[2][offset]))
+            out_lines += 1
+
+in_lines = out_lines = 0
+out_file.write("Position\tRef\tMut\n")
+for line in in_file:
+    in_lines += 1
+    # Lines that do not start with a digit (e.g. headers or blank lines) are copied through as-is.
+    if not line[0].isdigit():
+        out_file.write(line)
+        continue
+    cells = line.rstrip().split('\t')
+    # Skip rows without ref/mut columns (these used to raise IndexError) and indels; only SNPs/MNPs can be split.
+    if len(cells) < 3 or len(cells[1]) != len(cells[2]): continue
+    splitter(cells)
+
+in_file.close()
+out_file.close()
+
+print("Lines read: %s" % in_lines)  # single-argument print(...) runs on Python 2 and 3
+print("Lines printed: %s" % out_lines)