# HG changeset patch
# User rijst
# Date 1355311935 18000
# Node ID 493433860c0840a9434cd27a6d4cabcd3e5674b8
# Uploaded
#
# snpsplit.py -- added in changeset 493433860c08 (Wed Dec 12 06:32:15 2012 -0500)
"""Split multi-base SNP/MNP entries into one line per changed base.

This script takes a tab-delimited file containing position, ref base and
mut base, and splits any multi-character ref or mut base entries into
separate lines, calculating the new position of every base that differs.

Usage: snpsplit.py <input file> <output file>
"""

import sys

# Column header written at the top of every output file.
HEADER = "Position\tRef\tMut\n"


def splitter(cells):
    """Return the output lines for one parsed input row.

    cells -- sequence whose first three items are the position, the
             reference base(s) and the mutated base(s), all as strings.

    Returns a list of "position<TAB>ref<TAB>mut<NEWLINE>" strings, one per
    base that differs between ref and mut, with the position advanced by the
    offset of that base. The list is empty when ref and mut have different
    lengths (indels are not handled) or when no base differs.

    Raises IndexError if cells has fewer than three items, and ValueError if
    a row that needs to be emitted has a non-integer position.
    """
    ref, mut = cells[1], cells[2]

    # Can only deal with SNPs/MNPs, not indels.
    if len(ref) != len(mut):
        return []

    changed = [
        (offset, ref_base, mut_base)
        for offset, (ref_base, mut_base) in enumerate(zip(ref, mut))
        if ref_base != mut_base
    ]
    if not changed:
        return []

    # The position is only parsed once something actually has to be emitted.
    start = int(cells[0])
    return [
        "%d\t%s\t%s\n" % (start + offset, ref_base, mut_base)
        for offset, ref_base, mut_base in changed
    ]


def split_snps(in_file, out_file):
    """Copy in_file to out_file, splitting multi-base entries.

    in_file  -- iterable of input lines.
    out_file -- object with a write() method.

    The column header is written first. Lines that do not start with a digit
    (headers, comments, blank lines) are copied through unchanged. Data lines
    with fewer than three columns are skipped, as are indels.

    Returns a tuple (lines read, split lines written); pass-through lines and
    the column header are not included in the second number.
    """
    in_lines = out_lines = 0
    out_file.write(HEADER)
    for line in in_file:
        in_lines += 1
        if not line[:1].isdigit():
            out_file.write(line)
            continue

        cells = line.rstrip().split('\t')
        # rstrip() also removes trailing tabs, so a row with an empty last
        # column arrives here with fewer than three cells; skip it instead
        # of crashing with an IndexError.
        if len(cells) < 3:
            continue

        for row in splitter(cells):
            out_file.write(row)
            out_lines += 1
    return in_lines, out_lines


def main(argv):
    """Run the command-line tool; argv is the full argument vector."""
    if len(argv) != 3:
        sys.exit("snpsplit takes exactly two arguments (input and output file), no more and no less")

    input_name = argv[1]
    output_name = argv[2]

    try:
        in_file = open(input_name)
    except IOError as e:
        sys.exit("Error trying to open '%s': %s" % (input_name, e.strerror))

    # The with-blocks guarantee both files are closed even if processing or
    # opening the output file fails.
    with in_file:
        try:
            out_file = open(output_name, 'w')
        except IOError as e:
            sys.exit("Error trying to open '%s': %s" % (output_name, e.strerror))
        with out_file:
            in_lines, out_lines = split_snps(in_file, out_file)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Lines read: %s" % in_lines)
    print("Lines printed: %s" % out_lines)


if __name__ == "__main__":
    main(sys.argv)