Mercurial > repos > nanettec > lookup
changeset 0:eac0e302ab3a draft
Uploaded
author | nanettec |
---|---|
date | Thu, 25 Feb 2016 04:58:09 -0500 |
parents | |
children | 54200db292a3 |
files | lookup.py |
diffstat | 1 files changed, 280 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lookup.py Thu Feb 25 04:58:09 2016 -0500 @@ -0,0 +1,280 @@ +""" +@summary: The information from the Markers file and the QTL Cartographer Z file, +are combined to proportionally estimate a base pair position at each 2cM interval. +The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions. +@author: nanette.coetzer@gmail.com +@version 5 + +""" +import optparse, sys +import subprocess +import tempfile +import os, re + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option("-i", "--input1", default=None, dest="input1", + help="Markers file") + parser.add_option("-j", "--input2", default=None, dest="input2", + help="QTL Cartographer Z file") + parser.add_option("-k", "--input3", default=None, dest="input3", + help="Chromosome length file") + parser.add_option("-o", "--output1", default=None, dest="output1", + help="Lookup file") + parser.add_option("-p", "--output2", default=None, dest="output2", + help="Lookup summary file") + + (options, args) = parser.parse_args() + + try: + open(options.input1, "r").close() + except TypeError, e: + stop_err("You need to supply the Markers file:\n" + str(e)) + except IOError, e: + stop_err("Can not open the Markers file:\n" + str(e)) + + try: + open(options.input2, "r").close() + except TypeError, e: + stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e)) + except IOError, e: + stop_err("Can not open the QTL Cartographer Z file:\n" + str(e)) + + try: + open(options.input3, "r").close() + except TypeError, e: + stop_err("You need to supply the Chromosome length file:\n" + str(e)) + except IOError, e: + stop_err("Can not open the Chromosome length file:\n" + str(e)) + + + ############################################## + + in1 = open(options.input1,'r') # Markers.txt + prev_c = "0" + count = 0 + marker_dict = {} # For each unique chr + marker, store the cM + bp positions + last_marker_dict = {} # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately] + + for line in in1: # markers file + count += 1 + if line.startswith("c"): + # skip the header line + if count != 1: + # use info in each line to populate marker_dict + i = line.strip().split("\t") + marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] + chr = i[0] + if chr != prev_c and prev_c != "0": + # if the chr number changes, add previous marker info to last_marker_dict + last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp + prev_c = i[0] + prev_m = i[1] + prev_cM = i[3] + prev_bp = i[4] + else: + # no header row + # use info in each line to populate marker_dict + i = line.strip().split("\t") + marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] + chr = i[0] + if chr != prev_c and prev_c != "0": + # if the chr number changes, add previous marker info to last_marker_dict + last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp + prev_c = i[0] + prev_m = i[1] + prev_cM = i[3] + prev_bp = i[4] + + # add the last marker (on last chr) info to last_marker_dict + last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp + in1.close() + + ############################################## + + in1 = open(options.input1,'r') + s2_l2 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) bp position + s1_l1 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) cM position + count = 0 + pc = 0 + if line.startswith("c"): + for line in in1: + count += 1 + if count != 1: + i = line.strip().split("\t") + if i[0] == pc: + s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] + s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] + else: + pass + pc = i[0] # previous chr + pm = i[1] # previous marker + pcM = i[3] # previous cM position + pbp = i[4] # previous bp position + else: + for line in in1: + count += 1 + #if count != 1: + i = line.strip().split("\t") + if i[0] == pc: + s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] + s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] + else: + pass + pc = i[0] # previous chr + pm = i[1] # previous marker + pcM = i[3] # previous cM position + pbp = i[4] # previous bp position + in1.close() + + ############################################## + + id = 0 + check1 = "" + check2 = "" + prev_m = 0 + prev_c = "0" + # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file) + r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+") + + # create temp output file + lookup_temp = tempfile.mktemp() + temp_outfile = open(lookup_temp,'w') + temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n") + + in2 = open(options.input2,'r') + for line in in2: # z file + if check2 == "": + count = 0 + i = line.strip() + m1 = r.match(i) + try: + c = m1.group(1) + ans = 1 + except: + ans = 0 + if ans == 1: + c = m1.group(1) + m = m1.group(2) + p = m1.group(3) + # if the previous marker number is larger than the current marker number, a new chromosome started + # before carrying on, add the last marker of the previous chromosome + if int(prev_m) > int(m): + id += 1 + add_m = int(prev_m)+1 + add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] + add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] + add_p = (float(add_cM) + 0.01)/100 + temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") + #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m] + # Calculation: + s1 = float(s1_l1[c+"\t"+m].split("\t")[0]) + l1 = float(s1_l1[c+"\t"+m].split("\t")[1]) + s2 = float(s2_l2[c+"\t"+m].split("\t")[0]) + l2 = float(s2_l2[c+"\t"+m].split("\t")[1]) + v1 = float(p)*100-0.01 + x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0)) + check1 = "done" + id += 1 + temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n") + # store previous chr and marker + prev_m = m + prev_c = c + # do not go on to the other blocks in the z.txt file + if ans == 0 and check1 == "done": + check2 = "stop" + # add the last marker of the last chromosome + id += 1 + add_m = int(prev_m)+1 + add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] + add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] + add_p = (float(add_cM) + 0.01)/100 + temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") + + in2.close() + temp_outfile.close() + + ############################################## + + # Aim1: add cM_length column + # Aim2: create lookup summary file + temp_infile = open(lookup_temp,'r') + in3 = open(options.input3,'r') # chr_length.txt + out1 = open(options.output1,'w') # lookup.txt + out2 = open(options.output2,'w') # lookup_summary.txt + + out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n") + chr_dict = {} + for line in in3: + l = line.strip().split("\t") + if l[0].startswith("chr"): + chr = l[0][3:] + else: + chr = l[0] + chr_dict[chr] = l[1] + + tot_num_int_pos = 0 + count = 0 + # calc totals for summary file + tot_markers = 0 + tot_bp = 0 + tot_cM = 0 + tot_int_pos = 0 + tot_bins = 0 + for line in temp_infile: + l = line.strip().split("\t") + if count == 0: + out1.write(line.strip()+"\n") + if count > 1: + if l[4] != "0.0": + length_cm = float(l[4])-float(prev_cm) + out1.write(prev_line+"\t"+str(length_cm)+"\n") + else: + # always the end of a chromosome + out1.write(prev_line+"\t0\n") + # summary file + pl = prev_line.split("\t") + num_int_pos = int(pl[0]) - tot_num_int_pos + tot_num_int_pos += num_int_pos + out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") + # calc totals for summary file + tot_markers += int(pl[2]) + tot_bp += int(chr_dict[pl[1]]) + tot_cM += float(pl[4]) + tot_int_pos += num_int_pos + tot_bins += (num_int_pos-1) + prev_line = line.strip() + prev_cm = l[4] + count += 1 + # the end of the last chromosome + out1.write(prev_line+"\t0\n") + # summary file + pl = prev_line.split("\t") + num_int_pos = int(pl[0]) - tot_num_int_pos + tot_num_int_pos += num_int_pos + out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") + # calc totals for summary file + tot_markers += int(pl[2]) + tot_bp += int(chr_dict[pl[1]]) + tot_cM += float(pl[4]) + tot_int_pos += num_int_pos + tot_bins += (num_int_pos-1) + out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n") + + temp_infile.close() + in3.close() + out1.close() + out2.close() + + ############################################## + +if __name__=="__main__": + __main__() + + +