Mercurial > repos > nanettec > lookup
view lookup.py @ 0:eac0e302ab3a draft
Uploaded
author | nanettec |
---|---|
date | Thu, 25 Feb 2016 04:58:09 -0500 |
parents | |
children |
line wrap: on
line source
""" @summary: The information from the Markers file and the QTL Cartographer Z file, are combined to proportionally estimate a base pair position at each 2cM interval. The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions. @author: nanette.coetzer@gmail.com @version 5 """ import optparse, sys import subprocess import tempfile import os, re def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.exit() def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option("-i", "--input1", default=None, dest="input1", help="Markers file") parser.add_option("-j", "--input2", default=None, dest="input2", help="QTL Cartographer Z file") parser.add_option("-k", "--input3", default=None, dest="input3", help="Chromosome length file") parser.add_option("-o", "--output1", default=None, dest="output1", help="Lookup file") parser.add_option("-p", "--output2", default=None, dest="output2", help="Lookup summary file") (options, args) = parser.parse_args() try: open(options.input1, "r").close() except TypeError, e: stop_err("You need to supply the Markers file:\n" + str(e)) except IOError, e: stop_err("Can not open the Markers file:\n" + str(e)) try: open(options.input2, "r").close() except TypeError, e: stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e)) except IOError, e: stop_err("Can not open the QTL Cartographer Z file:\n" + str(e)) try: open(options.input3, "r").close() except TypeError, e: stop_err("You need to supply the Chromosome length file:\n" + str(e)) except IOError, e: stop_err("Can not open the Chromosome length file:\n" + str(e)) ############################################## in1 = open(options.input1,'r') # Markers.txt prev_c = "0" count = 0 marker_dict = {} # For each unique chr + marker, store the cM + bp positions last_marker_dict = {} # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately] for line in in1: # markers file count += 1 if line.startswith("c"): # skip the header line if count != 1: # use info in each line to populate marker_dict i = line.strip().split("\t") marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] chr = i[0] if chr != prev_c and prev_c != "0": # if the chr number changes, add previous marker info to last_marker_dict last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp prev_c = i[0] prev_m = i[1] prev_cM = i[3] prev_bp = i[4] else: # no header row # use info in each line to populate marker_dict i = line.strip().split("\t") marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] chr = i[0] if chr != prev_c and prev_c != "0": # if the chr number changes, add previous marker info to last_marker_dict last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp prev_c = i[0] prev_m = i[1] prev_cM = i[3] prev_bp = i[4] # add the last marker (on last chr) info to last_marker_dict last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp in1.close() ############################################## in1 = open(options.input1,'r') s2_l2 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) bp position s1_l1 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) cM position count = 0 pc = 0 if line.startswith("c"): for line in in1: count += 1 if count != 1: i = line.strip().split("\t") if i[0] == pc: s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] else: pass pc = i[0] # previous chr pm = i[1] # previous marker pcM = i[3] # previous cM position pbp = i[4] # previous bp position else: for line in in1: count += 1 #if count != 1: i = line.strip().split("\t") if i[0] == pc: s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] else: pass pc = i[0] # previous chr pm = i[1] # previous marker pcM = i[3] # previous cM position pbp = i[4] # previous bp position in1.close() ############################################## id = 0 check1 = "" check2 = "" prev_m = 0 prev_c = "0" # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file) r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+") # create temp output file lookup_temp = tempfile.mktemp() temp_outfile = open(lookup_temp,'w') temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n") in2 = open(options.input2,'r') for line in in2: # z file if check2 == "": count = 0 i = line.strip() m1 = r.match(i) try: c = m1.group(1) ans = 1 except: ans = 0 if ans == 1: c = m1.group(1) m = m1.group(2) p = m1.group(3) # if the previous marker number is larger than the current marker number, a new chromosome started # before carrying on, add the last marker of the previous chromosome if int(prev_m) > int(m): id += 1 add_m = int(prev_m)+1 add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] add_p = (float(add_cM) + 0.01)/100 temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m] # Calculation: s1 = float(s1_l1[c+"\t"+m].split("\t")[0]) l1 = float(s1_l1[c+"\t"+m].split("\t")[1]) s2 = float(s2_l2[c+"\t"+m].split("\t")[0]) l2 = float(s2_l2[c+"\t"+m].split("\t")[1]) v1 = float(p)*100-0.01 x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0)) check1 = "done" id += 1 temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n") # store previous chr and marker prev_m = m prev_c = c # do not go on to the other blocks in the z.txt file if ans == 0 and check1 == "done": check2 = "stop" # add the last marker of the last chromosome id += 1 add_m = int(prev_m)+1 add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] add_p = (float(add_cM) + 0.01)/100 temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") in2.close() temp_outfile.close() ############################################## # Aim1: add cM_length column # Aim2: create lookup summary file temp_infile = open(lookup_temp,'r') in3 = open(options.input3,'r') # chr_length.txt out1 = open(options.output1,'w') # lookup.txt out2 = open(options.output2,'w') # lookup_summary.txt out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n") chr_dict = {} for line in in3: l = line.strip().split("\t") if l[0].startswith("chr"): chr = l[0][3:] else: chr = l[0] chr_dict[chr] = l[1] tot_num_int_pos = 0 count = 0 # calc totals for summary file tot_markers = 0 tot_bp = 0 tot_cM = 0 tot_int_pos = 0 tot_bins = 0 for line in temp_infile: l = line.strip().split("\t") if count == 0: out1.write(line.strip()+"\n") if count > 1: if l[4] != "0.0": length_cm = float(l[4])-float(prev_cm) out1.write(prev_line+"\t"+str(length_cm)+"\n") else: # always the end of a chromosome out1.write(prev_line+"\t0\n") # summary file pl = prev_line.split("\t") num_int_pos = int(pl[0]) - tot_num_int_pos tot_num_int_pos += num_int_pos out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") # calc totals for summary file tot_markers += int(pl[2]) tot_bp += int(chr_dict[pl[1]]) tot_cM += float(pl[4]) tot_int_pos += num_int_pos tot_bins += (num_int_pos-1) prev_line = line.strip() prev_cm = l[4] count += 1 # the end of the last chromosome out1.write(prev_line+"\t0\n") # summary file pl = prev_line.split("\t") num_int_pos = int(pl[0]) - tot_num_int_pos tot_num_int_pos += num_int_pos out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") # calc totals for summary file tot_markers += int(pl[2]) tot_bp += int(chr_dict[pl[1]]) tot_cM += float(pl[4]) tot_int_pos += num_int_pos tot_bins += (num_int_pos-1) out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n") temp_infile.close() in3.close() out1.close() out2.close() ############################################## if __name__=="__main__": __main__()