lookup: lookup.py comparison

comparison lookup.py @ 0:eac0e302ab3a draft

Uploaded

author	nanettec
date	Thu, 25 Feb 2016 04:58:09 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:eac0e302ab3a
+"""
+@summary: The information from the Markers file and the QTL Cartographer Z file,
+are combined to proportionally estimate a base pair position at each 2cM interval.
+The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions.
+@author: nanette.coetzer@gmail.com
+@version 5
+"""
+import optparse, sys
+import subprocess
+import tempfile
+import os, re
+def stop_err( msg ):
+sys.stderr.write( "%s\n" % msg )
+sys.exit()
+def __main__():
+#Parse Command Line
+parser = optparse.OptionParser()
+parser.add_option("-i", "--input1", default=None, dest="input1",
+help="Markers file")
+parser.add_option("-j", "--input2", default=None, dest="input2",
+help="QTL Cartographer Z file")
+parser.add_option("-k", "--input3", default=None, dest="input3",
+help="Chromosome length file")
+parser.add_option("-o", "--output1", default=None, dest="output1",
+help="Lookup file")
+parser.add_option("-p", "--output2", default=None, dest="output2",
+help="Lookup summary file")
+(options, args) = parser.parse_args()
+try:
+open(options.input1, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Markers file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Markers file:\n" + str(e))
+try:
+open(options.input2, "r").close()
+except TypeError, e:
+stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the QTL Cartographer Z file:\n" + str(e))
+try:
+open(options.input3, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Chromosome length file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Chromosome length file:\n" + str(e))
+##############################################
+in1 = open(options.input1,'r')  # Markers.txt
+prev_c = "0"
+count = 0
+marker_dict = {}    # For each unique chr + marker, store the cM + bp positions
+last_marker_dict = {}   # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately]
+for line in in1:    # markers file
+count += 1
+if line.startswith("c"):
+# skip the header line
+if count != 1:
+# use info in each line to populate marker_dict
+i = line.strip().split("\t")
+marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
+chr = i[0]
+if chr != prev_c and prev_c != "0":
+# if the chr number changes, add previous marker info to last_marker_dict
+last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
+prev_c = i[0]
+prev_m = i[1]
+prev_cM = i[3]
+prev_bp = i[4]
+else:
+# no header row
+# use info in each line to populate marker_dict
+i = line.strip().split("\t")
+marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
+chr = i[0]
+if chr != prev_c and prev_c != "0":
+# if the chr number changes, add previous marker info to last_marker_dict
+last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
+prev_c = i[0]
+prev_m = i[1]
+prev_cM = i[3]
+prev_bp = i[4]
+# add the last marker (on last chr) info to last_marker_dict
+last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
+in1.close()
+##############################################
+in1 = open(options.input1,'r')
+s2_l2 = {}  # For each unique chr + marker, store the previous (smallest) and current (largest) bp position
+s1_l1 = {}  # For each unique chr + marker, store the previous (smallest) and current (largest) cM position
+count = 0
+pc = 0
+if line.startswith("c"):
+for line in in1:
+count += 1
+if count != 1:
+i = line.strip().split("\t")
+if i[0] == pc:
+s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
+s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
+else:
+pass
+pc = i[0]   # previous chr
+pm = i[1]   # previous marker
+pcM = i[3]   # previous cM position
+pbp = i[4]   # previous bp position
+else:
+for line in in1:
+count += 1
+#if count != 1:
+i = line.strip().split("\t")
+if i[0] == pc:
+s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
+s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
+else:
+pass
+pc = i[0]   # previous chr
+pm = i[1]   # previous marker
+pcM = i[3]   # previous cM position
+pbp = i[4]   # previous bp position
+in1.close()
+##############################################
+id = 0
+check1 = ""
+check2 = ""
+prev_m = 0
+prev_c = "0"
+# parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file)
+r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
+# create temp output file
+lookup_temp = tempfile.mktemp()
+temp_outfile = open(lookup_temp,'w')
+temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n")
+in2 = open(options.input2,'r')
+for line in in2:    # z file
+if check2 == "":
+count = 0
+i = line.strip()
+m1 = r.match(i)
+try:
+c = m1.group(1)
+ans = 1
+except:
+ans = 0
+if ans == 1:
+c = m1.group(1)
+m = m1.group(2)
+p = m1.group(3)
+# if the previous marker number is larger than the current marker number, a new chromosome started
+# before carrying on, add the last marker of the previous chromosome
+if int(prev_m) > int(m):
+id += 1
+add_m = int(prev_m)+1
+add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
+add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
+add_p = (float(add_cM) + 0.01)/100
+temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
+#print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m]
+# Calculation:
+s1 = float(s1_l1[c+"\t"+m].split("\t")[0])
+l1 = float(s1_l1[c+"\t"+m].split("\t")[1])
+s2 = float(s2_l2[c+"\t"+m].split("\t")[0])
+l2 = float(s2_l2[c+"\t"+m].split("\t")[1])
+v1 = float(p)*100-0.01
+x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0))
+check1 = "done"
+id += 1
+temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n")
+# store previous chr and marker
+prev_m = m
+prev_c = c
+# do not go on to the other blocks in the z.txt file
+if ans == 0 and check1 == "done":
+check2 = "stop"
+# add the last marker of the last chromosome
+id += 1
+add_m = int(prev_m)+1
+add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
+add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
+add_p = (float(add_cM) + 0.01)/100
+temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
+in2.close()
+temp_outfile.close()
+##############################################
+# Aim1: add cM_length column
+# Aim2: create lookup summary file
+temp_infile = open(lookup_temp,'r')
+in3 = open(options.input3,'r')  # chr_length.txt
+out1 = open(options.output1,'w')    # lookup.txt
+out2 = open(options.output2,'w')    # lookup_summary.txt
+out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n")
+chr_dict = {}
+for line in in3:
+l = line.strip().split("\t")
+if l[0].startswith("chr"):
+chr = l[0][3:]
+else:
+chr = l[0]
+chr_dict[chr] = l[1]
+tot_num_int_pos = 0
+count = 0
+# calc totals for summary file
+tot_markers = 0
+tot_bp = 0
+tot_cM = 0
+tot_int_pos = 0
+tot_bins = 0
+for line in temp_infile:
+l = line.strip().split("\t")
+if count == 0:
+out1.write(line.strip()+"\n")
+if count > 1:
+if l[4] != "0.0":
+length_cm = float(l[4])-float(prev_cm)
+out1.write(prev_line+"\t"+str(length_cm)+"\n")
+else:
+# always the end of a chromosome
+out1.write(prev_line+"\t0\n")
+# summary file
+pl = prev_line.split("\t")
+num_int_pos = int(pl[0]) - tot_num_int_pos
+tot_num_int_pos += num_int_pos
+out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
+# calc totals for summary file
+tot_markers += int(pl[2])
+tot_bp += int(chr_dict[pl[1]])
+tot_cM += float(pl[4])
+tot_int_pos += num_int_pos
+tot_bins += (num_int_pos-1)
+prev_line = line.strip()
+prev_cm = l[4]
+count += 1
+# the end of the last chromosome
+out1.write(prev_line+"\t0\n")
+# summary file
+pl = prev_line.split("\t")
+num_int_pos = int(pl[0]) - tot_num_int_pos
+tot_num_int_pos += num_int_pos
+out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
+# calc totals for summary file
+tot_markers += int(pl[2])
+tot_bp += int(chr_dict[pl[1]])
+tot_cM += float(pl[4])
+tot_int_pos += num_int_pos
+tot_bins += (num_int_pos-1)
+out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n")
+temp_infile.close()
+in3.close()
+out1.close()
+out2.close()
+##############################################
+if __name__=="__main__":
+__main__()

Mercurial > repos > nanettec > lookup

comparison lookup.py @ 0:eac0e302ab3a draft