view lookup.py @ 0:eac0e302ab3a draft

Uploaded
author nanettec
date Thu, 25 Feb 2016 04:58:09 -0500
parents
children
line wrap: on
line source

"""
@summary: The information from the Markers file and the QTL Cartographer Z file,
are combined to proportionally estimate a base pair position at each 2cM interval.
The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions.
@author: nanette.coetzer@gmail.com
@version 5

"""
import optparse, sys
import subprocess
import tempfile
import os, re

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()
 
def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option("-i", "--input1", default=None, dest="input1", 
                      help="Markers file")
    parser.add_option("-j", "--input2", default=None, dest="input2", 
                      help="QTL Cartographer Z file")
    parser.add_option("-k", "--input3", default=None, dest="input3", 
                      help="Chromosome length file")
    parser.add_option("-o", "--output1", default=None, dest="output1", 
                      help="Lookup file")
    parser.add_option("-p", "--output2", default=None, dest="output2", 
                      help="Lookup summary file")
    
    (options, args) = parser.parse_args()
    
    try:
        open(options.input1, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Markers file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Markers file:\n" + str(e))
        
    try:
        open(options.input2, "r").close()
    except TypeError, e:
        stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the QTL Cartographer Z file:\n" + str(e))
        
    try:
        open(options.input3, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Chromosome length file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Chromosome length file:\n" + str(e))


    ##############################################

    in1 = open(options.input1,'r')  # Markers.txt
    prev_c = "0"
    count = 0
    marker_dict = {}    # For each unique chr + marker, store the cM + bp positions
    last_marker_dict = {}   # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately]

    for line in in1:    # markers file
        count += 1
        if line.startswith("c"):
            # skip the header line
            if count != 1:
                # use info in each line to populate marker_dict
                i = line.strip().split("\t")
                marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
                chr = i[0]
                if chr != prev_c and prev_c != "0":
                    # if the chr number changes, add previous marker info to last_marker_dict
                    last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
                prev_c = i[0]
                prev_m = i[1]
                prev_cM = i[3]
                prev_bp = i[4]
        else:
            # no header row
            # use info in each line to populate marker_dict
            i = line.strip().split("\t")
            marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
            chr = i[0]
            if chr != prev_c and prev_c != "0":
                # if the chr number changes, add previous marker info to last_marker_dict
                last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
            prev_c = i[0]
            prev_m = i[1]
            prev_cM = i[3]
            prev_bp = i[4]
    
    # add the last marker (on last chr) info to last_marker_dict
    last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
    in1.close()
    
    ##############################################
    
    in1 = open(options.input1,'r') 
    s2_l2 = {}  # For each unique chr + marker, store the previous (smallest) and current (largest) bp position
    s1_l1 = {}  # For each unique chr + marker, store the previous (smallest) and current (largest) cM position
    count = 0
    pc = 0
    if line.startswith("c"):
        for line in in1:
            count += 1
            if count != 1:
                i = line.strip().split("\t")
                if i[0] == pc:
                    s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
                    s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
                else:
                    pass
                pc = i[0]   # previous chr
                pm = i[1]   # previous marker
                pcM = i[3]   # previous cM position
                pbp = i[4]   # previous bp position
    else:
        for line in in1:
            count += 1
            #if count != 1:
            i = line.strip().split("\t")
            if i[0] == pc:
                s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
                s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
            else:
                pass
            pc = i[0]   # previous chr
            pm = i[1]   # previous marker
            pcM = i[3]   # previous cM position
            pbp = i[4]   # previous bp position
    in1.close()
    
    ##############################################
    
    id = 0
    check1 = ""
    check2 = ""
    prev_m = 0
    prev_c = "0"
    # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file)
    r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
    
    # create temp output file
    lookup_temp = tempfile.mktemp()
    temp_outfile = open(lookup_temp,'w')
    temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n")
    
    in2 = open(options.input2,'r')
    for line in in2:    # z file
        if check2 == "":
            count = 0
            i = line.strip()
            m1 = r.match(i)
            try:
                c = m1.group(1)
                ans = 1
            except:
                ans = 0
            if ans == 1:
                c = m1.group(1)
                m = m1.group(2)
                p = m1.group(3)
                # if the previous marker number is larger than the current marker number, a new chromosome started
                # before carrying on, add the last marker of the previous chromosome
                if int(prev_m) > int(m):
                    id += 1
                    add_m = int(prev_m)+1
                    add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
                    add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
                    add_p = (float(add_cM) + 0.01)/100
                    temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
                #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m]
                # Calculation: 
                s1 = float(s1_l1[c+"\t"+m].split("\t")[0])
                l1 = float(s1_l1[c+"\t"+m].split("\t")[1])
                s2 = float(s2_l2[c+"\t"+m].split("\t")[0])
                l2 = float(s2_l2[c+"\t"+m].split("\t")[1])
                v1 = float(p)*100-0.01
                x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0))
                check1 = "done"
                id += 1
                temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n")
                # store previous chr and marker
                prev_m = m
                prev_c = c
            # do not go on to the other blocks in the z.txt file
            if ans == 0 and check1 == "done":
                check2 = "stop"
    # add the last marker of the last chromosome
    id += 1
    add_m = int(prev_m)+1
    add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
    add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
    add_p = (float(add_cM) + 0.01)/100
    temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
    
    in2.close() 
    temp_outfile.close()
      
    ##############################################
    
    # Aim1: add cM_length column
    # Aim2: create lookup summary file
    temp_infile = open(lookup_temp,'r')
    in3 = open(options.input3,'r')  # chr_length.txt
    out1 = open(options.output1,'w')    # lookup.txt
    out2 = open(options.output2,'w')    # lookup_summary.txt    
    
    out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n")
    chr_dict = {}
    for line in in3:
        l = line.strip().split("\t")
        if l[0].startswith("chr"):
            chr = l[0][3:]
        else:
            chr = l[0]
        chr_dict[chr] = l[1]
    
    tot_num_int_pos = 0
    count = 0
    # calc totals for summary file
    tot_markers = 0
    tot_bp = 0
    tot_cM = 0
    tot_int_pos = 0
    tot_bins = 0
    for line in temp_infile:
        l = line.strip().split("\t")
        if count == 0:
            out1.write(line.strip()+"\n")
        if count > 1:
            if l[4] != "0.0":
                length_cm = float(l[4])-float(prev_cm)
                out1.write(prev_line+"\t"+str(length_cm)+"\n")
            else:
                # always the end of a chromosome
                out1.write(prev_line+"\t0\n")
                # summary file
                pl = prev_line.split("\t")
                num_int_pos = int(pl[0]) - tot_num_int_pos
                tot_num_int_pos += num_int_pos
                out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
                # calc totals for summary file
                tot_markers += int(pl[2])
                tot_bp += int(chr_dict[pl[1]])
                tot_cM += float(pl[4])
                tot_int_pos += num_int_pos
                tot_bins += (num_int_pos-1)       
        prev_line = line.strip()
        prev_cm = l[4]    
        count += 1
    # the end of the last chromosome
    out1.write(prev_line+"\t0\n")
    # summary file
    pl = prev_line.split("\t")
    num_int_pos = int(pl[0]) - tot_num_int_pos
    tot_num_int_pos += num_int_pos
    out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
    # calc totals for summary file
    tot_markers += int(pl[2])
    tot_bp += int(chr_dict[pl[1]])
    tot_cM += float(pl[4])
    tot_int_pos += num_int_pos
    tot_bins += (num_int_pos-1)
    out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n")
    
    temp_infile.close()
    in3.close()
    out1.close()
    out2.close()
            
    ##############################################
    
if __name__=="__main__": 
    __main__()