Mercurial > repos > nanettec > lookup
changeset 2:a777732066cb draft
Uploaded
author | nanettec |
---|---|
date | Thu, 25 Feb 2016 05:25:25 -0500 |
parents | 54200db292a3 |
children | 942852f2cc8e |
files | lookup.py lookup.xml lookup/._lookup.py.gz lookup/._lookup.xml.gz lookup/lookup.py.gz lookup/lookup.xml.gz lookup/test-data/._chr_length.txt.gz lookup/test-data/._markers_filled_in.txt.gz lookup/test-data/._qtlcart.z.filepart.txt.gz lookup/test-data/chr_length.txt.gz lookup/test-data/markers_filled_in.txt.gz lookup/test-data/qtlcart.z.filepart.txt.gz |
diffstat | 12 files changed, 0 insertions(+), 378 deletions(-) [+] |
line wrap: on
line diff
--- a/lookup.py Thu Feb 25 04:58:58 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,280 +0,0 @@ -""" -@summary: The information from the Markers file and the QTL Cartographer Z file, -are combined to proportionally estimate a base pair position at each 2cM interval. -The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions. -@author: nanette.coetzer@gmail.com -@version 5 - -""" -import optparse, sys -import subprocess -import tempfile -import os, re - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option("-i", "--input1", default=None, dest="input1", - help="Markers file") - parser.add_option("-j", "--input2", default=None, dest="input2", - help="QTL Cartographer Z file") - parser.add_option("-k", "--input3", default=None, dest="input3", - help="Chromosome length file") - parser.add_option("-o", "--output1", default=None, dest="output1", - help="Lookup file") - parser.add_option("-p", "--output2", default=None, dest="output2", - help="Lookup summary file") - - (options, args) = parser.parse_args() - - try: - open(options.input1, "r").close() - except TypeError, e: - stop_err("You need to supply the Markers file:\n" + str(e)) - except IOError, e: - stop_err("Can not open the Markers file:\n" + str(e)) - - try: - open(options.input2, "r").close() - except TypeError, e: - stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e)) - except IOError, e: - stop_err("Can not open the QTL Cartographer Z file:\n" + str(e)) - - try: - open(options.input3, "r").close() - except TypeError, e: - stop_err("You need to supply the Chromosome length file:\n" + str(e)) - except IOError, e: - stop_err("Can not open the Chromosome length file:\n" + str(e)) - - - ############################################## - - in1 = open(options.input1,'r') # Markers.txt - prev_c = "0" - count = 0 - marker_dict = {} # For each unique chr + marker, store the cM + bp positions - last_marker_dict = {} # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately] - - for line in in1: # markers file - count += 1 - if line.startswith("c"): - # skip the header line - if count != 1: - # use info in each line to populate marker_dict - i = line.strip().split("\t") - marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] - chr = i[0] - if chr != prev_c and prev_c != "0": - # if the chr number changes, add previous marker info to last_marker_dict - last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp - prev_c = i[0] - prev_m = i[1] - prev_cM = i[3] - prev_bp = i[4] - else: - # no header row - # use info in each line to populate marker_dict - i = line.strip().split("\t") - marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] - chr = i[0] - if chr != prev_c and prev_c != "0": - # if the chr number changes, add previous marker info to last_marker_dict - last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp - prev_c = i[0] - prev_m = i[1] - prev_cM = i[3] - prev_bp = i[4] - - # add the last marker (on last chr) info to last_marker_dict - last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp - in1.close() - - ############################################## - - in1 = open(options.input1,'r') - s2_l2 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) bp position - s1_l1 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) cM position - count = 0 - pc = 0 - if line.startswith("c"): - for line in in1: - count += 1 - if count != 1: - i = line.strip().split("\t") - if i[0] == pc: - s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] - s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] - else: - pass - pc = i[0] # previous chr - pm = i[1] # previous marker - pcM = i[3] # previous cM position - pbp = i[4] # previous bp position - else: - for line in in1: - count += 1 - #if count != 1: - i = line.strip().split("\t") - if i[0] == pc: - s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] - s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] - else: - pass - pc = i[0] # previous chr - pm = i[1] # previous marker - pcM = i[3] # previous cM position - pbp = i[4] # previous bp position - in1.close() - - ############################################## - - id = 0 - check1 = "" - check2 = "" - prev_m = 0 - prev_c = "0" - # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file) - r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+") - - # create temp output file - lookup_temp = tempfile.mktemp() - temp_outfile = open(lookup_temp,'w') - temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n") - - in2 = open(options.input2,'r') - for line in in2: # z file - if check2 == "": - count = 0 - i = line.strip() - m1 = r.match(i) - try: - c = m1.group(1) - ans = 1 - except: - ans = 0 - if ans == 1: - c = m1.group(1) - m = m1.group(2) - p = m1.group(3) - # if the previous marker number is larger than the current marker number, a new chromosome started - # before carrying on, add the last marker of the previous chromosome - if int(prev_m) > int(m): - id += 1 - add_m = int(prev_m)+1 - add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] - add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] - add_p = (float(add_cM) + 0.01)/100 - temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") - #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m] - # Calculation: - s1 = float(s1_l1[c+"\t"+m].split("\t")[0]) - l1 = float(s1_l1[c+"\t"+m].split("\t")[1]) - s2 = float(s2_l2[c+"\t"+m].split("\t")[0]) - l2 = float(s2_l2[c+"\t"+m].split("\t")[1]) - v1 = float(p)*100-0.01 - x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0)) - check1 = "done" - id += 1 - temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n") - # store previous chr and marker - prev_m = m - prev_c = c - # do not go on to the other blocks in the z.txt file - if ans == 0 and check1 == "done": - check2 = "stop" - # add the last marker of the last chromosome - id += 1 - add_m = int(prev_m)+1 - add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] - add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] - add_p = (float(add_cM) + 0.01)/100 - temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") - - in2.close() - temp_outfile.close() - - ############################################## - - # Aim1: add cM_length column - # Aim2: create lookup summary file - temp_infile = open(lookup_temp,'r') - in3 = open(options.input3,'r') # chr_length.txt - out1 = open(options.output1,'w') # lookup.txt - out2 = open(options.output2,'w') # lookup_summary.txt - - out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n") - chr_dict = {} - for line in in3: - l = line.strip().split("\t") - if l[0].startswith("chr"): - chr = l[0][3:] - else: - chr = l[0] - chr_dict[chr] = l[1] - - tot_num_int_pos = 0 - count = 0 - # calc totals for summary file - tot_markers = 0 - tot_bp = 0 - tot_cM = 0 - tot_int_pos = 0 - tot_bins = 0 - for line in temp_infile: - l = line.strip().split("\t") - if count == 0: - out1.write(line.strip()+"\n") - if count > 1: - if l[4] != "0.0": - length_cm = float(l[4])-float(prev_cm) - out1.write(prev_line+"\t"+str(length_cm)+"\n") - else: - # always the end of a chromosome - out1.write(prev_line+"\t0\n") - # summary file - pl = prev_line.split("\t") - num_int_pos = int(pl[0]) - tot_num_int_pos - tot_num_int_pos += num_int_pos - out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") - # calc totals for summary file - tot_markers += int(pl[2]) - tot_bp += int(chr_dict[pl[1]]) - tot_cM += float(pl[4]) - tot_int_pos += num_int_pos - tot_bins += (num_int_pos-1) - prev_line = line.strip() - prev_cm = l[4] - count += 1 - # the end of the last chromosome - out1.write(prev_line+"\t0\n") - # summary file - pl = prev_line.split("\t") - num_int_pos = int(pl[0]) - tot_num_int_pos - tot_num_int_pos += num_int_pos - out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") - # calc totals for summary file - tot_markers += int(pl[2]) - tot_bp += int(chr_dict[pl[1]]) - tot_cM += float(pl[4]) - tot_int_pos += num_int_pos - tot_bins += (num_int_pos-1) - out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n") - - temp_infile.close() - in3.close() - out1.close() - out2.close() - - ############################################## - -if __name__=="__main__": - __main__() - - -
--- a/lookup.xml Thu Feb 25 04:58:58 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -<tool id="lookup5" name="Lookup table" version="5.0.0"> - <description> for 2cM intervals</description> - <command interpreter="python"> - lookup.py --input1 $input1 --input2 $input2 --input3 $input3 --output1 $output1 --output2 $output2 - </command> - <inputs> - <param label="Markers file" name="input1" type="data" format="tabular" help="Tabular file giving marker positions in cM and bp"></param> - <param label="QTL Cartographer Z file" name="input2" type="data" format="tabular" help="QTL Cartographer output file (.z)"></param> - <param label="Chromosome length file" name="input3" type="data" format="tabular" help="A tabular file giving the length of each chromosome in bp"></param> - </inputs> - <outputs> - <data format="tabular" name="output1" /> - <data format="tabular" name="output2" /> - </outputs> - <requirements> - </requirements> - <tests> - <test> - </test> - </tests> - <help> - -**What it does** - -The information from the Markers file and the QTL Cartographer Z file, are combined to proportionally estimate a base pair position at each 2cM interval. - -The Lookup file can then serve as a lookup table to convert between base pair and centimorgan positions. - -------- - -**Example input files** - -Markers file, each row correspond to a marker (5 columns; only a part of the file is shown):: - - chr marker name cM bp - 1 1 marker1 0 2038278 - 1 2 marker2 7.53 3649871 - 1 3 marker3 18.77 6155281 - 1 4 marker4 38.71 12398322 - 1 5 marker5 44.62 14171692 - 1 6 marker6 48.73 16529505 - -QTL Cartographer Z file, each row correspond to a 2cM interval (the example shows part of the important part of the Z file, the first 3 data columns of the first block). Note: Headers is fine and will be ignored:: - - 1 1 0.0001 … - 1 1 0.0201 … - 1 1 0.0401 … - 1 1 0.0601 … - 1 2 0.0754 … - 1 2 0.0954 … - 1 2 0.1154 … - 1 2 0.1354 … - 1 2 0.1554 … - 1 2 0.1754 … - 1 3 0.1878 … - 1 3 0.2078 … - 1 3 0.2278 … - 1 3 0.2478 … - -Chromosome length file, each row correspond to a chromosome (2 columns; only a part of the file is shown):: - - chr1 301354135 - chr2 237068873 - chr3 232140174 - chr4 241473504 - chr5 217872852 - chr6 169174353 - -------- - -**Example output files** - - -Lookup output file, each row correspond to a 2 cM interval (7 columns; only a part of the file is shown):: - - int.id chr marker int cM bp length_cM - 1 1 1 0.0001 0.0 2038278 2.0 - 2 1 1 0.0201 2.0 2466324 2.0 - 3 1 1 0.0401 4.0 2894370 2.0 - 4 1 1 0.0601 6.0 3322416 1.53 - 5 1 2 0.0754 7.53 3649871 2.0 - 6 1 2 0.0954 9.53 4095673 2.0 - 7 1 2 0.1154 11.53 4541476 2.0 - 8 1 2 0.1354 13.53 4987278 2.0 - -Chromosome summary file, each row correspond to a chromosome (6 columns; only a part of the file is shown). The last row gives the total across the genome:: - - chr markers cM bp int_positions bins - 1 27 324.4 301354135 177 176 - 2 14 169.11 237068873 92 91 - 3 19 221.29 232140174 123 122 - 4 20 188.37 241473504 105 104 - 5 20 203.82 217872852 110 109 - 6 17 195.85 169174353 106 105 - Total 117 1302.84 1399083891 713 707 - - </help> -</tool>