Mercurial > repos > nanettec > lookup

--- a/lookup.py	Thu Feb 25 04:58:58 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,280 +0,0 @@
-"""
-@summary: The information from the Markers file and the QTL Cartographer Z file,
-are combined to proportionally estimate a base pair position at each 2cM interval.
-The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions.
-@author: nanette.coetzer@gmail.com
-@version 5
-
-"""
-import optparse, sys
-import subprocess
-import tempfile
-import os, re
-
-def stop_err( msg ):
-    sys.stderr.write( "%s\n" % msg )
-    sys.exit()
-
-def __main__():
-    #Parse Command Line
-    parser = optparse.OptionParser()
-    parser.add_option("-i", "--input1", default=None, dest="input1",
-                      help="Markers file")
-    parser.add_option("-j", "--input2", default=None, dest="input2",
-                      help="QTL Cartographer Z file")
-    parser.add_option("-k", "--input3", default=None, dest="input3",
-                      help="Chromosome length file")
-    parser.add_option("-o", "--output1", default=None, dest="output1",
-                      help="Lookup file")
-    parser.add_option("-p", "--output2", default=None, dest="output2",
-                      help="Lookup summary file")
-
-    (options, args) = parser.parse_args()
-
-    try:
-        open(options.input1, "r").close()
-    except TypeError, e:
-        stop_err("You need to supply the Markers file:\n" + str(e))
-    except IOError, e:
-        stop_err("Can not open the Markers file:\n" + str(e))
-
-    try:
-        open(options.input2, "r").close()
-    except TypeError, e:
-        stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e))
-    except IOError, e:
-        stop_err("Can not open the QTL Cartographer Z file:\n" + str(e))
-
-    try:
-        open(options.input3, "r").close()
-    except TypeError, e:
-        stop_err("You need to supply the Chromosome length file:\n" + str(e))
-    except IOError, e:
-        stop_err("Can not open the Chromosome length file:\n" + str(e))
-
-
-    ##############################################
-
-    in1 = open(options.input1,'r')  # Markers.txt
-    prev_c = "0"
-    count = 0
-    marker_dict = {}    # For each unique chr + marker, store the cM + bp positions
-    last_marker_dict = {}   # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately]
-
-    for line in in1:    # markers file
-        count += 1
-        if line.startswith("c"):
-            # skip the header line
-            if count != 1:
-                # use info in each line to populate marker_dict
-                i = line.strip().split("\t")
-                marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
-                chr = i[0]
-                if chr != prev_c and prev_c != "0":
-                    # if the chr number changes, add previous marker info to last_marker_dict
-                    last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
-                prev_c = i[0]
-                prev_m = i[1]
-                prev_cM = i[3]
-                prev_bp = i[4]
-        else:
-            # no header row
-            # use info in each line to populate marker_dict
-            i = line.strip().split("\t")
-            marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
-            chr = i[0]
-            if chr != prev_c and prev_c != "0":
-                # if the chr number changes, add previous marker info to last_marker_dict
-                last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
-            prev_c = i[0]
-            prev_m = i[1]
-            prev_cM = i[3]
-            prev_bp = i[4]
-
-    # add the last marker (on last chr) info to last_marker_dict
-    last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
-    in1.close()
-
-    ##############################################
-
-    in1 = open(options.input1,'r')
-    s2_l2 = {}  # For each unique chr + marker, store the previous (smallest) and current (largest) bp position
-    s1_l1 = {}  # For each unique chr + marker, store the previous (smallest) and current (largest) cM position
-    count = 0
-    pc = 0
-    if line.startswith("c"):
-        for line in in1:
-            count += 1
-            if count != 1:
-                i = line.strip().split("\t")
-                if i[0] == pc:
-                    s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
-                    s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
-                else:
-                    pass
-                pc = i[0]   # previous chr
-                pm = i[1]   # previous marker
-                pcM = i[3]   # previous cM position
-                pbp = i[4]   # previous bp position
-    else:
-        for line in in1:
-            count += 1
-            #if count != 1:
-            i = line.strip().split("\t")
-            if i[0] == pc:
-                s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
-                s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
-            else:
-                pass
-            pc = i[0]   # previous chr
-            pm = i[1]   # previous marker
-            pcM = i[3]   # previous cM position
-            pbp = i[4]   # previous bp position
-    in1.close()
-
-    ##############################################
-
-    id = 0
-    check1 = ""
-    check2 = ""
-    prev_m = 0
-    prev_c = "0"
-    # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file)
-    r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
-
-    # create temp output file
-    lookup_temp = tempfile.mktemp()
-    temp_outfile = open(lookup_temp,'w')
-    temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n")
-
-    in2 = open(options.input2,'r')
-    for line in in2:    # z file
-        if check2 == "":
-            count = 0
-            i = line.strip()
-            m1 = r.match(i)
-            try:
-                c = m1.group(1)
-                ans = 1
-            except:
-                ans = 0
-            if ans == 1:
-                c = m1.group(1)
-                m = m1.group(2)
-                p = m1.group(3)
-                # if the previous marker number is larger than the current marker number, a new chromosome started
-                # before carrying on, add the last marker of the previous chromosome
-                if int(prev_m) > int(m):
-                    id += 1
-                    add_m = int(prev_m)+1
-                    add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
-                    add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
-                    add_p = (float(add_cM) + 0.01)/100
-                    temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
-                #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m]
-                # Calculation:
-                s1 = float(s1_l1[c+"\t"+m].split("\t")[0])
-                l1 = float(s1_l1[c+"\t"+m].split("\t")[1])
-                s2 = float(s2_l2[c+"\t"+m].split("\t")[0])
-                l2 = float(s2_l2[c+"\t"+m].split("\t")[1])
-                v1 = float(p)*100-0.01
-                x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0))
-                check1 = "done"
-                id += 1
-                temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n")
-                # store previous chr and marker
-                prev_m = m
-                prev_c = c
-            # do not go on to the other blocks in the z.txt file
-            if ans == 0 and check1 == "done":
-                check2 = "stop"
-    # add the last marker of the last chromosome
-    id += 1
-    add_m = int(prev_m)+1
-    add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
-    add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
-    add_p = (float(add_cM) + 0.01)/100
-    temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
-
-    in2.close()
-    temp_outfile.close()
-
-    ##############################################
-
-    # Aim1: add cM_length column
-    # Aim2: create lookup summary file
-    temp_infile = open(lookup_temp,'r')
-    in3 = open(options.input3,'r')  # chr_length.txt
-    out1 = open(options.output1,'w')    # lookup.txt
-    out2 = open(options.output2,'w')    # lookup_summary.txt
-
-    out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n")
-    chr_dict = {}
-    for line in in3:
-        l = line.strip().split("\t")
-        if l[0].startswith("chr"):
-            chr = l[0][3:]
-        else:
-            chr = l[0]
-        chr_dict[chr] = l[1]
-
-    tot_num_int_pos = 0
-    count = 0
-    # calc totals for summary file
-    tot_markers = 0
-    tot_bp = 0
-    tot_cM = 0
-    tot_int_pos = 0
-    tot_bins = 0
-    for line in temp_infile:
-        l = line.strip().split("\t")
-        if count == 0:
-            out1.write(line.strip()+"\n")
-        if count > 1:
-            if l[4] != "0.0":
-                length_cm = float(l[4])-float(prev_cm)
-                out1.write(prev_line+"\t"+str(length_cm)+"\n")
-            else:
-                # always the end of a chromosome
-                out1.write(prev_line+"\t0\n")
-                # summary file
-                pl = prev_line.split("\t")
-                num_int_pos = int(pl[0]) - tot_num_int_pos
-                tot_num_int_pos += num_int_pos
-                out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
-                # calc totals for summary file
-                tot_markers += int(pl[2])
-                tot_bp += int(chr_dict[pl[1]])
-                tot_cM += float(pl[4])
-                tot_int_pos += num_int_pos
-                tot_bins += (num_int_pos-1)
-        prev_line = line.strip()
-        prev_cm = l[4]
-        count += 1
-    # the end of the last chromosome
-    out1.write(prev_line+"\t0\n")
-    # summary file
-    pl = prev_line.split("\t")
-    num_int_pos = int(pl[0]) - tot_num_int_pos
-    tot_num_int_pos += num_int_pos
-    out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
-    # calc totals for summary file
-    tot_markers += int(pl[2])
-    tot_bp += int(chr_dict[pl[1]])
-    tot_cM += float(pl[4])
-    tot_int_pos += num_int_pos
-    tot_bins += (num_int_pos-1)
-    out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n")
-
-    temp_infile.close()
-    in3.close()
-    out1.close()
-    out2.close()
-
-    ##############################################
-
-if __name__=="__main__":
-    __main__()
-
-
-
--- a/lookup.xml	Thu Feb 25 04:58:58 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,98 +0,0 @@
-<tool id="lookup5" name="Lookup table" version="5.0.0">
-	<description> for 2cM intervals</description>
-	<command interpreter="python">
-            lookup.py --input1 $input1 --input2 $input2 --input3 $input3 --output1 $output1 --output2 $output2
-	</command>
-        <inputs>
-            <param label="Markers file" name="input1" type="data" format="tabular" help="Tabular file giving marker positions in cM and bp"></param>
-            <param label="QTL Cartographer Z file" name="input2" type="data" format="tabular" help="QTL Cartographer output file (.z)"></param>
-	    <param label="Chromosome length file" name="input3" type="data" format="tabular" help="A tabular file giving the length of each chromosome in bp"></param>
-        </inputs>
-	<outputs>
-                <data format="tabular" name="output1" />
-		<data format="tabular" name="output2" />
-	</outputs>
-	<requirements>
-	</requirements>
-	<tests>
-          <test>
-          </test>
-	</tests>
-	<help>
-
-**What it does**
-
-The information from the Markers file and the QTL Cartographer Z file, are combined to proportionally estimate a base pair position at each 2cM interval.
-
-The Lookup file can then serve as a lookup table to convert between base pair and centimorgan positions.
-
--------
-
-**Example input files**
-
-Markers file, each row correspond to a marker (5 columns; only a part of the file is shown)::
-
- chr	marker	name	cM	bp
- 1	1	marker1	0	2038278
- 1	2	marker2	7.53	3649871
- 1	3	marker3	18.77	6155281
- 1	4	marker4	38.71	12398322
- 1	5	marker5	44.62	14171692
- 1	6	marker6	48.73	16529505
-
-QTL Cartographer Z file, each row correspond to a 2cM interval (the example shows part of the important part of the Z file, the first 3 data columns of the first block). Note: Headers is fine and will be ignored::
-
- 1	1	0.0001	…
- 1	1	0.0201	…
- 1	1	0.0401	…
- 1	1	0.0601	…
- 1	2	0.0754	…
- 1	2	0.0954	…
- 1	2	0.1154	…
- 1	2	0.1354	…
- 1	2	0.1554	…
- 1	2	0.1754	…
- 1	3	0.1878	…
- 1	3	0.2078	…
- 1	3	0.2278	…
- 1	3	0.2478	…
-
-Chromosome length file, each row correspond to a chromosome (2 columns; only a part of the file is shown)::
-
- chr1	301354135
- chr2	237068873
- chr3	232140174
- chr4	241473504
- chr5	217872852
- chr6	169174353
-
--------
-
-**Example output files**
-
-
-Lookup output file, each row correspond to a 2 cM interval (7 columns; only a part of the file is shown)::
-
- int.id	chr	marker	int	cM	bp	length_cM
- 1	1	1	0.0001	0.0	2038278	2.0
- 2	1	1	0.0201	2.0	2466324	2.0
- 3	1	1	0.0401	4.0	2894370	2.0
- 4	1	1	0.0601	6.0	3322416	1.53
- 5	1	2	0.0754	7.53	3649871	2.0
- 6	1	2	0.0954	9.53	4095673	2.0
- 7	1	2	0.1154	11.53	4541476	2.0
- 8	1	2	0.1354	13.53	4987278	2.0
-
-Chromosome summary file, each row correspond to a chromosome (6 columns; only a part of the file is shown). The last row gives the total across the genome::
-
- chr	markers	cM	bp	int_positions	bins
- 1	27	324.4	301354135	177	176
- 2	14	169.11	237068873	92	91
- 3	19	221.29	232140174	123	122
- 4	20	188.37	241473504	105	104
- 5	20	203.82	217872852	110	109
- 6	17	195.85	169174353	106	105
- Total 	117	1302.84	1399083891	713	707
-
-        </help>
-</tool>
Binary file lookup/._lookup.py.gz has changed
Binary file lookup/._lookup.xml.gz has changed
Binary file lookup/lookup.py.gz has changed
Binary file lookup/lookup.xml.gz has changed
Binary file lookup/test-data/._chr_length.txt.gz has changed
Binary file lookup/test-data/._markers_filled_in.txt.gz has changed
Binary file lookup/test-data/._qtlcart.z.filepart.txt.gz has changed
Binary file lookup/test-data/chr_length.txt.gz has changed
Binary file lookup/test-data/markers_filled_in.txt.gz has changed
Binary file lookup/test-data/qtlcart.z.filepart.txt.gz has changed