0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 usage: %prog bed_file_1 bed_file_2 out_file
|
|
4 -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
|
|
5 -2, --cols2=N,N,N,N,N: Columns for chr, start, end, strand, name/value in second file
|
|
6 """
|
|
7
|
|
8 import collections
|
|
9 import sys
|
|
10 from galaxy.tools.util.galaxyops import *
|
|
11 from bx.cookbook import doc_optparse
|
|
12
|
|
13
|
|
14 #export PYTHONPATH=~/galaxy/lib/
|
|
15 #running command python WeightedAverage.py interval_interpolate.bed value_interpolate.bed interpolate_result.bed
|
|
16
|
|
17 def stop_err(msg):
|
|
18 sys.stderr.write(msg)
|
|
19 sys.exit()
|
|
20
|
|
21
|
|
22 def FindRate(chromosome, start_stop, dictType):
|
|
23 OverlapList = []
|
|
24 for tempO in dictType[chromosome]:
|
|
25 DatabaseInterval = [tempO[0], tempO[1]]
|
|
26 Overlap = GetOverlap( start_stop, DatabaseInterval )
|
|
27 if Overlap > 0:
|
|
28 OverlapList.append([Overlap, tempO[2]])
|
|
29
|
|
30 if len(OverlapList) > 0:
|
|
31 SumRecomb = 0
|
|
32 SumOverlap = 0
|
|
33 for member in OverlapList:
|
|
34 SumRecomb += member[0]*member[1]
|
|
35 SumOverlap += member[0]
|
|
36 averageRate = SumRecomb/SumOverlap
|
|
37 return averageRate
|
|
38 else:
|
|
39 return 'NA'
|
|
40
|
|
41
|
|
42 def GetOverlap(a, b):
|
|
43 return min(a[1], b[1])-max(a[0], b[0])
|
|
44
|
|
45
|
|
46 options, args = doc_optparse.parse( __doc__ )
|
|
47
|
|
48 try:
|
|
49 chr_col_1, start_col_1, end_col_1, strand_col1 = parse_cols_arg( options.cols1 )
|
|
50 chr_col_2, start_col_2, end_col_2, strand_col2, name_col_2 = parse_cols_arg( options.cols2 )
|
|
51 input1, input2, input3 = args
|
|
52 except Exception, eee:
|
|
53 print eee
|
|
54 stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
|
|
55
|
|
56 fd2 = open(input2)
|
|
57 lines2 = fd2.readlines()
|
|
58 RecombChrDict = collections.defaultdict(list)
|
|
59
|
|
60 skipped = 0
|
|
61 for line in lines2:
|
|
62 temp = line.strip().split()
|
|
63 try:
|
|
64 assert float(temp[int(name_col_2)])
|
|
65 except:
|
|
66 skipped += 1
|
|
67 continue
|
|
68 tempIndex = [int(temp[int(start_col_2)]), int(temp[int(end_col_2)]), float(temp[int(name_col_2)])]
|
|
69 RecombChrDict[temp[int(chr_col_2)]].append(tempIndex)
|
|
70
|
|
71 print "Skipped %d features with invalid values" % (skipped)
|
|
72
|
|
73 fd1 = open(input1)
|
|
74 lines = fd1.readlines()
|
|
75 finalProduct = ''
|
|
76 for line in lines:
|
|
77 temp = line.strip().split('\t')
|
|
78 chromosome = temp[int(chr_col_1)]
|
|
79 start = int(temp[int(start_col_1)])
|
|
80 stop = int(temp[int(end_col_1)])
|
|
81 start_stop = [start, stop]
|
|
82 RecombRate = FindRate( chromosome, start_stop, RecombChrDict )
|
|
83 try:
|
|
84 RecombRate = "%.4f" % (float(RecombRate))
|
|
85 except:
|
|
86 RecombRate = RecombRate
|
|
87 finalProduct += line.strip()+'\t'+str(RecombRate)+'\n'
|
|
88 fdd = open(input3, 'w')
|
|
89 fdd.writelines(finalProduct)
|
|
90 fdd.close()
|