comparison flanking_features.py @ 1:850c05b9af00 draft

planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
author devteam
date Tue, 13 Oct 2015 12:50:14 -0400
parents e928e029f6eb
children 94248d5b9b8b
comparison
equal deleted inserted replaced
0:e928e029f6eb 1:850c05b9af00
8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file 8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval 9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval 10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
11 """ 11 """
12 12
13 import sys, traceback, fileinput 13 import fileinput
14 from warnings import warn 14 import sys
15 from bx.cookbook import doc_optparse 15 from bx.cookbook import doc_optparse
16 from galaxy.tools.util.galaxyops import * 16 from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper
17 from bx.intervals.io import *
18 from bx.intervals.operations import quicksect 17 from bx.intervals.operations import quicksect
19 from utils.gff_util import * 18 from bx.tabular.io import ParseError
19 from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped
20 from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper
20 21
21 assert sys.version_info[:2] >= ( 2, 4 ) 22 assert sys.version_info[:2] >= ( 2, 4 )
22 23
23 def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): 24
25 def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
24 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases 26 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
25 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand 27 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
26 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand 28 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
27 if direction == 1: 29 if direction == 1:
28 if node.maxend <= threshold_up: 30 if node.maxend <= threshold_up:
29 if node.end == node.maxend: 31 if node.end == node.maxend:
30 report_func_up(node) 32 report_func_up(node)
31 elif node.right and node.left: 33 elif node.right and node.left:
32 if node.right.maxend == node.maxend: 34 if node.right.maxend == node.maxend:
58 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) 60 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
59 else: 61 else:
60 if node.right: 62 if node.right:
61 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) 63 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
62 64
65
63 def proximal_region_finder(readers, region, comments=True): 66 def proximal_region_finder(readers, region, comments=True):
64 """ 67 """
65 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. 68 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
66 Intervals are GenomicInterval objects. 69 Intervals are GenomicInterval objects.
67 """ 70 """
68 primary = readers[0] 71 primary = readers[0]
69 features = readers[1] 72 features = readers[1]
70 either = False 73 either = False
71 if region == 'Upstream': 74 if region == 'Upstream':
74 up, down = False, True 77 up, down = False, True
75 else: 78 else:
76 up, down = True, True 79 up, down = True, True
77 if region == 'Either': 80 if region == 'Either':
78 either = True 81 either = True
79 82
80 # Read features into memory: 83 # Read features into memory:
81 rightTree = quicksect.IntervalTree() 84 rightTree = quicksect.IntervalTree()
82 for item in features: 85 for item in features:
83 if type( item ) is GenomicInterval: 86 if type( item ) is GenomicInterval:
84 rightTree.insert( item, features.linenum, item ) 87 rightTree.insert( item, features.linenum, item )
85 88
86 for interval in primary: 89 for interval in primary:
87 if type( interval ) is Header: 90 if type( interval ) is Header:
88 yield interval 91 yield interval
89 if type( interval ) is Comment and comments: 92 if type( interval ) is Comment and comments:
90 yield interval 93 yield interval
94 end = int(interval.end) 97 end = int(interval.end)
95 strand = interval.strand 98 strand = interval.strand
96 if chrom not in rightTree.chroms: 99 if chrom not in rightTree.chroms:
97 continue 100 continue
98 else: 101 else:
99 root = rightTree.chroms[chrom] #root node for the chrom tree 102 root = rightTree.chroms[chrom] # root node for the chrom tree
100 result_up = [] 103 result_up = []
101 result_down = [] 104 result_down = []
102 if (strand == '+' and up) or (strand == '-' and down): 105 if (strand == '+' and up) or (strand == '-' and down):
103 #upstream +ve strand and downstream -ve strand cases 106 #upstream +ve strand and downstream -ve strand cases
104 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) 107 get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None)
105 108
106 if (strand == '+' and down) or (strand == '-' and up): 109 if (strand == '+' and down) or (strand == '-' and up):
107 #downstream +ve strand and upstream -ve strand case 110 #downstream +ve strand and upstream -ve strand case
108 get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) 111 get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node ))
109 112
110 if result_up: 113 if result_up:
111 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. 114 if len(result_up) > 1: # The results_up list has a list of intervals upstream to the given interval.
112 ends = [] 115 ends = []
113 for n in result_up: 116 for n in result_up:
114 ends.append(n.end) 117 ends.append(n.end)
115 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list 118 res_ind = ends.index(max(ends)) # fetch the index of the closest interval i.e. the interval with the max end from the results_up list
116 else: 119 else:
117 res_ind = 0 120 res_ind = 0
118 if not(either): 121 if not(either):
119 yield [ interval, result_up[res_ind].other ] 122 yield [ interval, result_up[res_ind].other ]
120 123
121 if result_down: 124 if result_down:
122 if not(either): 125 if not(either):
123 #The last element of result_down will be the closest element to the given interval 126 #The last element of result_down will be the closest element to the given interval
124 yield [ interval, result_down[-1].other ] 127 yield [ interval, result_down[-1].other ]
125 128
126 if either and (result_up or result_down): 129 if either and (result_up or result_down):
127 iter_val = [] 130 iter_val = []
128 if result_up and result_down: 131 if result_up and result_down:
129 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): 132 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
130 iter_val = [ interval, result_up[res_ind].other ] 133 iter_val = [ interval, result_up[res_ind].other ]
135 iter_val = [ interval, result_up[res_ind].other ] 138 iter_val = [ interval, result_up[res_ind].other ]
136 elif result_down: 139 elif result_down:
137 #The last element of result_down will be the closest element to the given interval 140 #The last element of result_down will be the closest element to the given interval
138 iter_val = [ interval, result_down[-1].other ] 141 iter_val = [ interval, result_down[-1].other ]
139 yield iter_val 142 yield iter_val
140 143
144
141 def main(): 145 def main():
142 options, args = doc_optparse.parse( __doc__ ) 146 options, args = doc_optparse.parse( __doc__ )
143 try: 147 try:
144 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) 148 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
145 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) 149 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
146 in1_gff_format = bool( options.gff1 ) 150 in1_gff_format = bool( options.gff1 )
147 in2_gff_format = bool( options.gff2 ) 151 in2_gff_format = bool( options.gff2 )
148 in_fname, in2_fname, out_fname, direction = args 152 in_fname, in2_fname, out_fname, direction = args
149 except: 153 except:
150 doc_optparse.exception() 154 doc_optparse.exception()
151 155
152 # Set readers to handle either GFF or default format. 156 # Set readers to handle either GFF or default format.
153 if in1_gff_format: 157 if in1_gff_format:
154 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper 158 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
155 else: 159 else:
156 in1_reader_wrapper = NiceReaderWrapper 160 in1_reader_wrapper = NiceReaderWrapper
158 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper 162 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
159 else: 163 else:
160 in2_reader_wrapper = NiceReaderWrapper 164 in2_reader_wrapper = NiceReaderWrapper
161 165
162 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), 166 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
163 chrom_col=chr_col_1, 167 chrom_col=chr_col_1,
164 start_col=start_col_1, 168 start_col=start_col_1,
165 end_col=end_col_1, 169 end_col=end_col_1,
166 strand_col=strand_col_1, 170 strand_col=strand_col_1,
167 fix_strand=True ) 171 fix_strand=True )
168 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), 172 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
169 chrom_col=chr_col_2, 173 chrom_col=chr_col_2,
170 start_col=start_col_2, 174 start_col=start_col_2,
171 end_col=end_col_2, 175 end_col=end_col_2,
172 strand_col=strand_col_2, 176 strand_col=strand_col_2,
173 fix_strand=True ) 177 fix_strand=True )
174 178
175 # Find flanking features. 179 # Find flanking features.
176 out_file = open( out_fname, "w" ) 180 out_file = open( out_fname, "w" )
177 try: 181 try:
178 for result in proximal_region_finder([g1,g2], direction): 182 for result in proximal_region_finder([g1, g2], direction):
179 if type( result ) is list: 183 if type( result ) is list:
180 line, closest_feature = result 184 line, closest_feature = result
181 # Need to join outputs differently depending on file types. 185 # Need to join outputs differently depending on file types.
182 if in1_gff_format: 186 if in1_gff_format:
183 # Output is GFF with added attribute 'closest feature.' 187 # Output is GFF with added attribute 'closest feature.'
184 188
185 # Invervals are in BED coordinates; need to convert to GFF. 189 # Invervals are in BED coordinates; need to convert to GFF.
186 line = convert_bed_coords_to_gff( line ) 190 line = convert_bed_coords_to_gff( line )
187 closest_feature = convert_bed_coords_to_gff( closest_feature ) 191 closest_feature = convert_bed_coords_to_gff( closest_feature )
188 192
189 # Replace double quotes with single quotes in closest feature's attributes. 193 # Replace double quotes with single quotes in closest feature's attributes.
190 out_file.write( "%s closest_feature \"%s\" \n" % 194 out_file.write( "%s closest_feature \"%s\" \n" %
191 ( "\t".join( line.fields ), \ 195 ( "\t".join( line.fields ),
192 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) 196 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
193 ) ) 197 ) )
194 else: 198 else:
195 # Output is BED + closest feature fields. 199 # Output is BED + closest feature fields.
196 output_line_fields = [] 200 output_line_fields = []
197 output_line_fields.extend( line.fields ) 201 output_line_fields.extend( line.fields )
198 output_line_fields.extend( closest_feature.fields ) 202 output_line_fields.extend( closest_feature.fields )
200 else: 204 else:
201 out_file.write( "%s\n" % result ) 205 out_file.write( "%s\n" % result )
202 except ParseError, exc: 206 except ParseError, exc:
203 fail( "Invalid file format: %s" % str( exc ) ) 207 fail( "Invalid file format: %s" % str( exc ) )
204 208
205 print "Direction: %s" %(direction) 209 print "Direction: %s" % (direction)
206 if g1.skipped > 0: 210 if g1.skipped > 0:
207 print skipped( g1, filedesc=" of 1st dataset" ) 211 print skipped( g1, filedesc=" of 1st dataset" )
208 if g2.skipped > 0: 212 if g2.skipped > 0:
209 print skipped( g2, filedesc=" of 2nd dataset" ) 213 print skipped( g2, filedesc=" of 2nd dataset" )
210 214