Mercurial > repos > devteam > flanking_features
comparison flanking_features.py @ 1:850c05b9af00 draft
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
author | devteam |
---|---|
date | Tue, 13 Oct 2015 12:50:14 -0400 |
parents | e928e029f6eb |
children | 94248d5b9b8b |
comparison
equal
deleted
inserted
replaced
0:e928e029f6eb | 1:850c05b9af00 |
---|---|
8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file | 8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file |
9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval | 9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval |
10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval | 10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval |
11 """ | 11 """ |
12 | 12 |
13 import sys, traceback, fileinput | 13 import fileinput |
14 from warnings import warn | 14 import sys |
15 from bx.cookbook import doc_optparse | 15 from bx.cookbook import doc_optparse |
16 from galaxy.tools.util.galaxyops import * | 16 from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper |
17 from bx.intervals.io import * | |
18 from bx.intervals.operations import quicksect | 17 from bx.intervals.operations import quicksect |
19 from utils.gff_util import * | 18 from bx.tabular.io import ParseError |
19 from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped | |
20 from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper | |
20 | 21 |
21 assert sys.version_info[:2] >= ( 2, 4 ) | 22 assert sys.version_info[:2] >= ( 2, 4 ) |
22 | 23 |
23 def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): | 24 |
25 def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down): | |
24 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases | 26 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases |
25 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand | 27 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand |
26 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand | 28 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand |
27 if direction == 1: | 29 if direction == 1: |
28 if node.maxend <= threshold_up: | 30 if node.maxend <= threshold_up: |
29 if node.end == node.maxend: | 31 if node.end == node.maxend: |
30 report_func_up(node) | 32 report_func_up(node) |
31 elif node.right and node.left: | 33 elif node.right and node.left: |
32 if node.right.maxend == node.maxend: | 34 if node.right.maxend == node.maxend: |
58 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | 60 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
59 else: | 61 else: |
60 if node.right: | 62 if node.right: |
61 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | 63 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
62 | 64 |
65 | |
63 def proximal_region_finder(readers, region, comments=True): | 66 def proximal_region_finder(readers, region, comments=True): |
64 """ | 67 """ |
65 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. | 68 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. |
66 Intervals are GenomicInterval objects. | 69 Intervals are GenomicInterval objects. |
67 """ | 70 """ |
68 primary = readers[0] | 71 primary = readers[0] |
69 features = readers[1] | 72 features = readers[1] |
70 either = False | 73 either = False |
71 if region == 'Upstream': | 74 if region == 'Upstream': |
74 up, down = False, True | 77 up, down = False, True |
75 else: | 78 else: |
76 up, down = True, True | 79 up, down = True, True |
77 if region == 'Either': | 80 if region == 'Either': |
78 either = True | 81 either = True |
79 | 82 |
80 # Read features into memory: | 83 # Read features into memory: |
81 rightTree = quicksect.IntervalTree() | 84 rightTree = quicksect.IntervalTree() |
82 for item in features: | 85 for item in features: |
83 if type( item ) is GenomicInterval: | 86 if type( item ) is GenomicInterval: |
84 rightTree.insert( item, features.linenum, item ) | 87 rightTree.insert( item, features.linenum, item ) |
85 | 88 |
86 for interval in primary: | 89 for interval in primary: |
87 if type( interval ) is Header: | 90 if type( interval ) is Header: |
88 yield interval | 91 yield interval |
89 if type( interval ) is Comment and comments: | 92 if type( interval ) is Comment and comments: |
90 yield interval | 93 yield interval |
94 end = int(interval.end) | 97 end = int(interval.end) |
95 strand = interval.strand | 98 strand = interval.strand |
96 if chrom not in rightTree.chroms: | 99 if chrom not in rightTree.chroms: |
97 continue | 100 continue |
98 else: | 101 else: |
99 root = rightTree.chroms[chrom] #root node for the chrom tree | 102 root = rightTree.chroms[chrom] # root node for the chrom tree |
100 result_up = [] | 103 result_up = [] |
101 result_down = [] | 104 result_down = [] |
102 if (strand == '+' and up) or (strand == '-' and down): | 105 if (strand == '+' and up) or (strand == '-' and down): |
103 #upstream +ve strand and downstream -ve strand cases | 106 #upstream +ve strand and downstream -ve strand cases |
104 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) | 107 get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None) |
105 | 108 |
106 if (strand == '+' and down) or (strand == '-' and up): | 109 if (strand == '+' and down) or (strand == '-' and up): |
107 #downstream +ve strand and upstream -ve strand case | 110 #downstream +ve strand and upstream -ve strand case |
108 get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) | 111 get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node )) |
109 | 112 |
110 if result_up: | 113 if result_up: |
111 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. | 114 if len(result_up) > 1: # The results_up list has a list of intervals upstream to the given interval. |
112 ends = [] | 115 ends = [] |
113 for n in result_up: | 116 for n in result_up: |
114 ends.append(n.end) | 117 ends.append(n.end) |
115 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list | 118 res_ind = ends.index(max(ends)) # fetch the index of the closest interval i.e. the interval with the max end from the results_up list |
116 else: | 119 else: |
117 res_ind = 0 | 120 res_ind = 0 |
118 if not(either): | 121 if not(either): |
119 yield [ interval, result_up[res_ind].other ] | 122 yield [ interval, result_up[res_ind].other ] |
120 | 123 |
121 if result_down: | 124 if result_down: |
122 if not(either): | 125 if not(either): |
123 #The last element of result_down will be the closest element to the given interval | 126 #The last element of result_down will be the closest element to the given interval |
124 yield [ interval, result_down[-1].other ] | 127 yield [ interval, result_down[-1].other ] |
125 | 128 |
126 if either and (result_up or result_down): | 129 if either and (result_up or result_down): |
127 iter_val = [] | 130 iter_val = [] |
128 if result_up and result_down: | 131 if result_up and result_down: |
129 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): | 132 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): |
130 iter_val = [ interval, result_up[res_ind].other ] | 133 iter_val = [ interval, result_up[res_ind].other ] |
135 iter_val = [ interval, result_up[res_ind].other ] | 138 iter_val = [ interval, result_up[res_ind].other ] |
136 elif result_down: | 139 elif result_down: |
137 #The last element of result_down will be the closest element to the given interval | 140 #The last element of result_down will be the closest element to the given interval |
138 iter_val = [ interval, result_down[-1].other ] | 141 iter_val = [ interval, result_down[-1].other ] |
139 yield iter_val | 142 yield iter_val |
140 | 143 |
144 | |
141 def main(): | 145 def main(): |
142 options, args = doc_optparse.parse( __doc__ ) | 146 options, args = doc_optparse.parse( __doc__ ) |
143 try: | 147 try: |
144 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) | 148 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
145 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) | 149 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) |
146 in1_gff_format = bool( options.gff1 ) | 150 in1_gff_format = bool( options.gff1 ) |
147 in2_gff_format = bool( options.gff2 ) | 151 in2_gff_format = bool( options.gff2 ) |
148 in_fname, in2_fname, out_fname, direction = args | 152 in_fname, in2_fname, out_fname, direction = args |
149 except: | 153 except: |
150 doc_optparse.exception() | 154 doc_optparse.exception() |
151 | 155 |
152 # Set readers to handle either GFF or default format. | 156 # Set readers to handle either GFF or default format. |
153 if in1_gff_format: | 157 if in1_gff_format: |
154 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper | 158 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper |
155 else: | 159 else: |
156 in1_reader_wrapper = NiceReaderWrapper | 160 in1_reader_wrapper = NiceReaderWrapper |
158 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper | 162 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper |
159 else: | 163 else: |
160 in2_reader_wrapper = NiceReaderWrapper | 164 in2_reader_wrapper = NiceReaderWrapper |
161 | 165 |
162 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), | 166 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), |
163 chrom_col=chr_col_1, | 167 chrom_col=chr_col_1, |
164 start_col=start_col_1, | 168 start_col=start_col_1, |
165 end_col=end_col_1, | 169 end_col=end_col_1, |
166 strand_col=strand_col_1, | 170 strand_col=strand_col_1, |
167 fix_strand=True ) | 171 fix_strand=True ) |
168 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), | 172 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), |
169 chrom_col=chr_col_2, | 173 chrom_col=chr_col_2, |
170 start_col=start_col_2, | 174 start_col=start_col_2, |
171 end_col=end_col_2, | 175 end_col=end_col_2, |
172 strand_col=strand_col_2, | 176 strand_col=strand_col_2, |
173 fix_strand=True ) | 177 fix_strand=True ) |
174 | 178 |
175 # Find flanking features. | 179 # Find flanking features. |
176 out_file = open( out_fname, "w" ) | 180 out_file = open( out_fname, "w" ) |
177 try: | 181 try: |
178 for result in proximal_region_finder([g1,g2], direction): | 182 for result in proximal_region_finder([g1, g2], direction): |
179 if type( result ) is list: | 183 if type( result ) is list: |
180 line, closest_feature = result | 184 line, closest_feature = result |
181 # Need to join outputs differently depending on file types. | 185 # Need to join outputs differently depending on file types. |
182 if in1_gff_format: | 186 if in1_gff_format: |
183 # Output is GFF with added attribute 'closest feature.' | 187 # Output is GFF with added attribute 'closest feature.' |
184 | 188 |
185 # Invervals are in BED coordinates; need to convert to GFF. | 189 # Invervals are in BED coordinates; need to convert to GFF. |
186 line = convert_bed_coords_to_gff( line ) | 190 line = convert_bed_coords_to_gff( line ) |
187 closest_feature = convert_bed_coords_to_gff( closest_feature ) | 191 closest_feature = convert_bed_coords_to_gff( closest_feature ) |
188 | 192 |
189 # Replace double quotes with single quotes in closest feature's attributes. | 193 # Replace double quotes with single quotes in closest feature's attributes. |
190 out_file.write( "%s closest_feature \"%s\" \n" % | 194 out_file.write( "%s closest_feature \"%s\" \n" % |
191 ( "\t".join( line.fields ), \ | 195 ( "\t".join( line.fields ), |
192 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) | 196 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) |
193 ) ) | 197 ) ) |
194 else: | 198 else: |
195 # Output is BED + closest feature fields. | 199 # Output is BED + closest feature fields. |
196 output_line_fields = [] | 200 output_line_fields = [] |
197 output_line_fields.extend( line.fields ) | 201 output_line_fields.extend( line.fields ) |
198 output_line_fields.extend( closest_feature.fields ) | 202 output_line_fields.extend( closest_feature.fields ) |
200 else: | 204 else: |
201 out_file.write( "%s\n" % result ) | 205 out_file.write( "%s\n" % result ) |
202 except ParseError, exc: | 206 except ParseError, exc: |
203 fail( "Invalid file format: %s" % str( exc ) ) | 207 fail( "Invalid file format: %s" % str( exc ) ) |
204 | 208 |
205 print "Direction: %s" %(direction) | 209 print "Direction: %s" % (direction) |
206 if g1.skipped > 0: | 210 if g1.skipped > 0: |
207 print skipped( g1, filedesc=" of 1st dataset" ) | 211 print skipped( g1, filedesc=" of 1st dataset" ) |
208 if g2.skipped > 0: | 212 if g2.skipped > 0: |
209 print skipped( g2, filedesc=" of 2nd dataset" ) | 213 print skipped( g2, filedesc=" of 2nd dataset" ) |
210 | 214 |