Mercurial > repos > devteam > flanking_features
comparison flanking_features.py @ 0:e928e029f6eb draft
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 09:13:13 -0400 |
parents | |
children | 850c05b9af00 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e928e029f6eb |
---|---|
1 #!/usr/bin/env python | |
2 #By: Guruprasad Ananda | |
3 """ | |
4 Fetch closest up/downstream interval from features corresponding to every interval in primary | |
5 | |
6 usage: %prog primary_file features_file out_file direction | |
7 -1, --cols1=N,N,N,N: Columns for start, end, strand in first file | |
8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file | |
9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval | |
10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval | |
11 """ | |
12 | |
13 import sys, traceback, fileinput | |
14 from warnings import warn | |
15 from bx.cookbook import doc_optparse | |
16 from galaxy.tools.util.galaxyops import * | |
17 from bx.intervals.io import * | |
18 from bx.intervals.operations import quicksect | |
19 from utils.gff_util import * | |
20 | |
21 assert sys.version_info[:2] >= ( 2, 4 ) | |
22 | |
23 def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): | |
24 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases | |
25 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand | |
26 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand | |
27 if direction == 1: | |
28 if node.maxend <= threshold_up: | |
29 if node.end == node.maxend: | |
30 report_func_up(node) | |
31 elif node.right and node.left: | |
32 if node.right.maxend == node.maxend: | |
33 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
34 elif node.left.maxend == node.maxend: | |
35 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
36 elif node.right and node.right.maxend == node.maxend: | |
37 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
38 elif node.left and node.left.maxend == node.maxend: | |
39 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
40 elif node.minend <= threshold_up: | |
41 if node.end <= threshold_up: | |
42 report_func_up(node) | |
43 if node.left and node.right: | |
44 if node.right.minend <= threshold_up: | |
45 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
46 if node.left.minend <= threshold_up: | |
47 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
48 elif node.left: | |
49 if node.left.minend <= threshold_up: | |
50 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
51 elif node.right: | |
52 if node.right.minend <= threshold_up: | |
53 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
54 elif direction == 0: | |
55 if node.start > threshold_down: | |
56 report_func_down(node) | |
57 if node.left: | |
58 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
59 else: | |
60 if node.right: | |
61 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
62 | |
63 def proximal_region_finder(readers, region, comments=True): | |
64 """ | |
65 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. | |
66 Intervals are GenomicInterval objects. | |
67 """ | |
68 primary = readers[0] | |
69 features = readers[1] | |
70 either = False | |
71 if region == 'Upstream': | |
72 up, down = True, False | |
73 elif region == 'Downstream': | |
74 up, down = False, True | |
75 else: | |
76 up, down = True, True | |
77 if region == 'Either': | |
78 either = True | |
79 | |
80 # Read features into memory: | |
81 rightTree = quicksect.IntervalTree() | |
82 for item in features: | |
83 if type( item ) is GenomicInterval: | |
84 rightTree.insert( item, features.linenum, item ) | |
85 | |
86 for interval in primary: | |
87 if type( interval ) is Header: | |
88 yield interval | |
89 if type( interval ) is Comment and comments: | |
90 yield interval | |
91 elif type( interval ) == GenomicInterval: | |
92 chrom = interval.chrom | |
93 start = int(interval.start) | |
94 end = int(interval.end) | |
95 strand = interval.strand | |
96 if chrom not in rightTree.chroms: | |
97 continue | |
98 else: | |
99 root = rightTree.chroms[chrom] #root node for the chrom tree | |
100 result_up = [] | |
101 result_down = [] | |
102 if (strand == '+' and up) or (strand == '-' and down): | |
103 #upstream +ve strand and downstream -ve strand cases | |
104 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) | |
105 | |
106 if (strand == '+' and down) or (strand == '-' and up): | |
107 #downstream +ve strand and upstream -ve strand case | |
108 get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) | |
109 | |
110 if result_up: | |
111 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. | |
112 ends = [] | |
113 for n in result_up: | |
114 ends.append(n.end) | |
115 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list | |
116 else: | |
117 res_ind = 0 | |
118 if not(either): | |
119 yield [ interval, result_up[res_ind].other ] | |
120 | |
121 if result_down: | |
122 if not(either): | |
123 #The last element of result_down will be the closest element to the given interval | |
124 yield [ interval, result_down[-1].other ] | |
125 | |
126 if either and (result_up or result_down): | |
127 iter_val = [] | |
128 if result_up and result_down: | |
129 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): | |
130 iter_val = [ interval, result_up[res_ind].other ] | |
131 else: | |
132 #The last element of result_down will be the closest element to the given interval | |
133 iter_val = [ interval, result_down[-1].other ] | |
134 elif result_up: | |
135 iter_val = [ interval, result_up[res_ind].other ] | |
136 elif result_down: | |
137 #The last element of result_down will be the closest element to the given interval | |
138 iter_val = [ interval, result_down[-1].other ] | |
139 yield iter_val | |
140 | |
141 def main(): | |
142 options, args = doc_optparse.parse( __doc__ ) | |
143 try: | |
144 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) | |
145 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) | |
146 in1_gff_format = bool( options.gff1 ) | |
147 in2_gff_format = bool( options.gff2 ) | |
148 in_fname, in2_fname, out_fname, direction = args | |
149 except: | |
150 doc_optparse.exception() | |
151 | |
152 # Set readers to handle either GFF or default format. | |
153 if in1_gff_format: | |
154 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper | |
155 else: | |
156 in1_reader_wrapper = NiceReaderWrapper | |
157 if in2_gff_format: | |
158 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper | |
159 else: | |
160 in2_reader_wrapper = NiceReaderWrapper | |
161 | |
162 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), | |
163 chrom_col=chr_col_1, | |
164 start_col=start_col_1, | |
165 end_col=end_col_1, | |
166 strand_col=strand_col_1, | |
167 fix_strand=True ) | |
168 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), | |
169 chrom_col=chr_col_2, | |
170 start_col=start_col_2, | |
171 end_col=end_col_2, | |
172 strand_col=strand_col_2, | |
173 fix_strand=True ) | |
174 | |
175 # Find flanking features. | |
176 out_file = open( out_fname, "w" ) | |
177 try: | |
178 for result in proximal_region_finder([g1,g2], direction): | |
179 if type( result ) is list: | |
180 line, closest_feature = result | |
181 # Need to join outputs differently depending on file types. | |
182 if in1_gff_format: | |
183 # Output is GFF with added attribute 'closest feature.' | |
184 | |
185 # Invervals are in BED coordinates; need to convert to GFF. | |
186 line = convert_bed_coords_to_gff( line ) | |
187 closest_feature = convert_bed_coords_to_gff( closest_feature ) | |
188 | |
189 # Replace double quotes with single quotes in closest feature's attributes. | |
190 out_file.write( "%s closest_feature \"%s\" \n" % | |
191 ( "\t".join( line.fields ), \ | |
192 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) | |
193 ) ) | |
194 else: | |
195 # Output is BED + closest feature fields. | |
196 output_line_fields = [] | |
197 output_line_fields.extend( line.fields ) | |
198 output_line_fields.extend( closest_feature.fields ) | |
199 out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) ) | |
200 else: | |
201 out_file.write( "%s\n" % result ) | |
202 except ParseError, exc: | |
203 fail( "Invalid file format: %s" % str( exc ) ) | |
204 | |
205 print "Direction: %s" %(direction) | |
206 if g1.skipped > 0: | |
207 print skipped( g1, filedesc=" of 1st dataset" ) | |
208 if g2.skipped > 0: | |
209 print skipped( g2, filedesc=" of 2nd dataset" ) | |
210 | |
211 if __name__ == "__main__": | |
212 main() |