Mercurial > repos > devteam > flanking_features
annotate flanking_features.py @ 1:850c05b9af00 draft
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
| author | devteam |
|---|---|
| date | Tue, 13 Oct 2015 12:50:14 -0400 |
| parents | e928e029f6eb |
| children | 94248d5b9b8b |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 #By: Guruprasad Ananda | |
| 3 """ | |
| 4 Fetch closest up/downstream interval from features corresponding to every interval in primary | |
| 5 | |
| 6 usage: %prog primary_file features_file out_file direction | |
| 7 -1, --cols1=N,N,N,N: Columns for start, end, strand in first file | |
| 8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file | |
| 9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval | |
| 10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval | |
| 11 """ | |
| 12 | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
13 import fileinput |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
14 import sys |
| 0 | 15 from bx.cookbook import doc_optparse |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
16 from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper |
| 0 | 17 from bx.intervals.operations import quicksect |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
18 from bx.tabular.io import ParseError |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
19 from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
20 from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper |
| 0 | 21 |
| 22 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 23 | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
24 |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
25 def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down): |
| 0 | 26 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases |
| 27 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand | |
| 28 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
29 if direction == 1: |
| 0 | 30 if node.maxend <= threshold_up: |
| 31 if node.end == node.maxend: | |
| 32 report_func_up(node) | |
| 33 elif node.right and node.left: | |
| 34 if node.right.maxend == node.maxend: | |
| 35 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 36 elif node.left.maxend == node.maxend: | |
| 37 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 38 elif node.right and node.right.maxend == node.maxend: | |
| 39 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 40 elif node.left and node.left.maxend == node.maxend: | |
| 41 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 42 elif node.minend <= threshold_up: | |
| 43 if node.end <= threshold_up: | |
| 44 report_func_up(node) | |
| 45 if node.left and node.right: | |
| 46 if node.right.minend <= threshold_up: | |
| 47 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 48 if node.left.minend <= threshold_up: | |
| 49 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 50 elif node.left: | |
| 51 if node.left.minend <= threshold_up: | |
| 52 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 53 elif node.right: | |
| 54 if node.right.minend <= threshold_up: | |
| 55 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 56 elif direction == 0: | |
| 57 if node.start > threshold_down: | |
| 58 report_func_down(node) | |
| 59 if node.left: | |
| 60 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 61 else: | |
| 62 if node.right: | |
| 63 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | |
| 64 | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
65 |
| 0 | 66 def proximal_region_finder(readers, region, comments=True): |
| 67 """ | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
68 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
69 Intervals are GenomicInterval objects. |
| 0 | 70 """ |
| 71 primary = readers[0] | |
| 72 features = readers[1] | |
| 73 either = False | |
| 74 if region == 'Upstream': | |
| 75 up, down = True, False | |
| 76 elif region == 'Downstream': | |
| 77 up, down = False, True | |
| 78 else: | |
| 79 up, down = True, True | |
| 80 if region == 'Either': | |
| 81 either = True | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
82 |
| 0 | 83 # Read features into memory: |
| 84 rightTree = quicksect.IntervalTree() | |
| 85 for item in features: | |
| 86 if type( item ) is GenomicInterval: | |
| 87 rightTree.insert( item, features.linenum, item ) | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
88 |
| 0 | 89 for interval in primary: |
| 90 if type( interval ) is Header: | |
| 91 yield interval | |
| 92 if type( interval ) is Comment and comments: | |
| 93 yield interval | |
| 94 elif type( interval ) == GenomicInterval: | |
| 95 chrom = interval.chrom | |
| 96 start = int(interval.start) | |
| 97 end = int(interval.end) | |
| 98 strand = interval.strand | |
| 99 if chrom not in rightTree.chroms: | |
| 100 continue | |
| 101 else: | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
102 root = rightTree.chroms[chrom] # root node for the chrom tree |
| 0 | 103 result_up = [] |
| 104 result_down = [] | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
105 if (strand == '+' and up) or (strand == '-' and down): |
| 0 | 106 #upstream +ve strand and downstream -ve strand cases |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
107 get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None) |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
108 |
| 0 | 109 if (strand == '+' and down) or (strand == '-' and up): |
| 110 #downstream +ve strand and upstream -ve strand case | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
111 get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node )) |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
112 |
| 0 | 113 if result_up: |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
114 if len(result_up) > 1: # The results_up list has a list of intervals upstream to the given interval. |
| 0 | 115 ends = [] |
| 116 for n in result_up: | |
| 117 ends.append(n.end) | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
118 res_ind = ends.index(max(ends)) # fetch the index of the closest interval i.e. the interval with the max end from the results_up list |
| 0 | 119 else: |
| 120 res_ind = 0 | |
| 121 if not(either): | |
| 122 yield [ interval, result_up[res_ind].other ] | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
123 |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
124 if result_down: |
| 0 | 125 if not(either): |
| 126 #The last element of result_down will be the closest element to the given interval | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
127 yield [ interval, result_down[-1].other ] |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
128 |
| 0 | 129 if either and (result_up or result_down): |
| 130 iter_val = [] | |
| 131 if result_up and result_down: | |
| 132 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): | |
| 133 iter_val = [ interval, result_up[res_ind].other ] | |
| 134 else: | |
| 135 #The last element of result_down will be the closest element to the given interval | |
| 136 iter_val = [ interval, result_down[-1].other ] | |
| 137 elif result_up: | |
| 138 iter_val = [ interval, result_up[res_ind].other ] | |
| 139 elif result_down: | |
| 140 #The last element of result_down will be the closest element to the given interval | |
| 141 iter_val = [ interval, result_down[-1].other ] | |
| 142 yield iter_val | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
143 |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
144 |
| 0 | 145 def main(): |
| 146 options, args = doc_optparse.parse( __doc__ ) | |
| 147 try: | |
| 148 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) | |
| 149 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) | |
| 150 in1_gff_format = bool( options.gff1 ) | |
| 151 in2_gff_format = bool( options.gff2 ) | |
| 152 in_fname, in2_fname, out_fname, direction = args | |
| 153 except: | |
| 154 doc_optparse.exception() | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
155 |
| 0 | 156 # Set readers to handle either GFF or default format. |
| 157 if in1_gff_format: | |
| 158 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper | |
| 159 else: | |
| 160 in1_reader_wrapper = NiceReaderWrapper | |
| 161 if in2_gff_format: | |
| 162 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper | |
| 163 else: | |
| 164 in2_reader_wrapper = NiceReaderWrapper | |
| 165 | |
| 166 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
167 chrom_col=chr_col_1, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
168 start_col=start_col_1, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
169 end_col=end_col_1, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
170 strand_col=strand_col_1, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
171 fix_strand=True ) |
| 0 | 172 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
173 chrom_col=chr_col_2, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
174 start_col=start_col_2, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
175 end_col=end_col_2, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
176 strand_col=strand_col_2, |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
177 fix_strand=True ) |
| 0 | 178 |
| 179 # Find flanking features. | |
| 180 out_file = open( out_fname, "w" ) | |
| 181 try: | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
182 for result in proximal_region_finder([g1, g2], direction): |
| 0 | 183 if type( result ) is list: |
| 184 line, closest_feature = result | |
| 185 # Need to join outputs differently depending on file types. | |
| 186 if in1_gff_format: | |
| 187 # Output is GFF with added attribute 'closest feature.' | |
| 188 | |
| 189 # Invervals are in BED coordinates; need to convert to GFF. | |
| 190 line = convert_bed_coords_to_gff( line ) | |
| 191 closest_feature = convert_bed_coords_to_gff( closest_feature ) | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
192 |
| 0 | 193 # Replace double quotes with single quotes in closest feature's attributes. |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
194 out_file.write( "%s closest_feature \"%s\" \n" % |
|
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
195 ( "\t".join( line.fields ), |
| 0 | 196 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
197 ) ) |
| 0 | 198 else: |
| 199 # Output is BED + closest feature fields. | |
| 200 output_line_fields = [] | |
| 201 output_line_fields.extend( line.fields ) | |
| 202 output_line_fields.extend( closest_feature.fields ) | |
| 203 out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) ) | |
| 204 else: | |
| 205 out_file.write( "%s\n" % result ) | |
| 206 except ParseError, exc: | |
| 207 fail( "Invalid file format: %s" % str( exc ) ) | |
| 208 | |
|
1
850c05b9af00
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
209 print "Direction: %s" % (direction) |
| 0 | 210 if g1.skipped > 0: |
| 211 print skipped( g1, filedesc=" of 1st dataset" ) | |
| 212 if g2.skipped > 0: | |
| 213 print skipped( g2, filedesc=" of 2nd dataset" ) | |
| 214 | |
| 215 if __name__ == "__main__": | |
| 216 main() |
