| 0 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 Find regions of first interval file that overlap regions in a second interval file. | 
|  | 4 Interval files can either be BED or GFF format. | 
|  | 5 | 
|  | 6 usage: %prog interval_file_1 interval_file_2 out_file | 
|  | 7     -1, --cols1=N,N,N,N: Columns for start, end, strand in first file | 
|  | 8     -2, --cols2=N,N,N,N: Columns for start, end, strand in second file | 
|  | 9     -m, --mincols=N: Require this much overlap (default 1bp) | 
|  | 10     -p, --pieces: just print pieces of second set (after padding) | 
|  | 11     -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval | 
|  | 12     -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval | 
|  | 13 """ | 
|  | 14 import sys, traceback, fileinput | 
|  | 15 from warnings import warn | 
|  | 16 from bx.intervals import * | 
|  | 17 from bx.intervals.io import * | 
|  | 18 from bx.intervals.operations.intersect import * | 
|  | 19 from bx.cookbook import doc_optparse | 
|  | 20 from galaxy.tools.util.galaxyops import * | 
|  | 21 from utils.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff | 
|  | 22 | 
|  | 23 assert sys.version_info[:2] >= ( 2, 4 ) | 
|  | 24 | 
|  | 25 def main(): | 
|  | 26     mincols = 1 | 
|  | 27     upstream_pad = 0 | 
|  | 28     downstream_pad = 0 | 
|  | 29 | 
|  | 30     options, args = doc_optparse.parse( __doc__ ) | 
|  | 31     try: | 
|  | 32         chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) | 
|  | 33         chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) | 
|  | 34         if options.mincols: mincols = int( options.mincols ) | 
|  | 35         pieces = bool( options.pieces ) | 
|  | 36         in1_gff_format = bool( options.gff1 ) | 
|  | 37         in2_gff_format = bool( options.gff2 ) | 
|  | 38         in_fname, in2_fname, out_fname = args | 
|  | 39     except: | 
|  | 40         doc_optparse.exception() | 
|  | 41 | 
|  | 42     # Set readers to handle either GFF or default format. | 
|  | 43     if in1_gff_format: | 
|  | 44         in1_reader_wrapper = GFFReaderWrapper | 
|  | 45     else: | 
|  | 46         in1_reader_wrapper = NiceReaderWrapper | 
|  | 47     if in2_gff_format: | 
|  | 48         in2_reader_wrapper = GFFReaderWrapper | 
|  | 49     else: | 
|  | 50         in2_reader_wrapper = NiceReaderWrapper | 
|  | 51 | 
|  | 52     g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), | 
|  | 53                             chrom_col=chr_col_1, | 
|  | 54                             start_col=start_col_1, | 
|  | 55                             end_col=end_col_1, | 
|  | 56                             strand_col=strand_col_1, | 
|  | 57                             fix_strand=True ) | 
|  | 58     if in1_gff_format: | 
|  | 59         # Intersect requires coordinates in BED format. | 
|  | 60         g1.convert_to_bed_coord=True | 
|  | 61     g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), | 
|  | 62                             chrom_col=chr_col_2, | 
|  | 63                             start_col=start_col_2, | 
|  | 64                             end_col=end_col_2, | 
|  | 65                             strand_col=strand_col_2, | 
|  | 66                             fix_strand=True ) | 
|  | 67     if in2_gff_format: | 
|  | 68         # Intersect requires coordinates in BED format. | 
|  | 69         g2.convert_to_bed_coord=True | 
|  | 70 | 
|  | 71     out_file = open( out_fname, "w" ) | 
|  | 72     try: | 
|  | 73         for feature in intersect( [g1,g2], pieces=pieces, mincols=mincols ): | 
|  | 74             if isinstance( feature, GFFFeature ): | 
|  | 75                 # Convert back to GFF coordinates since reader converted automatically. | 
|  | 76                 convert_bed_coords_to_gff( feature ) | 
|  | 77                 for interval in feature.intervals: | 
|  | 78                     out_file.write( "%s\n" % "\t".join( interval.fields ) ) | 
|  | 79             elif isinstance( feature, GenomicInterval ): | 
|  | 80                 out_file.write( "%s\n" % "\t".join( feature.fields ) ) | 
|  | 81             else: | 
|  | 82                 out_file.write( "%s\n" % feature ) | 
|  | 83     except ParseError, e: | 
|  | 84         out_file.close() | 
|  | 85         fail( "Invalid file format: %s" % str( e ) ) | 
|  | 86 | 
|  | 87     out_file.close() | 
|  | 88 | 
|  | 89     if g1.skipped > 0: | 
|  | 90         print skipped( g1, filedesc=" of 1st dataset" ) | 
|  | 91     if g2.skipped > 0: | 
|  | 92         print skipped( g2, filedesc=" of 2nd dataset" ) | 
|  | 93 | 
|  | 94 if __name__ == "__main__": | 
|  | 95     main() |