Mercurial > repos > rico > testing_again
comparison select_snps.py @ 0:4d28d3295ac3 default tip
Uploaded
| author | rico |
|---|---|
| date | Fri, 06 Apr 2012 13:46:42 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4d28d3295ac3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import os | |
| 4 import sys | |
| 5 import math | |
| 6 from optparse import OptionParser | |
| 7 import genome_diversity as gd | |
| 8 | |
| 9 def main_function(parse_arguments=None): | |
| 10 if parse_arguments is None: | |
| 11 parse_arguments = lambda arguments: (None, arguments) | |
| 12 def main_decorator(to_decorate): | |
| 13 def decorated_main(arguments=None): | |
| 14 if arguments is None: | |
| 15 arguments = sys.argv | |
| 16 options, arguments = parse_arguments(arguments) | |
| 17 sys.exit(to_decorate(options, arguments)) | |
| 18 return decorated_main | |
| 19 return main_decorator | |
| 20 | |
| 21 def parse_arguments(arguments): | |
| 22 parser = OptionParser() | |
| 23 parser.add_option('--input', dest='input') | |
| 24 parser.add_option('--output', dest='output') | |
| 25 parser.add_option('--index_dir', dest='index_dir') | |
| 26 parser.add_option('--num_snps', dest='num_snps') | |
| 27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col') | |
| 28 parser.add_option('--ref_pos_col', dest='ref_pos_col') | |
| 29 parser.add_option('--ref_species', dest='ref_species') | |
| 30 return parser.parse_args(arguments[1:]) | |
| 31 | |
| 32 @main_function(parse_arguments) | |
| 33 def main(options, arguments): | |
| 34 | |
| 35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1 | |
| 36 ref_pos_idx = to_int( options.ref_pos_col ) -1 | |
| 37 | |
| 38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx): | |
| 39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset." | |
| 40 sys.exit(1) | |
| 41 | |
| 42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom') | |
| 43 chrom_len_file = '%s.len' % options.ref_species | |
| 44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file) | |
| 45 | |
| 46 chrlens = gd.ChrLens( chrom_len_path ) | |
| 47 | |
| 48 total_len = 0 | |
| 49 for chrom in chrlens: | |
| 50 total_len += chrlens.length(chrom) | |
| 51 | |
| 52 total_requested = int( options.num_snps ) | |
| 53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx ) | |
| 54 selected = select_snps( data, total_len, total_requested ) | |
| 55 out_data = fix_selection_and_order_like_input(data, selected, total_requested) | |
| 56 write_selected_snps( options.output, out_data, lines, comments ) | |
| 57 | |
| 58 def to_int( value ): | |
| 59 try: | |
| 60 int_value = int( value ) | |
| 61 except ValueError: | |
| 62 int_value = 0 | |
| 63 return int_value | |
| 64 | |
| 65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ): | |
| 66 fh = open( filename, 'r' ) | |
| 67 if (chrom_idx >= pos_idx): | |
| 68 needed = chrom_idx + 1 | |
| 69 else: | |
| 70 needed = pos_idx + 1 | |
| 71 lines = [] | |
| 72 data = [] | |
| 73 comments = [] | |
| 74 line_idx = 0 | |
| 75 line_num = 0 | |
| 76 for line in fh: | |
| 77 line_num += 1 | |
| 78 line = line.rstrip('\r\n') | |
| 79 if line: | |
| 80 if line.startswith('#'): | |
| 81 comments.append(line) | |
| 82 else: | |
| 83 elems = line.split('\t') | |
| 84 if len(elems) >= needed: | |
| 85 chrom = elems[chrom_idx] | |
| 86 try: | |
| 87 pos = int(elems[pos_idx]) | |
| 88 except ValueError: | |
| 89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) ) | |
| 90 sys.exit(1) | |
| 91 lines.append(line) | |
| 92 chrom_sort = chrom.lstrip('chr') | |
| 93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] ) | |
| 94 line_idx += 1 | |
| 95 fh.close() | |
| 96 data = sorted( data, key=lambda x: (x[0], x[2]) ) | |
| 97 return lines, data, comments | |
| 98 | |
| 99 def select_snps( data, total_len, requested ): | |
| 100 old_chrom = None | |
| 101 next_print = 0 | |
| 102 selected = [] | |
| 103 space = total_len / requested | |
| 104 for data_idx, datum in enumerate( data ): | |
| 105 chrom = datum[1] | |
| 106 pos = datum[2] | |
| 107 if chrom != old_chrom: | |
| 108 old_chrom = chrom | |
| 109 next_print = 0 | |
| 110 if pos >= next_print: | |
| 111 selected.append(data_idx) | |
| 112 next_print += space | |
| 113 return selected | |
| 114 | |
| 115 def fix_selection_and_order_like_input(data, selected, requested): | |
| 116 total_selected = len( selected ) | |
| 117 a = float( total_selected ) / requested | |
| 118 b = a / 2 | |
| 119 | |
| 120 idx_list = [] | |
| 121 for i in range( requested ): | |
| 122 idx = int( math.ceil( i * a + b ) - 1 ) | |
| 123 idx_list.append( idx ) | |
| 124 | |
| 125 out_data = [] | |
| 126 | |
| 127 for i, data_idx in enumerate(selected): | |
| 128 if total_selected > requested: | |
| 129 if i in idx_list: | |
| 130 out_data.append(data[data_idx]) | |
| 131 else: | |
| 132 out_data.append(data[data_idx]) | |
| 133 | |
| 134 out_data = sorted( out_data, key=lambda x: x[3] ) | |
| 135 | |
| 136 return out_data | |
| 137 | |
| 138 def write_selected_snps( filename, data, lines, comments ): | |
| 139 fh = open( filename, 'w' ) | |
| 140 | |
| 141 for comment in comments: | |
| 142 fh.write("%s\n" % comment ) | |
| 143 | |
| 144 for datum in data: | |
| 145 line_idx = datum[4] | |
| 146 fh.write("%s\n" % lines[line_idx]) | |
| 147 | |
| 148 fh.close() | |
| 149 | |
| 150 if __name__ == "__main__": | |
| 151 main() | |
| 152 | |
| 153 |
