Mercurial > repos > devteam > column_maker
changeset 0:4dadd821722f draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 28 Jul 2014 11:30:22 -0400 |
parents | |
children | d3f10c90fc96 |
files | column_maker.py column_maker.xml test-data/1.bed test-data/1.interval test-data/column_maker_out1.interval test-data/column_maker_out2.interval test-data/column_maker_out3.interval |
diffstat | 7 files changed, 353 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Append a computed column to a tab-delimited file.

This tool takes a tab-delimited text file as input and creates another column
in the file which is the result of a computation performed on every row in the
original file.  The tool will skip over invalid lines within the file,
informing the user about the number of lines skipped.

Command line (all positional):

    column_maker.py INPUT OUTPUT EXPR ROUND COLUMNS COLUMN_TYPES

    INPUT         path of the tab-delimited input file
    OUTPUT        path the result is written to
    EXPR          expression over c1, c2, ...; operators may arrive escaped
                  (``__lt__``, ``__ge__``, ``__sq__``, ...)
    ROUND         "yes" rounds each result to an integer; "no" makes columns
                  typed ``int`` be read as ``float``
    COLUMNS       number of columns in INPUT (must be at least 2)
    COLUMN_TYPES  comma-separated type name for each column, e.g. str,int,int
"""

import re
import sys
# These functions may be used in compute expression:
from math import ceil, exp, floor, log, sqrt

# ``with`` and ``except ... as`` require Python 2.6; the code also runs on 3.x.
assert sys.version_info[:2] >= (2, 6)

COLUMNS_ERROR = "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data."
COLUMN_TYPES_ERROR = "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data."
METADATA_MISMATCH_ERROR = "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data."

# Escaped token -> literal text.  Kept as an ordered tuple (not a dict) so the
# replacements are applied in the same order on every interpreter.
ESCAPED_TOKENS = (
    ('__lt__', '<'),
    ('__le__', '<='),
    ('__eq__', '=='),
    ('__ne__', '!='),
    ('__gt__', '>'),
    ('__ge__', '>='),
    ('__sq__', '\''),
    ('__dq__', '"'),
)

OPERATORS = 'is|not|or|and'
BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'
STRING_AND_LIST_METHODS = [name for name in dir('') + dir([]) if not name.startswith('_')]
# An expression may only be built from column references, numbers, a small set
# of punctuation, and the names listed above.
WHITELIST = re.compile(
    r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$"
    % (OPERATORS, BUILTIN_AND_MATH_FUNCTIONS, '|'.join(STRING_AND_LIST_METHODS))
)

# Per-row loop that is filled in with the column unpacking, the type casts and
# the user expression, then executed.  ``first_invalid_line == 0`` means "no
# invalid line recorded yet"; testing the line text instead would mis-handle a
# blank first invalid line, because '' is falsy.
ROW_LOOP_TEMPLATE = '''
for i, line in enumerate( inp ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if first_invalid_line == 0:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        %s
        %s
        new_val = %s
        if round_result == "yes":
            new_val = int( round( new_val ) )
        out.write( line + '\\t' + str( new_val ) + '\\n' )
        lines_kept += 1
    except Exception:
        skipped_lines += 1
        if first_invalid_line == 0:
            first_invalid_line = i + 1
            invalid_line = line
'''


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(msg)
    # A bare sys.exit() would report success (status 0) to the caller.
    sys.exit(1)


def _unescape(expr):
    """Return *expr* with every escaped token replaced by its literal text."""
    for token, literal in ESCAPED_TOKENS:
        expr = expr.replace(token, literal)
    return expr


def _build_row_loop(expr, round_result, column_types):
    """Return the source of the per-row loop for *expr*.

    Each column N becomes a variable ``cN`` that is cast with the type name
    given for it in *column_types*.
    """
    cols, type_casts = [], []
    for number, col_type in enumerate(column_types, 1):
        col_name = "c%d" % number
        cols.append(col_name)
        col_type = col_type.strip()
        if round_result == 'no' and col_type == 'int':
            col_type = 'float'
        # NOTE(security): col_type is interpolated into executed code without
        # validation; it is assumed to come from trusted dataset metadata.
        type_casts.append("%s(%s)" % (col_type, col_name))

    col_str = ', '.join(cols)               # 'c1, c2, c3, c4'
    type_cast_str = ', '.join(type_casts)   # 'str(c1), int(c2), int(c3), str(c4)'
    assign = "%s = line.split( '\\t' )" % col_str
    wrap = "%s = %s" % (col_str, type_cast_str)
    return ROW_LOOP_TEMPLATE % (assign, wrap, expr)


def main(argv=None):
    """Run the tool.

    argv -- full argument vector including the program name; defaults to
            ``sys.argv``.

    Writes the input rows plus the computed column to the output file and
    prints a summary to stdout.  Invalid metadata, a rejected or syntactically
    invalid expression, or an I/O error ends the process through stop_err().
    """
    if argv is None:
        argv = sys.argv

    inp_file = argv[1]
    out_file = argv[2]
    expr = argv[3]
    round_result = argv[4]
    try:
        in_columns = int(argv[5])
    except (IndexError, ValueError):
        stop_err(COLUMNS_ERROR)
    if in_columns < 2:
        # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
        stop_err(COLUMNS_ERROR)
    try:
        in_column_types = argv[6].split(',')
    except IndexError:
        stop_err(COLUMN_TYPES_ERROR)
    if len(in_column_types) != in_columns:
        stop_err(METADATA_MISMATCH_ERROR)

    # Unescape if input has been escaped
    expr = _unescape(expr)
    if not WHITELIST.match(expr):
        stop_err("Invalid expression")

    code = _build_row_loop(expr, round_result, in_column_types)

    # Everything the generated loop reads or updates lives in this namespace.
    namespace = {
        'ceil': ceil, 'exp': exp, 'floor': floor, 'log': log, 'sqrt': sqrt,
        'round_result': round_result,
        'skipped_lines': 0,
        'first_invalid_line': 0,
        'invalid_line': None,
        'lines_kept': 0,
        'total_lines': 0,
    }

    # Read input file, skipping invalid lines, and perform computation that will result in a new column
    try:
        with open(out_file, 'wt') as out:
            namespace['out'] = out
            compiled = compile(code, '<column_maker>', 'exec')
            with open(inp_file) as inp:
                namespace['inp'] = inp
                # NOTE(security): this executes a user-supplied expression.
                # The whitelist check above is the only safeguard; do not
                # loosen it without reviewing what becomes reachable.
                exec(compiled, namespace)
    except SyntaxError:
        stop_err('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr)
    except Exception as e:
        stop_err(str(e))

    total_lines = namespace['total_lines']
    skipped_lines = namespace['skipped_lines']
    lines_kept = namespace['lines_kept']

    print('Creating column %d with expression %s' % (in_columns + 1, expr))
    if lines_kept > 0:
        # Percentage is relative to all lines read, matching the count shown.
        print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / total_lines, total_lines))
    else:
        print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr)
    if skipped_lines > 0:
        print('Skipped %d invalid lines starting at line #%d: "%s"' % (
            skipped_lines, namespace['first_invalid_line'], namespace['invalid_line']))


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/column_maker.xml Mon Jul 28 11:30:22 2014 -0400 @@ -0,0 +1,83 @@ +<tool id="Add_a_column1" name="Compute" version="1.1.0"> + <description>an expression on every row</description> + <command interpreter="python"> + column_maker.py $input $out_file1 "$cond" $round ${input.metadata.columns} "${input.metadata.column_types}" + </command> + <inputs> + <param name="cond" size="40" type="text" value="c3-c2" label="Add expression"/> + <param format="tabular" name="input" type="data" label="as a new column to" help="Dataset missing? See TIP below"/> + <param name="round" type="select" label="Round result?"> + <option value="no">NO</option> + <option value="yes">YES</option> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="cond" value="c3-c2"/> + <param name="input" value="1.bed"/> + <param name="round" value="no"/> + <output name="out_file1" file="column_maker_out1.interval"/> + </test> + <test> + <param name="cond" value="c4*1"/> + <param name="input" value="1.interval"/> + <param name="round" value="no"/> + <output name="out_file1" file="column_maker_out2.interval"/> + </test> + <test> + <param name="cond" value="c4*1"/> + <param name="input" value="1.interval"/> + <param name="round" value="yes"/> + <output name="out_file1" file="column_maker_out3.interval"/> + </test> + </tests> + <help> + + .. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**What it does** + +This tool computes an expression for every row of a dataset and appends the result as a new column (field). + +- Columns are referenced with **c** and a **number**. 
For example, **c1** refers to the first column of a tab-delimited file + +- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position + +----- + +**Example** + +If this is your input:: + + chr1 151077881 151077918 2 200 - + chr1 151081985 151082078 3 500 + + +computing "c4*c5" will produce:: + + chr1 151077881 151077918 2 200 - 400.0 + chr1 151081985 151082078 3 500 + 1500.0 + +if, at the same time, "Round result?" is set to **YES** results will look like this:: + + chr1 151077881 151077918 2 200 - 400 + chr1 151081985 151082078 3 500 + 1500 + +You can also use this tool to evaluate expressions. For example, computing "c3>=c2" for Input will result in the following:: + + chr1 151077881 151077918 2 200 - True + chr1 151081985 151082078 3 500 + True + +or computing "type(c2)==type('')" for Input will return:: + + chr1 151077881 151077918 2 200 - False + chr1 151081985 151082078 3 500 + False + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.bed Mon Jul 28 11:30:22 2014 -0400 @@ -0,0 +1,65 @@ +chr1 147962192 147962580 CCDS989.1_cds_0_0_chr1_147962193_r 0 - +chr1 147984545 147984630 CCDS990.1_cds_0_0_chr1_147984546_f 0 + +chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 - +chr1 148185136 148185276 CCDS996.1_cds_0_0_chr1_148185137_f 0 + +chr10 55251623 55253124 CCDS7248.1_cds_0_0_chr10_55251624_r 0 - +chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 - +chr11 116206508 116206563 CCDS8377.1_cds_0_0_chr11_116206509_f 0 + +chr11 116211733 116212337 CCDS8378.1_cds_0_0_chr11_116211734_r 0 - +chr11 1812377 1812407 CCDS7726.1_cds_0_0_chr11_1812378_f 0 + +chr12 38440094 38440321 CCDS8736.1_cds_0_0_chr12_38440095_r 0 - +chr13 112381694 112381953 CCDS9526.1_cds_0_0_chr13_112381695_f 0 + +chr14 98710240 98712285 CCDS9949.1_cds_0_0_chr14_98710241_r 0 - +chr15 41486872 41487060 CCDS10096.1_cds_0_0_chr15_41486873_r 0 - +chr15 41673708 41673857 CCDS10097.1_cds_0_0_chr15_41673709_f 0 + +chr15 41679161 41679250 CCDS10098.1_cds_0_0_chr15_41679162_r 0 - +chr15 41826029 41826196 CCDS10101.1_cds_0_0_chr15_41826030_f 0 + +chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 + +chr16 179963 180135 CCDS10401.1_cds_0_0_chr16_179964_r 0 - +chr16 244413 244681 CCDS10402.1_cds_0_0_chr16_244414_f 0 + +chr16 259268 259383 CCDS10403.1_cds_0_0_chr16_259269_r 0 - +chr18 23786114 23786321 CCDS11891.1_cds_0_0_chr18_23786115_r 0 - +chr18 59406881 59407046 CCDS11985.1_cds_0_0_chr18_59406882_f 0 + +chr18 59455932 59456337 CCDS11986.1_cds_0_0_chr18_59455933_r 0 - +chr18 59600586 59600754 CCDS11988.1_cds_0_0_chr18_59600587_f 0 + +chr19 59068595 59069564 CCDS12866.1_cds_0_0_chr19_59068596_f 0 + +chr19 59236026 59236146 CCDS12872.1_cds_0_0_chr19_59236027_r 0 - +chr19 59297998 59298008 CCDS12877.1_cds_0_0_chr19_59297999_f 0 + +chr19 59302168 59302288 CCDS12878.1_cds_0_0_chr19_59302169_r 0 - +chr2 118288583 118288668 CCDS2120.1_cds_0_0_chr2_118288584_f 0 
+ +chr2 118394148 118394202 CCDS2121.1_cds_0_0_chr2_118394149_r 0 - +chr2 220190202 220190242 CCDS2441.1_cds_0_0_chr2_220190203_f 0 + +chr2 220229609 220230869 CCDS2443.1_cds_0_0_chr2_220229610_r 0 - +chr20 33330413 33330423 CCDS13249.1_cds_0_0_chr20_33330414_r 0 - +chr20 33513606 33513792 CCDS13255.1_cds_0_0_chr20_33513607_f 0 + +chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 - +chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 + +chr21 32707032 32707192 CCDS13614.1_cds_0_0_chr21_32707033_f 0 + +chr21 32869641 32870022 CCDS13615.1_cds_0_0_chr21_32869642_r 0 - +chr21 33321040 33322012 CCDS13620.1_cds_0_0_chr21_33321041_f 0 + +chr21 33744994 33745040 CCDS13625.1_cds_0_0_chr21_33744995_r 0 - +chr22 30120223 30120265 CCDS13897.1_cds_0_0_chr22_30120224_f 0 + +chr22 30160419 30160661 CCDS13898.1_cds_0_0_chr22_30160420_r 0 - +chr22 30665273 30665360 CCDS13901.1_cds_0_0_chr22_30665274_f 0 + +chr22 30939054 30939266 CCDS13903.1_cds_0_0_chr22_30939055_r 0 - +chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 + +chr5 131556601 131556672 CCDS4151.1_cds_0_0_chr5_131556602_r 0 - +chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 + +chr5 131847541 131847666 CCDS4155.1_cds_0_0_chr5_131847542_r 0 - +chr6 108299600 108299744 CCDS5061.1_cds_0_0_chr6_108299601_r 0 - +chr6 108594662 108594687 CCDS5063.1_cds_0_0_chr6_108594663_f 0 + +chr6 108640045 108640151 CCDS5064.1_cds_0_0_chr6_108640046_r 0 - +chr6 108722976 108723115 CCDS5067.1_cds_0_0_chr6_108722977_f 0 + +chr7 113660517 113660685 CCDS5760.1_cds_0_0_chr7_113660518_f 0 + +chr7 116512159 116512389 CCDS5771.1_cds_0_0_chr7_116512160_r 0 - +chr7 116714099 116714152 CCDS5773.1_cds_0_0_chr7_116714100_f 0 + +chr7 116945541 116945787 CCDS5774.1_cds_0_0_chr7_116945542_r 0 - +chr8 118881131 118881317 CCDS6324.1_cds_0_0_chr8_118881132_r 0 - +chr9 128764156 128764189 CCDS6914.1_cds_0_0_chr9_128764157_f 0 + +chr9 128787519 128789136 CCDS6915.1_cds_0_0_chr9_128787520_r 0 - +chr9 128882427 
128882523 CCDS6917.1_cds_0_0_chr9_128882428_f 0 + +chr9 128937229 128937445 CCDS6919.1_cds_0_0_chr9_128937230_r 0 - +chrX 122745047 122745924 CCDS14606.1_cds_0_0_chrX_122745048_f 0 + +chrX 152648964 152649196 CCDS14733.1_cds_0_0_chrX_152648965_r 0 - +chrX 152691446 152691471 CCDS14735.1_cds_0_0_chrX_152691447_f 0 + +chrX 152694029 152694263 CCDS14736.1_cds_0_0_chrX_152694030_r 0 -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.interval Mon Jul 28 11:30:22 2014 -0400 @@ -0,0 +1,5 @@ +chr1 4348187 4348589 3.70 4.90 2.55 0.24 0.46 +chr1 4488177 4488442 4.03 5.77 1.92 -0.67 0.81 +chr1 4774091 4774440 8.07 8.33 7.82 0.85 -0.40 +chr1 4800122 4800409 6.40 7.35 5.44 1.19 -0.42 +chr1 4878925 4879277 2.18 0.28 4.93 -0.96 1.24
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_maker_out1.interval Mon Jul 28 11:30:22 2014 -0400 @@ -0,0 +1,65 @@ +chr1 147962192 147962580 CCDS989.1_cds_0_0_chr1_147962193_r 0 - 388.0 +chr1 147984545 147984630 CCDS990.1_cds_0_0_chr1_147984546_f 0 + 85.0 +chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 - 182.0 +chr1 148185136 148185276 CCDS996.1_cds_0_0_chr1_148185137_f 0 + 140.0 +chr10 55251623 55253124 CCDS7248.1_cds_0_0_chr10_55251624_r 0 - 1501.0 +chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 - 94.0 +chr11 116206508 116206563 CCDS8377.1_cds_0_0_chr11_116206509_f 0 + 55.0 +chr11 116211733 116212337 CCDS8378.1_cds_0_0_chr11_116211734_r 0 - 604.0 +chr11 1812377 1812407 CCDS7726.1_cds_0_0_chr11_1812378_f 0 + 30.0 +chr12 38440094 38440321 CCDS8736.1_cds_0_0_chr12_38440095_r 0 - 227.0 +chr13 112381694 112381953 CCDS9526.1_cds_0_0_chr13_112381695_f 0 + 259.0 +chr14 98710240 98712285 CCDS9949.1_cds_0_0_chr14_98710241_r 0 - 2045.0 +chr15 41486872 41487060 CCDS10096.1_cds_0_0_chr15_41486873_r 0 - 188.0 +chr15 41673708 41673857 CCDS10097.1_cds_0_0_chr15_41673709_f 0 + 149.0 +chr15 41679161 41679250 CCDS10098.1_cds_0_0_chr15_41679162_r 0 - 89.0 +chr15 41826029 41826196 CCDS10101.1_cds_0_0_chr15_41826030_f 0 + 167.0 +chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 + 95.0 +chr16 179963 180135 CCDS10401.1_cds_0_0_chr16_179964_r 0 - 172.0 +chr16 244413 244681 CCDS10402.1_cds_0_0_chr16_244414_f 0 + 268.0 +chr16 259268 259383 CCDS10403.1_cds_0_0_chr16_259269_r 0 - 115.0 +chr18 23786114 23786321 CCDS11891.1_cds_0_0_chr18_23786115_r 0 - 207.0 +chr18 59406881 59407046 CCDS11985.1_cds_0_0_chr18_59406882_f 0 + 165.0 +chr18 59455932 59456337 CCDS11986.1_cds_0_0_chr18_59455933_r 0 - 405.0 +chr18 59600586 59600754 CCDS11988.1_cds_0_0_chr18_59600587_f 0 + 168.0 +chr19 59068595 59069564 CCDS12866.1_cds_0_0_chr19_59068596_f 0 + 969.0 +chr19 59236026 59236146 CCDS12872.1_cds_0_0_chr19_59236027_r 0 - 120.0 +chr19 59297998 
59298008 CCDS12877.1_cds_0_0_chr19_59297999_f 0 + 10.0 +chr19 59302168 59302288 CCDS12878.1_cds_0_0_chr19_59302169_r 0 - 120.0 +chr2 118288583 118288668 CCDS2120.1_cds_0_0_chr2_118288584_f 0 + 85.0 +chr2 118394148 118394202 CCDS2121.1_cds_0_0_chr2_118394149_r 0 - 54.0 +chr2 220190202 220190242 CCDS2441.1_cds_0_0_chr2_220190203_f 0 + 40.0 +chr2 220229609 220230869 CCDS2443.1_cds_0_0_chr2_220229610_r 0 - 1260.0 +chr20 33330413 33330423 CCDS13249.1_cds_0_0_chr20_33330414_r 0 - 10.0 +chr20 33513606 33513792 CCDS13255.1_cds_0_0_chr20_33513607_f 0 + 186.0 +chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 - 27.0 +chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 + 88.0 +chr21 32707032 32707192 CCDS13614.1_cds_0_0_chr21_32707033_f 0 + 160.0 +chr21 32869641 32870022 CCDS13615.1_cds_0_0_chr21_32869642_r 0 - 381.0 +chr21 33321040 33322012 CCDS13620.1_cds_0_0_chr21_33321041_f 0 + 972.0 +chr21 33744994 33745040 CCDS13625.1_cds_0_0_chr21_33744995_r 0 - 46.0 +chr22 30120223 30120265 CCDS13897.1_cds_0_0_chr22_30120224_f 0 + 42.0 +chr22 30160419 30160661 CCDS13898.1_cds_0_0_chr22_30160420_r 0 - 242.0 +chr22 30665273 30665360 CCDS13901.1_cds_0_0_chr22_30665274_f 0 + 87.0 +chr22 30939054 30939266 CCDS13903.1_cds_0_0_chr22_30939055_r 0 - 212.0 +chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 + 162.0 +chr5 131556601 131556672 CCDS4151.1_cds_0_0_chr5_131556602_r 0 - 71.0 +chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 + 93.0 +chr5 131847541 131847666 CCDS4155.1_cds_0_0_chr5_131847542_r 0 - 125.0 +chr6 108299600 108299744 CCDS5061.1_cds_0_0_chr6_108299601_r 0 - 144.0 +chr6 108594662 108594687 CCDS5063.1_cds_0_0_chr6_108594663_f 0 + 25.0 +chr6 108640045 108640151 CCDS5064.1_cds_0_0_chr6_108640046_r 0 - 106.0 +chr6 108722976 108723115 CCDS5067.1_cds_0_0_chr6_108722977_f 0 + 139.0 +chr7 113660517 113660685 CCDS5760.1_cds_0_0_chr7_113660518_f 0 + 168.0 +chr7 116512159 116512389 CCDS5771.1_cds_0_0_chr7_116512160_r 0 - 230.0 +chr7 
116714099 116714152 CCDS5773.1_cds_0_0_chr7_116714100_f 0 + 53.0 +chr7 116945541 116945787 CCDS5774.1_cds_0_0_chr7_116945542_r 0 - 246.0 +chr8 118881131 118881317 CCDS6324.1_cds_0_0_chr8_118881132_r 0 - 186.0 +chr9 128764156 128764189 CCDS6914.1_cds_0_0_chr9_128764157_f 0 + 33.0 +chr9 128787519 128789136 CCDS6915.1_cds_0_0_chr9_128787520_r 0 - 1617.0 +chr9 128882427 128882523 CCDS6917.1_cds_0_0_chr9_128882428_f 0 + 96.0 +chr9 128937229 128937445 CCDS6919.1_cds_0_0_chr9_128937230_r 0 - 216.0 +chrX 122745047 122745924 CCDS14606.1_cds_0_0_chrX_122745048_f 0 + 877.0 +chrX 152648964 152649196 CCDS14733.1_cds_0_0_chrX_152648965_r 0 - 232.0 +chrX 152691446 152691471 CCDS14735.1_cds_0_0_chrX_152691447_f 0 + 25.0 +chrX 152694029 152694263 CCDS14736.1_cds_0_0_chrX_152694030_r 0 - 234.0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_maker_out2.interval Mon Jul 28 11:30:22 2014 -0400 @@ -0,0 +1,5 @@ +chr1 4348187 4348589 3.70 4.90 2.55 0.24 0.46 3.7 +chr1 4488177 4488442 4.03 5.77 1.92 -0.67 0.81 4.03 +chr1 4774091 4774440 8.07 8.33 7.82 0.85 -0.40 8.07 +chr1 4800122 4800409 6.40 7.35 5.44 1.19 -0.42 6.4 +chr1 4878925 4879277 2.18 0.28 4.93 -0.96 1.24 2.18
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_maker_out3.interval Mon Jul 28 11:30:22 2014 -0400 @@ -0,0 +1,5 @@ +chr1 4348187 4348589 3.70 4.90 2.55 0.24 0.46 4 +chr1 4488177 4488442 4.03 5.77 1.92 -0.67 0.81 4 +chr1 4774091 4774440 8.07 8.33 7.82 0.85 -0.40 8 +chr1 4800122 4800409 6.40 7.35 5.44 1.19 -0.42 6 +chr1 4878925 4879277 2.18 0.28 4.93 -0.96 1.24 2