# HG changeset patch
# User jjohnson
# Date 1453381776 18000
# Node ID afdbc71983536abba3dad66ba1a44283c46edd57
# Parent ffa5e34a55c1013aceb6b04428fbc70795c0a28a
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/query_tabular commit 9ae87502ea7c3da33ecc453872c4eb2f41ecea4a-dirty
diff -r ffa5e34a55c1 -r afdbc7198353 query_tabular.py
--- a/query_tabular.py Sun Oct 04 10:52:21 2015 -0400
+++ b/query_tabular.py Thu Jan 21 08:09:36 2016 -0500
@@ -2,192 +2,246 @@
"""
"""
import sys
+import re
import os.path
+import json
import sqlite3 as sqlite
import optparse
from optparse import OptionParser
"""
-TODO:
-- could add some transformations on tabular columns, e.g. a regex to format date/time strings
+TODO:
+- could read column names from comment lines, but issues with legal names
+- could add some transformations on tabular columns,
+ e.g. a regex to format date/time strings
+ c2 : re.sub('pat', 'sub', c2)
+ c3 :
+- column_defs dict of columns to create from tabular input
+ column_defs : { 'name1' : 'expr', 'name2' : 'expr'}
- allow multiple queries and outputs
- add a --json input for table definitions (or yaml)
JSON config:
{ tables : [
- { file_path : '/home/galaxy/dataset_101.dat',
- table_name : 't1',
- column_names : ['c1','c2','c3']
+ { file_path : '/home/galaxy/dataset_101.dat',
+ table_name : 't1',
+ column_names : ['c1', 'c2', 'c3'],
+ comment_lines : 1
},
- { file_path : '/home/galaxy/dataset_102.dat',
- table_name : 't2',
- column_names : ['c1','c2','c3']
+ { file_path : '/home/galaxy/dataset_102.dat',
+ table_name : 't2',
+ column_names : ['c1', 'c2', 'c3']
+ },
+ { file_path : '/home/galaxy/dataset_103.dat',
+ table_name : 'test',
+ column_names : ['c1', 'c2', 'c3']
}
- ]
+ ]
}
"""
+tables_query = \
+ "SELECT name, sql FROM sqlite_master WHERE type='table' ORDER BY name"
+
+
def getValueType(val):
- if val or 0. == val:
- try:
- int(val)
- return 'INTEGER'
- except:
- try:
- float(val)
- return 'REAL'
- except:
- return 'TEXT'
- return None
-
-
-def get_column_def(file_path,table_name,skip=0,comment_char='#',column_names=None,max_lines=100):
- col_pref = ['TEXT','REAL','INTEGER',None]
- col_types = []
- data_lines = 0
- try:
- with open(file_path,"r") as fh:
- for linenum,line in enumerate(fh):
- if linenum < skip:
- continue
- if line.startswith(comment_char):
- continue
- data_lines += 1
+ if val or 0. == val:
try:
- fields = line.split('\t')
- while len(col_types) < len(fields):
- col_types.append(None)
- for i,val in enumerate(fields):
- colType = getValueType(val)
- if col_pref.index(colType) < col_pref.index(col_types[i]):
- col_types[i] = colType
- except Exception, e:
- print >> sys.stderr, 'Failed at line: %d err: %s' % (linenum,e)
- except Exception, e:
- print >> sys.stderr, 'Failed: %s' % (e)
- for i,col_type in enumerate(col_types):
- if not col_type:
- col_types[i] = 'TEXT'
- col_names = ['c%d' % i for i in range(1,len(col_types) + 1)]
- if column_names:
- for i,cname in enumerate([cn.strip() for cn in column_names.split(',')]):
- if cname and i < len(col_names):
- col_names[i] = cname
- col_def = []
- for i,col_name in enumerate(col_names):
- col_def.append('%s %s' % (col_names[i],col_types[i]))
- return col_names,col_types,col_def
-
-def create_table(conn,file_path,table_name,skip=0,comment_char='#',column_names=None):
- col_names,col_types,col_def = get_column_def(file_path,table_name,skip=skip,comment_char=comment_char,column_names=column_names)
- col_func = [float if t == 'REAL' else int if t == 'INTEGER' else str for t in col_types]
- table_def = 'CREATE TABLE %s (\n %s\n);' % (table_name,',\n '.join(col_def))
- # print >> sys.stdout, table_def
- insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (table_name,','.join(col_names),','.join([ "?" for x in col_names]))
- # print >> sys.stdout, insert_stmt
- data_lines = 0
- try:
- c = conn.cursor()
- c.execute(table_def)
- with open(file_path,"r") as fh:
- for linenum,line in enumerate(fh):
- if linenum < skip or line.startswith(comment_char):
- continue
- data_lines += 1
- try:
- fields = line.split('\t')
- vals = [col_func[i](x) if x else None for i,x in enumerate(fields)]
- c.execute(insert_stmt,vals)
- except Exception, e:
- print >> sys.stderr, 'Failed at line: %d err: %s' % (linenum,e)
- conn.commit()
- c.close()
- except Exception, e:
- print >> sys.stderr, 'Failed: %s' % (e)
- exit(1)
+ int(val)
+ return 'INTEGER'
+ except:
+ try:
+ float(val)
+ return 'REAL'
+ except:
+ return 'TEXT'
+ return None
+
+
+def get_column_def(file_path, table_name, skip=0, comment_char='#',
+ column_names=None, max_lines=100):
+ col_pref = ['TEXT', 'REAL', 'INTEGER', None]
+ col_types = []
+ data_lines = 0
+ try:
+ with open(file_path, "r") as fh:
+ for linenum, line in enumerate(fh):
+ if linenum < skip:
+ continue
+ if line.startswith(comment_char):
+ continue
+ data_lines += 1
+ try:
+ fields = line.split('\t')
+ while len(col_types) < len(fields):
+ col_types.append(None)
+ for i, val in enumerate(fields):
+ colType = getValueType(val)
+ if col_pref.index(colType) < col_pref.index(col_types[i]):
+ col_types[i] = colType
+ except Exception, e:
+ print >> sys.stderr, 'Failed at line: %d err: %s' % (linenum, e)
+ except Exception, e:
+ print >> sys.stderr, 'Failed: %s' % (e)
+ for i, col_type in enumerate(col_types):
+ if not col_type:
+ col_types[i] = 'TEXT'
+ col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+ if column_names:
+ for i, cname in enumerate([cn.strip() for cn in column_names.split(',')]):
+ if cname and i < len(col_names):
+ col_names[i] = cname
+ col_def = []
+ for i, col_name in enumerate(col_names):
+ col_def.append('%s %s' % (col_names[i], col_types[i]))
+ return col_names, col_types, col_def
+
+
+def create_table(conn, file_path, table_name, skip=0, comment_char='#', column_names=None):
+ col_names, col_types, col_def = get_column_def(file_path, table_name, skip=skip, comment_char=comment_char, column_names=column_names)
+ col_func = [float if t == 'REAL' else int if t == 'INTEGER' else str for t in col_types]
+ table_def = 'CREATE TABLE %s (\n %s\n);' % (table_name, ', \n '.join(col_def))
+ # print >> sys.stdout, table_def
+ insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (table_name, ','.join(col_names), ','.join(["?" for x in col_names]))
+ # print >> sys.stdout, insert_stmt
+ data_lines = 0
+ try:
+ c = conn.cursor()
+ c.execute(table_def)
+ with open(file_path, "r") as fh:
+ for linenum, line in enumerate(fh):
+ if linenum < skip or line.startswith(comment_char):
+ continue
+ data_lines += 1
+ try:
+ fields = line.rstrip('\r\n').split('\t')
+ vals = [col_func[i](x) if x else None for i, x in enumerate(fields)]
+ c.execute(insert_stmt, vals)
+ except Exception, e:
+ print >> sys.stderr, 'Failed at line: %d err: %s' % (linenum, e)
+ conn.commit()
+ c.close()
+ except Exception, e:
+ print >> sys.stderr, 'Failed: %s' % (e)
+ exit(1)
+
+
+def regex_match(expr, item):
+ return re.match(expr, item) is not None
+
+
+def regex_search(expr, item):
+ return re.search(expr, item) is not None
+
+
+def regex_sub(expr, replace, item):
+ return re.sub(expr, replace, item)
+
+
+def get_connection(sqlitedb_path, addfunctions=False):
+ conn = sqlite.connect(sqlitedb_path)
+ if addfunctions:
+ conn.create_function("re_match", 2, regex_match)
+ conn.create_function("re_search", 2, regex_search)
+ conn.create_function("re_sub", 3, regex_sub)
+ return conn
+
def __main__():
- #Parse Command Line
- parser = optparse.OptionParser()
- parser.add_option( '-s', '--sqlitedb', dest='sqlitedb', default=None, help='The SQLite Database' )
- parser.add_option( '-t', '--table', dest='tables', action="append", default=[], help='Tabular file: file_path[=table_name[:column_name,...]' )
- parser.add_option( '-q', '--query', dest='query', default=None, help='SQL query' )
- parser.add_option( '-Q', '--query_file', dest='query_file', default=None, help='SQL query file' )
- parser.add_option( '-n', '--no_header', dest='no_header', action='store_true', default=False, help='Include a column headers line' )
- parser.add_option( '-o', '--output', dest='output', default=None, help='Output file for query results' )
- (options, args) = parser.parse_args()
+ # Parse Command Line
+ parser = optparse.OptionParser()
+ parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None, help='The SQLite Database')
+ parser.add_option('-t', '--table', dest='tables', action="append", default=[], help='Tabular file: file_path[=table_name[:column_name, ...]')
+ parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None, help='Tabular file: file_path[=table_name[:column_name, ...]')
+ parser.add_option('-q', '--query', dest='query', default=None, help='SQL query')
+ parser.add_option('-Q', '--query_file', dest='query_file', default=None, help='SQL query file')
+ parser.add_option('-n', '--no_header', dest='no_header', action='store_true', default=False, help='Include a column headers line')
+ parser.add_option('-o', '--output', dest='output', default=None, help='Output file for query results')
+ (options, args) = parser.parse_args()
- # oprn sqlite connection
- conn = sqlite.connect(options.sqlitedb)
- # determine output destination
- if options.output != None:
- try:
- outputPath = os.path.abspath(options.output)
- outputFile = open(outputPath, 'w')
- except Exception, e:
- print >> sys.stderr, "failed: %s" % e
- exit(3)
- else:
- outputFile = sys.stdout
-
- # determine output destination
- for ti,table in enumerate(options.tables):
- table_name = 't%d' % (ti + 1)
- column_names = None
- fields = table.split('=')
- path = fields[0]
- if len(fields) > 1:
- names = fields[1].split(':')
- table_name = names[0] if names[0] else table_name
- if len(names) > 1:
- column_names = names[1]
- # print >> sys.stdout, '%s %s' % (table_name, path)
- create_table(conn,path,table_name,column_names=column_names)
- conn.close()
+ # open sqlite connection
+ conn = get_connection(options.sqlitedb)
+ # determine output destination
+ if options.output is not None:
+ try:
+ outputPath = os.path.abspath(options.output)
+ outputFile = open(outputPath, 'w')
+ except Exception, e:
+ print >> sys.stderr, "failed: %s" % e
+ exit(3)
+ else:
+ outputFile = sys.stdout
- query = None
- if (options.query_file != None):
- with open(options.query_file,'r') as fh:
- query = ''
- for line in fh:
- query += line
- elif (options.query != None):
- query = options.query
-
- if (query is None):
- try:
- conn = sqlite.connect(options.sqlitedb)
- c = conn.cursor()
- tables_query = "SELECT name,sql FROM sqlite_master WHERE type='table' ORDER BY name"
- rslt = c.execute(tables_query).fetchall()
- for table,sql in rslt:
- print >> sys.stderr, "Table %s:" % table
+ # get table defs
+ if options.tables:
+ for ti, table in enumerate(options.tables):
+ table_name = 't%d' % (ti + 1)
+ column_names = None
+ fields = table.split('=')
+ path = fields[0]
+ if len(fields) > 1:
+ names = fields[1].split(':')
+ table_name = names[0] if names[0] else table_name
+ if len(names) > 1:
+ column_names = names[1]
+ # print >> sys.stdout, '%s %s' % (table_name, path)
+ create_table(conn, path, table_name, column_names=column_names)
+ if options.jsonfile:
try:
- col_query = 'SELECT * FROM %s LIMIT 0' % table
- cur = conn.cursor().execute(col_query)
- cols = [col[0] for col in cur.description]
- print >> sys.stderr, " Columns: %s" % cols
+ fh = open(options.jsonfile)
+ tdef = json.load(fh)
+ if 'tables' in tdef:
+ for ti, table in enumerate(tdef['tables']):
+ path = table['file_path']
+ table_name = table['table_name'] if 'table_name' in table else 't%d' % (ti + 1)
+ column_names = table['column_names'] if 'column_names' in table else None
+ comment_lines = table['comment_lines'] if 'comment_lines' in table else 0
+ create_table(conn, path, table_name, column_names=column_names, skip=comment_lines)
except Exception, exc:
- print >> sys.stderr, "Error: %s" % exc
+ print >> sys.stderr, "Error: %s" % exc
+ conn.close()
+
+ query = None
+ if (options.query_file is not None):
+ with open(options.query_file, 'r') as fh:
+ query = ''
+ for line in fh:
+ query += line
+ elif (options.query is not None):
+ query = options.query
+
+ if (query is None):
+ try:
+ conn = get_connection(options.sqlitedb)
+ c = conn.cursor()
+ rslt = c.execute(tables_query).fetchall()
+ for table, sql in rslt:
+ print >> sys.stderr, "Table %s:" % table
+ try:
+ col_query = 'SELECT * FROM %s LIMIT 0' % table
+ cur = conn.cursor().execute(col_query)
+ cols = [col[0] for col in cur.description]
+ print >> sys.stderr, " Columns: %s" % cols
+ except Exception, exc:
+ print >> sys.stderr, "Error: %s" % exc
+ except Exception, exc:
+ print >> sys.stderr, "Error: %s" % exc
+ exit(0)
+ # if not sqlite.is_read_only_query(query):
+ # print >> sys.stderr, "Error: Must be a read only query"
+ # exit(2)
+ try:
+ conn = get_connection(options.sqlitedb, addfunctions=True)
+ cur = conn.cursor()
+ results = cur.execute(query)
+ if not options.no_header:
+ outputFile.write("#%s\n" % '\t'.join([str(col[0]) for col in cur.description]))
+ # yield [col[0] for col in cur.description]
+ for i, row in enumerate(results):
+ # yield [val for val in row]
+ outputFile.write("%s\n" % '\t'.join([str(val) for val in row]))
except Exception, exc:
- print >> sys.stderr, "Error: %s" % exc
- exit(0)
- #if not sqlite.is_read_only_query(query):
- # print >> sys.stderr, "Error: Must be a read only query"
- # exit(2)
- try:
- conn = sqlite.connect(options.sqlitedb)
- cur = conn.cursor()
- results = cur.execute(query)
- if not options.no_header:
- outputFile.write("#%s\n" % '\t'.join([str(col[0]) for col in cur.description]))
- # yield [col[0] for col in cur.description]
- for i,row in enumerate(results):
- # yield [val for val in row]
- outputFile.write("%s\n" % '\t'.join([str(val) for val in row]))
- except Exception, exc:
- print >> sys.stderr, "Error: %s" % exc
- exit(1)
+ print >> sys.stderr, "Error: %s" % exc
+ exit(1)
-if __name__ == "__main__": __main__()
-
-
+if __name__ == "__main__":
+ __main__()
diff -r ffa5e34a55c1 -r afdbc7198353 query_tabular.xml
--- a/query_tabular.xml Sun Oct 04 10:52:21 2015 -0400
+++ b/query_tabular.xml Thu Jan 21 08:09:36 2016 -0500
@@ -6,11 +6,6 @@
-
-
-$sqlquery
-
-
+
+
+$sqlquery
+
+
+#import json
+#set $jtbldef = dict()
+#set $jtbls = []
+#set $jtbldef['tables'] = $jtbls
+#for $i,$tbl in enumerate($tables):
+ #set $jtbl = dict()
+ #set $jtbl['file_path'] = str($tbl.table)
+ #if $tbl.table_name
+ #set $tname = str($tbl.table_name)
+ #else
+ #set $tname = 't' + str($i + 1)
+ #end if
+ #set $jtbl['table_name'] = $tname
+ #if $tbl.col_names:
+ #set $col_names = str($tbl.col_names)
+ #else
+ #set $col_names = ''
+ #end if
+ #set $jtbl['column_names'] = $col_names
+ #if str($tbl.skip_lines) != '':
+ #set $jtbl['comment_lines'] = $tbl.skip_lines
+ #elif $tbl.table.metadata.comment_lines > 0:
+ #set $jtbl['comment_lines'] = int($tbl.table.metadata.comment_lines)
+ #end if
+ #set $jtbls += [$jtbl]
+#end for
+#echo $json.dumps($jtbldef)
+
+
@@ -52,6 +84,7 @@
By default, table columns will be named: c1,c2,c3,...,cn
^([A-Za-z]\w*)?(,([A-Za-z]\w*)?)*$
+
By default, tables will be named: t1,t2,...,tn
@@ -63,10 +96,10 @@
-
+
save_db or not (sqlquery and len(sqlquery) > 0)
-
+
sqlquery and len(sqlquery) > 0
@@ -84,7 +117,7 @@
-
+
@@ -96,7 +129,16 @@
-
+
+
+
+
+
+
+
+
+
+
@@ -111,7 +153,7 @@
-
+
@@ -207,7 +249,7 @@
Results in the following data base table
=========== ========== ========== ===================== ========== ============
- #c1 FirstName LastName c4 BithDate c6
+ #c1 FirstName LastName c4 BirthDate c6
=========== ========== ========== ===================== ========== ============
1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
@@ -215,6 +257,46 @@
4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
=========== ========== ========== ===================== ========== ============
+ Regular_expression_ functions are included for:
+
+ ::
+
+ matching: re_match('pattern',column)
+
+ SELECT t1.FirstName, t1.LastName
+ FROM t1
+ WHERE re_match('^.*\.(net|org)$',c4)
+
+ Results:
+
+ =========== ==========
+ #FirstName LastName
+ =========== ==========
+ Steven Goldfish
+ Paula Brown
+ =========== ==========
+
+
+ ::
+
+ searching: re_search('pattern',column)
+      substituting:  re_sub('pattern','replacement',column)
+
+ SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB"
+ FROM t1
+ WHERE re_search('[hp]er',c4)
+
+ Results:
+
+ =========== ========== ==========
+ #FirstName LastName DOB
+ =========== ========== ==========
+ Steven Goldfish 04/04/74
+ Paula Brown 24/05/78
+ James Smith 20/10/80
+ =========== ========== ==========
+
+.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html
.. _SQLite: http://www.sqlite.org/index.html
]]>
diff -r ffa5e34a55c1 -r afdbc7198353 test-data/._IEDB.tsv
Binary file test-data/._IEDB.tsv has changed
diff -r ffa5e34a55c1 -r afdbc7198353 test-data/._netMHC_summary.tsv
Binary file test-data/._netMHC_summary.tsv has changed
diff -r ffa5e34a55c1 -r afdbc7198353 test-data/._query_results.tsv
Binary file test-data/._query_results.tsv has changed
diff -r ffa5e34a55c1 -r afdbc7198353 test-data/._sales_results.tsv
Binary file test-data/._sales_results.tsv has changed
diff -r ffa5e34a55c1 -r afdbc7198353 test-data/regex_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regex_results.tsv Thu Jan 21 08:09:36 2016 -0500
@@ -0,0 +1,4 @@
+#FirstName LastName DOB
+Steven Goldfish 04/04/74
+Paula Brown 24/05/78
+James Smith 20/10/80