# HG changeset patch # User recetox # Date 1607479460 0 # Node ID 0369de831b32e250b85ed0a2e6701633f3d29e7f "planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/query commit e781279d988f26eff9ccfff898a7d8cc4f0ceafb" diff -r 000000000000 -r 0369de831b32 query.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.py Wed Dec 09 02:04:20 2020 +0000 @@ -0,0 +1,56 @@ +import json +from typing import Tuple + +import click +import pandas +import pandasql +from pandas import DataFrame + + +def read(path: str, filetype: str, name: str) -> Tuple[str, DataFrame]: + if filetype == 'csv': + return name, pandas.read_csv(path) + elif filetype in ('tsv', 'tabular'): + return name, pandas.read_table(path) + elif filetype in ('h5', 'hdf'): + return name, pandas.read_hdf(path, name) + elif filetype == 'feather': + return name, pandas.read_feather(path) + elif filetype == 'parquet': + return name, pandas.read_parquet(path) + elif filetype == 'sqlite': + return pandas.read_sql(name, f'sqlite:///{path}') + else: + raise NotImplementedError(f'Unknown filetype {filetype}') + + +def write(df: DataFrame, path: str, filetype: str, name: str) -> None: + if filetype == 'csv': + df.to_csv(path) + elif filetype in ('tsv', 'tabular'): + df.to_csv(path, sep='\t') + elif filetype in ('h5', 'hdf'): + with pandas.HDFStore(path) as file: + file.append(name, df, data_columns=list(df.columns)) + elif filetype == 'feather': + df.to_feather(path) + elif filetype == 'parquet': + df.to_parquet(path) + elif filetype == 'sqlite': + df.to_sql(name, f'sqlite:///{path}') + else: + raise NotImplementedError(f'Unknown filetype {filetype}') + + +@click.command() +@click.argument('config', type=click.File()) +def main(config) -> None: + config = json.load(config) + + tables = dict(read(table['path'], table['format'], table['name']) for table in config['tables']) + result = pandasql.sqldf(config['query'], tables) + write(result, config['result']['path'], config['result']['format'], config['result']['name']) + + +if __name__ == '__main__': + main() diff -r 000000000000 -r 0369de831b32 query.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.xml Wed Dec 09 02:04:20 2020 +0000 @@ -0,0 +1,134 @@ + + + 0.1 + + + + + + + + + + + + + + + + + click + pyarrow + pytables + pandas + pandasql + python + + + + + + + + + + + + + + Force a particular file format to get around formats unknown to Galaxy, ie. Parquet. + + + + + Choose an unique name for the table which then you may use in the SQL Query. By default tables will be named as: t0, t1, ... + + + + + + Type an arbitrary SQL SELECT to perform on the input tables. The input tables may be referred by their given names. + An example query is SELECT employees.name AS employee, emails.value AS email FROM one JOIN b ON employees.id = emails.employee, provided that the input tables are named employees and emails. + + + + + + Specify the file format of the query result. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 000000000000 -r 0369de831b32 test-data/a.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/a.csv Wed Dec 09 02:04:20 2020 +0000 @@ -0,0 +1,4 @@ +x,y,z +1,1,a +2,2,b +3,3,c diff -r 000000000000 -r 0369de831b32 test-data/a.parquet Binary file test-data/a.parquet has changed diff -r 000000000000 -r 0369de831b32 test-data/a.sqlite Binary file test-data/a.sqlite has changed diff -r 000000000000 -r 0369de831b32 test-data/ab.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ab.csv Wed Dec 09 02:04:20 2020 +0000 @@ -0,0 +1,4 @@ +,ax,ay,az,bx,by,bz +0,1,1,a,1,2,3 +1,2,2,b,2,3,4 +2,3,3,c,3,4,5 diff -r 000000000000 -r 0369de831b32 test-data/b.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/b.csv Wed Dec 09 02:04:20 2020 +0000 @@ -0,0 +1,4 @@ +x,y,z +1,2,3 +2,3,4 +3,4,5