Mercurial > repos > rhpvorderman > data_manager_select_index_by_path
changeset 13:0a1afc109ad9 draft
planemo upload for repository https://github.com/LUMC/lumc-galaxy-tools/tree/master/data_manager_select_index_by_path commit 72334942019bfd598086f39ba93d4b4fce3cda19
author | rhpvorderman |
---|---|
date | Tue, 03 Jul 2018 10:29:48 -0400 |
parents | 680110ffdcfe |
children | 9ac9089b1914 |
files | README data_manager/__pycache__/path_name_value_key_manager.cpython-35-PYTEST.pyc data_manager/__pycache__/path_name_value_key_manager.cpython-35.pyc data_manager/__pycache__/test_path_name_value_key_manager.cpython-35-PYTEST.pyc data_manager/data_manager_select_index_by_path.xml data_manager/indexes.yml data_manager/path_name_value_key_manager.py data_manager_conf.xml test.json tool-data/rnastar_index2.loc.sample |
diffstat | 10 files changed, 319 insertions(+), 107 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Jul 03 10:29:48 2018 -0400 @@ -0,0 +1,2 @@ +This is a fork of the data_manager_all_fasta_by_path (https://github.com/Christian-B/galaxy_shedtools/tree/master/all_fasta_by_path) data manager by Christian-B (https://github.com/Christian-B). +The all_fasta_by_path data manager was forked on 2017-09-07 from Christian-B's galaxy_shedtools (https://github.com/Christian-B/galaxy_shedtools) repository at commit d9f5343.
Binary file data_manager/__pycache__/test_path_name_value_key_manager.cpython-35-PYTEST.pyc has changed
--- a/data_manager/data_manager_select_index_by_path.xml Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager/data_manager_select_index_by_path.xml Tue Jul 03 10:29:48 2018 -0400 @@ -1,13 +1,20 @@ -<tool id="data_manager_select_index_by_path" name="Select index by path manager" tool_type="manage_data" version="0.0.2"> +<tool id="data_manager_select_index_by_path" name="Select index by path manager" tool_type="manage_data" version="0.0.3"> + <requirements> + <!-- Away with python 2! --> + <requirement type="package" version="3">python</requirement> + </requirements> <description>path inputer</description> - <command interpreter="python"> + <command detect_errors="exit_code" interpreter="python"> path_name_value_key_manager.py --value "${value}" --dbkey "${dbkey}" --name "${name}" --path "${path}" --data_table_name "${data_table}" - --json_output_file "${json_output_file}" + --json_output_file "${json_output_file} + #if $data_table == "rnastar_index2" + --extra-columns {'with-gtf': '$data_table.with_gtf'} + #end if </command> <inputs> <param name="value" type="text" value="" label="value field for the entry. Defaults to name if left blank." /> @@ -15,25 +22,31 @@ <param name="name" type="text" value="" label="name field for the entry. Defaults to the file name from path if left blank." 
/> <param name="path" type="text" value="" label="path field for the entry" /> <param name="data_table" type="select" value="" label="data table for the index"> - <option value='all_fasta'>all_fasta</option> - <option value='bowtie2_indexes'>bowtie2_indexes</option> - <option value='bowtie_indexes'>bowtie_indexes</option> - <option value='bowtie_indexes_color'>bowtie_indexes_color</option> - <option value='bwa_mem_indexes'>bwa_mem_indexes</option> - <option value='bwameth_indexes'>bwameth_indexes</option> - <option value='fasta_indexes'>fasta_indexes</option> - <option value='gatk_picard_indexes'>gatk_picard_indexes</option> - <option value='gene_transfer'>gene_transfer</option> - <option value='hisat2_indexes'>hisat2_indexes</option> - <option value='kallisto_indexes'>kallisto_indexes</option> - <option value='picard_indexes'>picard_indexes</option> - <option value='tophat2_indexes'>tophat2_indexes</option> + <option value='all_fasta'>all_fasta</option> + <option value='bowtie2_indexes'>bowtie2_indexes</option> + <option value='bowtie_indexes'>bowtie_indexes</option> + <option value='bowtie_indexes_color'>bowtie_indexes_color</option> + <option value='bwa_mem_indexes'>bwa_mem_indexes</option> + <option value='bwameth_indexes'>bwameth_indexes</option> + <option value='fasta_indexes'>fasta_indexes</option> + <option value='gatk_picard_indexes'>gatk_picard_indexes</option> + <option value='gene_transfer'>gene_transfer</option> + <option value='hisat2_indexes'>hisat2_indexes</option> + <option value='kallisto_indexes'>kallisto_indexes</option> + <option value='picard_indexes'>picard_indexes</option> + <option value='tophat2_indexes'>tophat2_indexes</option> + <option value="rnastar_index2">rnastar_index2</option> + <when value="rnastar_index2"> + <param name="with_gtf" type="select" value="" label="Index with embedded gtf?"> + <option value="0">No</option> + <option value="1">Yes</option> + </param> + </when> </param> </inputs> <outputs> <data name="json_output_file" 
format="data_manager_json"/> </outputs> - <help> Adds a server path to the selected data table.
--- a/data_manager/indexes.yml Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager/indexes.yml Tue Jul 03 10:29:48 2018 -0400 @@ -1,20 +1,63 @@ +--- +# This is a file containing information about all the indexes. +# +# Top keys are table names as used in Galaxy. +# These names can be viewed in the 'local data' part of the admin menu +# +# Keys for each table +# name: +# (STRING) The name of the index. +# This is used for error reporting in the program +# +# prefix: +# (BOOLEAN) whether the index is a prefix. For example +# for bwa_mem-indexes, the index path is 'reference.fa'. +# This is a prefix because all the reference files are: +# 'reference.fa.amb', 'reference.fa.ann' etc. +# +# prefix_strip_extension: +# (BOOLEAN) whether the prefix should be stripped +# of its extensions. Ie from 'reference.fa' to +# 'reference'. For a picard index also a 'reference.dict' +# should be present, so the prefix needs to be stripped of +# its extension to look for the index files. +# +# extensions: +# (LIST[STRING]) a list of strings with the extensions: +# for example: +# extensions: +# - .fai +# +# folder: +# (LIST[STRING]) Use this when the index is not a prefix but a folder +# the program will check if all the files in the list are present. +# If they are not, an exception will follow. +# +# extra_columns: +# (LIST[STRING]) Usual indexes have 4 columns in the data table: path, name, +# value, dbkey. But some indexes have additional columns. rnastar_index2 +# needs a 'with-gtf' column for instance. Add these columns to the list to +# make sure their presence, or non-presence is checked. 
+ all_fasta: name: fasta file - extensions: - - .fa - no_prefix: True + prefix: false + bowtie2_indexes: name: bowtie2 index extensions: - .bt2 + bowtie_indexes: name: bowtie index extensions: - .ebwt + bowtie_indexes_color: name: bowtie color index extensions: - .ebwt + bwa_mem_indexes: name: bwa mem index extensions: @@ -23,27 +66,53 @@ - .bwt - .pac - .sa + bwameth_indexes: name: bwa_meth_index fasta_indexes: name: fasta index extensions: - .fai + gatk_picard_index: name: picard index for GATK + gene_transfer: name: Gene Transfer File extensions: - .gtf + hisat2_indexes: name: hisat2 index extensions: - .ht2 + kallisto_indexes: name: kallisto index - no_prefix: True + prefix: false + picard_indexes: name: picard index + prefix_strip_extension: true + extensions: + - ".fa" + - ".dict" + +rnastar_index2: + name: "Star index" + prefix: false + extra_columns: + - with-gtf + folder: + - chrLength.txt + - chrNameLength.txt + - chrStart.txt + - chrName.txt + - Genome + - SA + - SAindex + - genomeParameters.txt + tophat2_indexes: name: tophat2 index extensions:
--- a/data_manager/path_name_value_key_manager.py Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager/path_name_value_key_manager.py Tue Jul 03 10:29:48 2018 -0400 @@ -1,104 +1,201 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +"""Script to create data manager jsons""" +import argparse import json -import argparse -import os +from pathlib import Path + import yaml -def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): - data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) - data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) - data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) - return data_manager_dict - -def check_param(name, value, default=None, check_tab=True): - if value in [ None, '', '?' ]: - if default: - print "Using {0} for {1} as no value provided".format( default, name ) - value = default - else: - raise Exception( '{0} is not a valid {1}. You must specify a valid {1}.'.format( value, name ) ) - if check_tab and "\t" in value: - raise Exception( '{0} is not a valid {1}. It may not contain a tab because these are used as seperators by galaxy .'.format( value, name ) ) - return value +def argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--value', type=str, help='value') + parser.add_argument('--dbkey', type=str, help='dbkey') + parser.add_argument('--name', type=str, help='name') + parser.add_argument('--path', type=Path, help='path', + required=True) + parser.add_argument('--data_table_name', action='store', type=str, + help='Name of the data table', + required=True) + parser.add_argument('--json_output_file', action='store', type=Path, + help='Json output file', + required=True) + parser.add_argument("--extra-columns", type=str, + help='Yaml formatted string with extra columns ' + 'and their values. 
For example ' + '\'{"with-gtf":"0"}\' for STAR indexes') + return parser -def prefix_exists(directory, prefix): - '''checks if files exist with prefix in a directory. Returns Boolean''' - matched_files = [] - directory_files = os.listdir(directory) - for directory_file in directory_files: - if directory_file.startswith(prefix): - matched_files.append(directory_file) - # Empty list should return False - return bool(matched_files) + +def check_tab(name: str, value: str): + if '\t' in value: + raise ValueError( + '\'{0}\' is not a valid \'{1}\'. It may not contain a tab because ' + 'these are used as seperators by galaxy .'.format( + value, name)) -def prefix_plus_extension_exists(directory, prefix, extension): - '''checks if files exist with prefix in a directory. Returns Boolean''' - matched_files = [] - directory_files = os.listdir(directory) - for directory_file in directory_files: - if directory_file.startswith(prefix) and directory_file.endswith(extension): - matched_files.append(directory_file) + +def prefix_plus_extension_exists(directory: Path, prefix: str, extension: str): + """checks if files exist with prefix in a directory. Returns Boolean""" + matched_files = [directory_file for directory_file in directory.iterdir() + if + directory_file.name.startswith( + prefix) and directory_file.suffix == extension] # Empty list should return False return bool(matched_files) -def main(): + +class DataTable(object): - #value = "test_value" - #name = "test_name" - #print '{0} other {1} more{0}'.format(value, name ) - #print '{0} is not a valid {1}. 
It may not contain a tab.'.format( value, name ) + def __init__(self, + index_path: Path, + data_table_name: str, + indexes_properties_file: Path, + name: str = None, + dbkey: str = None, + value: str = None, + extra_columns: dict = None + ): + self.index_path = index_path + self.data_table_name = data_table_name + self.name = name if name else str(self.index_path.with_suffix( + '').name) + self.value = value if value is not None else self.name + self.dbkey = dbkey if dbkey is not None else self.value + self.extra_columns = extra_columns if extra_columns is not None else {} + self.indexes_properties_file = indexes_properties_file + + self.check_params() + + self.index_properties = self.get_index_properties() + + self.check_index_file_presence() + + def check_params(self): + + check_tab('name', self.name) + check_tab('index_path', str(self.index_path.absolute().name)) + check_tab('value', self.value) + check_tab('dbkey', self.dbkey) + self.check_extra_columns() - #Parse Command Line - parser = argparse.ArgumentParser() - parser.add_argument( '--value', action='store', type=str, default=None, help='value' ) - parser.add_argument( '--dbkey', action='store', type=str, default=None, help='dbkey' ) - parser.add_argument( '--name', action='store', type=str, default=None, help='name' ) - parser.add_argument( '--path', action='store', type=str, default=None, help='path' ) - parser.add_argument( '--data_table_name', action='store', type=str, default=None, help='path' ) - parser.add_argument( '--json_output_file', action='store', type=str, default=None, help='path' ) - options = parser.parse_args() + def check_extra_columns(self): + index_properties = self.get_index_properties() + index_extra_columns = set(index_properties.get("extra_columns", [])) + given_extra_columns = self.extra_columns.keys() + if index_extra_columns != given_extra_columns: + if len(index_extra_columns) > 0: + raise ValueError( + "Values for the following columns should be " + "supplied: {0}.".format( + 
str(index_extra_columns).strip("{}"))) + if len(index_extra_columns) == 0: + raise ValueError( + "The table \'{0}\' does not have extra columns".format( + self.data_table_name)) + for key, value in self.extra_columns.items(): + check_tab(key, value) - path = check_param("path", options.path) - basename = os.path.basename(path) - filename = os.path.splitext(basename)[0] - name = check_param("name", options.name, default=filename) - value = check_param("value", options.value, default=name) - dbkey = check_param("dbkey", options.dbkey, default=value) - data_table_name = check_param("data_table_name", options.data_table_name) - json_output_file = check_param("json_output_file", options.json_output_file, check_tab=False) + def get_index_properties(self) -> dict: + with self.indexes_properties_file.open('r') as properties_file: + indexes = yaml.safe_load(properties_file) + index_properties = indexes.get(self.data_table_name) + if index_properties is None: + raise ValueError( + "\'{0}\' not a supported table name".format( + self.data_table_name)) + return index_properties + + def check_index_file_presence(self): + index_name = self.index_properties.get( + 'name', + '[Index name not found. Please report to developers]') + index_extensions = self.index_properties.get('extensions', ['']) + + # Sometimes an index path is a prefix. + # For example, with BWA. 'reference.fa' is the index. + # But the actual index files are + # 'reference.fa.amb', 'reference.fa.ann' etc. 
- # Check if file or prefix exists - indexes = yaml.load(file(os.path.join(os.path.dirname(__file__), 'indexes.yml'))) - index_dict = indexes.get(data_table_name,{}) - index_name = index_dict.get('name','index') - index_extensions = index_dict.get('extensions', ['']) - no_prefix = index_dict.get('no_prefix', False) - if not no_prefix: - dirname = os.path.dirname(path) - prefix = basename - for extension in index_extensions: - if not prefix_plus_extension_exists(dirname,prefix,extension): - raise Exception( 'Unable to find files with prefix "{0}" and extension "{1}" in {2}. Is this a valid {3}?'.format( prefix, extension, dirname, index_name ) ) + # If the index is not a prefix, + # the index file is taken to be the path itself. + index_is_a_prefix = self.index_properties.get('prefix', True) + prefix_strip_extension = self.index_properties.get( + 'prefix_strip_extension', False) + if index_is_a_prefix: + if prefix_strip_extension: + prefix = str(self.index_path.with_suffix("").name) + else: + prefix = str(self.index_path.name) + for extension in index_extensions: + if not prefix_plus_extension_exists(self.index_path.parent, + prefix, extension): + raise FileNotFoundError( + 'Unable to find files with prefix \'{0}\' ' + 'and extension \'{1}\' in {2}. Is this a valid {3}?' 
+ .format( + prefix, + extension, + str(self.index_path.parent), + index_name)) + elif self.index_properties.get('folder') is not None: + for file in self.index_properties.get('folder'): + if not (self.index_path / Path(file)).exists(): + raise FileNotFoundError( + "A file named \'{0}\' was not found in \'{1}\'".format( + file, str(self.index_path))) + else: + if not self.index_path.exists(): + raise FileNotFoundError( + 'Unable to find path {0}.'.format(self.index_path)) + + @property + def data_manager_dict(self) -> dict: + data_table_entry = dict(value=self.value, dbkey=self.dbkey, + name=self.name, + path=str(self.index_path), + **self.extra_columns) + data_manager_dict = dict(data_tables=dict()) + data_manager_dict["data_tables"][ + self.data_table_name] = [data_table_entry] + return data_manager_dict + + @property + def data_manager_json(self) -> str: + return json.dumps(self.data_manager_dict) + + +def main(): + options = argument_parser().parse_args() + + if options.json_output_file.exists(): + raise FileExistsError( + "\'{0}\' already exists.".format(str(options.json_output_file))) + + if options.extra_columns is None: + extra_columns = dict() else: - if not os.path.exists(path): - raise Exception( 'Unable to find path {0}.'.format( path ) ) - - if os.path.exists(json_output_file): - params = json.loads( open( json_output_file ).read() ) - print "params", params - else: - params = {} + try: + extra_columns = yaml.safe_load(options.extra_columns) + except yaml.parser.ParserError as e: + raise yaml.parser.ParserError( + "Invalid yaml string for --extra_indexes. 
\nError {0}".format( + e)) - data_manager_dict = {} - data_table_entry = dict( value=value, dbkey=dbkey, name=name, path=path ) - _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) + index_properties_file = Path(__file__).parent / Path("indexes.yml") + data_table = DataTable(index_path=options.path, + data_table_name=options.data_table_name, + name=options.name, + value=options.value, + dbkey=options.dbkey, + indexes_properties_file=index_properties_file, + extra_columns=extra_columns) - #save info to json file - with open( json_output_file, 'wb' ) as output_file: - output_file.write( json.dumps( data_manager_dict ) ) - output_file.write( "\n" ) + # save info to json file + with options.json_output_file.open('w') as output_file: + output_file.write(data_table.data_manager_json) + if __name__ == "__main__": main()
--- a/data_manager_conf.xml Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager_conf.xml Tue Jul 03 10:29:48 2018 -0400 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <data_managers> - <data_manager tool_file="data_manager/data_manager_select_index_by_path.xml" id="data_manager_select_index_by_path" version="0.0.2"> + <data_manager tool_file="data_manager/data_manager_select_index_by_path.xml" id="data_manager_select_index_by_path" version="0.0.3"> <data_table name="all_fasta"> <output> <column name="value" /> @@ -105,6 +105,15 @@ <column name="path" /> </output> </data_table> + <data_table name="rnastar_index2"> + <output> + <column name="value" /> + <column name="dbkey" /> + <column name="name" /> + <column name="path" /> + <column name="with-gtf" /> + </output> + </data_table> </data_manager> </data_managers>
--- a/test.json Mon Sep 11 07:33:51 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -{"data_tables": {"all_fasta": [{"path": "test-data/EboVir3.fa", "dbkey": "EboVir3", "name": "EboVir3", "value": "EboVir3"}]}}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/rnastar_index2.loc.sample Tue Jul 03 10:29:48 2018 -0400 @@ -0,0 +1,23 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of rna-star indexed sequences data files. You will +#need to create these data files and then create a rnastar_index2.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The rnastar_index2.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> <with-gtf> +# +#The <with-gtf> column should be 1 or 0, indicating whether the index was made +#with an annotation (i.e., --sjdbGTFfile and --sjdbOverhang were used) or not, +#respectively. +# +#Note that STAR indices can become quite large. Consequently, it is only +#advisable to create indices with annotations if it's known ahead of time that +#(A) the annotations won't be frequently updated and (B) the read lengths used +#will also rarely vary. If either of these is not the case, it's advisable to +#create indices without annotations and then specify an annotation file and +#maximum read length (minus 1) when running STAR. +# +#hg19 hg19 hg19 full /mnt/galaxyIndices/genomes/hg19/rnastar 0 +#hg19Ensembl hg19Ensembl hg19 full with Ensembl annotation /mnt/galaxyIndices/genomes/hg19Ensembl/rnastar 1 +