Mercurial > repos > rhpvorderman > data_manager_select_index_by_path
changeset 13:0a1afc109ad9 draft
planemo upload for repository https://github.com/LUMC/lumc-galaxy-tools/tree/master/data_manager_select_index_by_path commit 72334942019bfd598086f39ba93d4b4fce3cda19
author | rhpvorderman |
---|---|
date | Tue, 03 Jul 2018 10:29:48 -0400 |
parents | 680110ffdcfe |
children | 9ac9089b1914 |
files | README data_manager/__pycache__/path_name_value_key_manager.cpython-35-PYTEST.pyc data_manager/__pycache__/path_name_value_key_manager.cpython-35.pyc data_manager/__pycache__/test_path_name_value_key_manager.cpython-35-PYTEST.pyc data_manager/data_manager_select_index_by_path.xml data_manager/indexes.yml data_manager/path_name_value_key_manager.py data_manager_conf.xml test.json tool-data/rnastar_index2.loc.sample |
diffstat | 10 files changed, 319 insertions(+), 107 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Jul 03 10:29:48 2018 -0400 @@ -0,0 +1,2 @@ +This is a fork of the data_manager_all_fasta_by_path (https://github.com/Christian-B/galaxy_shedtools/tree/master/all_fasta_by_path) data manager by Christian-B (https://github.com/Christian-B). +The all_fasta_by_path data manager was forked on 2017-09-07 from Christian-B's galaxy_shedtools (https://github.com/Christian-B/galaxy_shedtools) repository at commit d9f5343.
Binary file data_manager/__pycache__/test_path_name_value_key_manager.cpython-35-PYTEST.pyc has changed
--- a/data_manager/data_manager_select_index_by_path.xml Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager/data_manager_select_index_by_path.xml Tue Jul 03 10:29:48 2018 -0400 @@ -1,13 +1,20 @@ -<tool id="data_manager_select_index_by_path" name="Select index by path manager" tool_type="manage_data" version="0.0.2"> +<tool id="data_manager_select_index_by_path" name="Select index by path manager" tool_type="manage_data" version="0.0.3"> + <requirements> + <!-- Away with python 2! --> + <requirement type="package" version="3">python</requirement> + </requirements> <description>path inputer</description> - <command interpreter="python"> + <command detect_errors="exit_code" interpreter="python"> path_name_value_key_manager.py --value "${value}" --dbkey "${dbkey}" --name "${name}" --path "${path}" --data_table_name "${data_table}" - --json_output_file "${json_output_file}" + --json_output_file "${json_output_file} + #if $data_table == "rnastar_index2" + --extra-columns {'with-gtf': '$data_table.with_gtf'} + #end if </command> <inputs> <param name="value" type="text" value="" label="value field for the entry. Defaults to name if left blank." /> @@ -15,25 +22,31 @@ <param name="name" type="text" value="" label="name field for the entry. Defaults to the file name from path if left blank." 
/> <param name="path" type="text" value="" label="path field for the entry" /> <param name="data_table" type="select" value="" label="data table for the index"> - <option value='all_fasta'>all_fasta</option> - <option value='bowtie2_indexes'>bowtie2_indexes</option> - <option value='bowtie_indexes'>bowtie_indexes</option> - <option value='bowtie_indexes_color'>bowtie_indexes_color</option> - <option value='bwa_mem_indexes'>bwa_mem_indexes</option> - <option value='bwameth_indexes'>bwameth_indexes</option> - <option value='fasta_indexes'>fasta_indexes</option> - <option value='gatk_picard_indexes'>gatk_picard_indexes</option> - <option value='gene_transfer'>gene_transfer</option> - <option value='hisat2_indexes'>hisat2_indexes</option> - <option value='kallisto_indexes'>kallisto_indexes</option> - <option value='picard_indexes'>picard_indexes</option> - <option value='tophat2_indexes'>tophat2_indexes</option> + <option value='all_fasta'>all_fasta</option> + <option value='bowtie2_indexes'>bowtie2_indexes</option> + <option value='bowtie_indexes'>bowtie_indexes</option> + <option value='bowtie_indexes_color'>bowtie_indexes_color</option> + <option value='bwa_mem_indexes'>bwa_mem_indexes</option> + <option value='bwameth_indexes'>bwameth_indexes</option> + <option value='fasta_indexes'>fasta_indexes</option> + <option value='gatk_picard_indexes'>gatk_picard_indexes</option> + <option value='gene_transfer'>gene_transfer</option> + <option value='hisat2_indexes'>hisat2_indexes</option> + <option value='kallisto_indexes'>kallisto_indexes</option> + <option value='picard_indexes'>picard_indexes</option> + <option value='tophat2_indexes'>tophat2_indexes</option> + <option value="rnastar_index2">rnastar_index2</option> + <when value="rnastar_index2"> + <param name="with_gtf" type="select" value="" label="Index with embedded gtf?"> + <option value="0">No</option> + <option value="1">Yes</option> + </param> + </when> </param> </inputs> <outputs> <data name="json_output_file" 
format="data_manager_json"/> </outputs> - <help> Adds a server path to the selected data table.
--- a/data_manager/indexes.yml Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager/indexes.yml Tue Jul 03 10:29:48 2018 -0400 @@ -1,20 +1,63 @@ +--- +# This is a file containing information about all the indexes. +# +# Top keys are table names as used in Galaxy. +# These names can be viewed in the 'local data' part of the admin menu +# +# Keys for each table +# name: +# (STRING) The name of the index. +# This is used for error reporting in the program +# +# prefix: +# (BOOLEAN) whether the index is a prefix. For example +# for bwa_mem-indexes, the index path is 'reference.fa'. +# This is a prefix because all the reference files are: +# 'reference.fa.amb', 'reference.fa.ann' etc. +# +# prefix_strip_extension: +# (BOOLEAN) whether the prefix should be stripped +# of its extensions. Ie from 'reference.fa' to +# 'reference'. For a picard index also a 'reference.dict' +# should be present, so the prefix needs to be stripped of +# its extension to look for the index files. +# +# extensions: +# (LIST[STRING]) a list of strings with the extensions: +# for example: +# extensions: +# - .fai +# +# folder: +# (LIST[STRING]) Use this when the index is not a prefix but a folder +# the program will check if all the files in the list are present. +# If they are not, an exception will follow. +# +# extra_columns: +# (LIST[STRING]) Usual indexes have 4 columns in the data table: path, name, +# value, dbkey. But some indexes have additional columns. rnastar_index2 +# needs a 'with-gtf' column for instance. Add these columns to the list to +# make sure their presence, or non-presence is checked. 
+ all_fasta: name: fasta file - extensions: - - .fa - no_prefix: True + prefix: false + bowtie2_indexes: name: bowtie2 index extensions: - .bt2 + bowtie_indexes: name: bowtie index extensions: - .ebwt + bowtie_indexes_color: name: bowtie color index extensions: - .ebwt + bwa_mem_indexes: name: bwa mem index extensions: @@ -23,27 +66,53 @@ - .bwt - .pac - .sa + bwameth_indexes: name: bwa_meth_index fasta_indexes: name: fasta index extensions: - .fai + gatk_picard_index: name: picard index for GATK + gene_transfer: name: Gene Transfer File extensions: - .gtf + hisat2_indexes: name: hisat2 index extensions: - .ht2 + kallisto_indexes: name: kallisto index - no_prefix: True + prefix: false + picard_indexes: name: picard index + prefix_strip_extension: true + extensions: + - ".fa" + - ".dict" + +rnastar_index2: + name: "Star index" + prefix: false + extra_columns: + - with-gtf + folder: + - chrLength.txt + - chrNameLength.txt + - chrStart.txt + - chrName.txt + - Genome + - SA + - SAindex + - genomeParameters.txt + tophat2_indexes: name: tophat2 index extensions:
--- a/data_manager/path_name_value_key_manager.py Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager/path_name_value_key_manager.py Tue Jul 03 10:29:48 2018 -0400 @@ -1,104 +1,201 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +"""Script to create data manager jsons""" +import argparse import json -import argparse -import os +from pathlib import Path + import yaml -def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): - data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) - data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) - data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) - return data_manager_dict - -def check_param(name, value, default=None, check_tab=True): - if value in [ None, '', '?' ]: - if default: - print "Using {0} for {1} as no value provided".format( default, name ) - value = default - else: - raise Exception( '{0} is not a valid {1}. You must specify a valid {1}.'.format( value, name ) ) - if check_tab and "\t" in value: - raise Exception( '{0} is not a valid {1}. It may not contain a tab because these are used as seperators by galaxy .'.format( value, name ) ) - return value +def argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--value', type=str, help='value') + parser.add_argument('--dbkey', type=str, help='dbkey') + parser.add_argument('--name', type=str, help='name') + parser.add_argument('--path', type=Path, help='path', + required=True) + parser.add_argument('--data_table_name', action='store', type=str, + help='Name of the data table', + required=True) + parser.add_argument('--json_output_file', action='store', type=Path, + help='Json output file', + required=True) + parser.add_argument("--extra-columns", type=str, + help='Yaml formatted string with extra columns ' + 'and their values. 
For example ' + '\'{"with-gtf":"0"}\' for STAR indexes') + return parser -def prefix_exists(directory, prefix): - '''checks if files exist with prefix in a directory. Returns Boolean''' - matched_files = [] - directory_files = os.listdir(directory) - for directory_file in directory_files: - if directory_file.startswith(prefix): - matched_files.append(directory_file) - # Empty list should return False - return bool(matched_files) + +def check_tab(name: str, value: str): + if '\t' in value: + raise ValueError( + '\'{0}\' is not a valid \'{1}\'. It may not contain a tab because ' + 'these are used as seperators by galaxy .'.format( + value, name)) -def prefix_plus_extension_exists(directory, prefix, extension): - '''checks if files exist with prefix in a directory. Returns Boolean''' - matched_files = [] - directory_files = os.listdir(directory) - for directory_file in directory_files: - if directory_file.startswith(prefix) and directory_file.endswith(extension): - matched_files.append(directory_file) + +def prefix_plus_extension_exists(directory: Path, prefix: str, extension: str): + """checks if files exist with prefix in a directory. Returns Boolean""" + matched_files = [directory_file for directory_file in directory.iterdir() + if + directory_file.name.startswith( + prefix) and directory_file.suffix == extension] # Empty list should return False return bool(matched_files) -def main(): + +class DataTable(object): - #value = "test_value" - #name = "test_name" - #print '{0} other {1} more{0}'.format(value, name ) - #print '{0} is not a valid {1}. 
It may not contain a tab.'.format( value, name ) + def __init__(self, + index_path: Path, + data_table_name: str, + indexes_properties_file: Path, + name: str = None, + dbkey: str = None, + value: str = None, + extra_columns: dict = None + ): + self.index_path = index_path + self.data_table_name = data_table_name + self.name = name if name else str(self.index_path.with_suffix( + '').name) + self.value = value if value is not None else self.name + self.dbkey = dbkey if dbkey is not None else self.value + self.extra_columns = extra_columns if extra_columns is not None else {} + self.indexes_properties_file = indexes_properties_file + + self.check_params() + + self.index_properties = self.get_index_properties() + + self.check_index_file_presence() + + def check_params(self): + + check_tab('name', self.name) + check_tab('index_path', str(self.index_path.absolute().name)) + check_tab('value', self.value) + check_tab('dbkey', self.dbkey) + self.check_extra_columns() - #Parse Command Line - parser = argparse.ArgumentParser() - parser.add_argument( '--value', action='store', type=str, default=None, help='value' ) - parser.add_argument( '--dbkey', action='store', type=str, default=None, help='dbkey' ) - parser.add_argument( '--name', action='store', type=str, default=None, help='name' ) - parser.add_argument( '--path', action='store', type=str, default=None, help='path' ) - parser.add_argument( '--data_table_name', action='store', type=str, default=None, help='path' ) - parser.add_argument( '--json_output_file', action='store', type=str, default=None, help='path' ) - options = parser.parse_args() + def check_extra_columns(self): + index_properties = self.get_index_properties() + index_extra_columns = set(index_properties.get("extra_columns", [])) + given_extra_columns = self.extra_columns.keys() + if index_extra_columns != given_extra_columns: + if len(index_extra_columns) > 0: + raise ValueError( + "Values for the following columns should be " + "supplied: {0}.".format( + 
str(index_extra_columns).strip("{}"))) + if len(index_extra_columns) == 0: + raise ValueError( + "The table \'{0}\' does not have extra columns".format( + self.data_table_name)) + for key, value in self.extra_columns.items(): + check_tab(key, value) - path = check_param("path", options.path) - basename = os.path.basename(path) - filename = os.path.splitext(basename)[0] - name = check_param("name", options.name, default=filename) - value = check_param("value", options.value, default=name) - dbkey = check_param("dbkey", options.dbkey, default=value) - data_table_name = check_param("data_table_name", options.data_table_name) - json_output_file = check_param("json_output_file", options.json_output_file, check_tab=False) + def get_index_properties(self) -> dict: + with self.indexes_properties_file.open('r') as properties_file: + indexes = yaml.safe_load(properties_file) + index_properties = indexes.get(self.data_table_name) + if index_properties is None: + raise ValueError( + "\'{0}\' not a supported table name".format( + self.data_table_name)) + return index_properties + + def check_index_file_presence(self): + index_name = self.index_properties.get( + 'name', + '[Index name not found. Please report to developers]') + index_extensions = self.index_properties.get('extensions', ['']) + + # Sometimes an index path is a prefix. + # For example, with BWA. 'reference.fa' is the index. + # But the actual index files are + # 'reference.fa.amb', 'reference.fa.ann' etc. 
- # Check if file or prefix exists - indexes = yaml.load(file(os.path.join(os.path.dirname(__file__), 'indexes.yml'))) - index_dict = indexes.get(data_table_name,{}) - index_name = index_dict.get('name','index') - index_extensions = index_dict.get('extensions', ['']) - no_prefix = index_dict.get('no_prefix', False) - if not no_prefix: - dirname = os.path.dirname(path) - prefix = basename - for extension in index_extensions: - if not prefix_plus_extension_exists(dirname,prefix,extension): - raise Exception( 'Unable to find files with prefix "{0}" and extension "{1}" in {2}. Is this a valid {3}?'.format( prefix, extension, dirname, index_name ) ) + # If the index is not a prefix, + # the index file is taken to be the path itself. + index_is_a_prefix = self.index_properties.get('prefix', True) + prefix_strip_extension = self.index_properties.get( + 'prefix_strip_extension', False) + if index_is_a_prefix: + if prefix_strip_extension: + prefix = str(self.index_path.with_suffix("").name) + else: + prefix = str(self.index_path.name) + for extension in index_extensions: + if not prefix_plus_extension_exists(self.index_path.parent, + prefix, extension): + raise FileNotFoundError( + 'Unable to find files with prefix \'{0}\' ' + 'and extension \'{1}\' in {2}. Is this a valid {3}?' 
+ .format( + prefix, + extension, + str(self.index_path.parent), + index_name)) + elif self.index_properties.get('folder') is not None: + for file in self.index_properties.get('folder'): + if not (self.index_path / Path(file)).exists(): + raise FileNotFoundError( + "A file named \'{0}\' was not found in \'{1}\'".format( + file, str(self.index_path))) + else: + if not self.index_path.exists(): + raise FileNotFoundError( + 'Unable to find path {0}.'.format(self.index_path)) + + @property + def data_manager_dict(self) -> dict: + data_table_entry = dict(value=self.value, dbkey=self.dbkey, + name=self.name, + path=str(self.index_path), + **self.extra_columns) + data_manager_dict = dict(data_tables=dict()) + data_manager_dict["data_tables"][ + self.data_table_name] = [data_table_entry] + return data_manager_dict + + @property + def data_manager_json(self) -> str: + return json.dumps(self.data_manager_dict) + + +def main(): + options = argument_parser().parse_args() + + if options.json_output_file.exists(): + raise FileExistsError( + "\'{0}\' already exists.".format(str(options.json_output_file))) + + if options.extra_columns is None: + extra_columns = dict() else: - if not os.path.exists(path): - raise Exception( 'Unable to find path {0}.'.format( path ) ) - - if os.path.exists(json_output_file): - params = json.loads( open( json_output_file ).read() ) - print "params", params - else: - params = {} + try: + extra_columns = yaml.safe_load(options.extra_columns) + except yaml.parser.ParserError as e: + raise yaml.parser.ParserError( + "Invalid yaml string for --extra_indexes. 
\nError {0}".format( + e)) - data_manager_dict = {} - data_table_entry = dict( value=value, dbkey=dbkey, name=name, path=path ) - _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) + index_properties_file = Path(__file__).parent / Path("indexes.yml") + data_table = DataTable(index_path=options.path, + data_table_name=options.data_table_name, + name=options.name, + value=options.value, + dbkey=options.dbkey, + indexes_properties_file=index_properties_file, + extra_columns=extra_columns) - #save info to json file - with open( json_output_file, 'wb' ) as output_file: - output_file.write( json.dumps( data_manager_dict ) ) - output_file.write( "\n" ) + # save info to json file + with options.json_output_file.open('w') as output_file: + output_file.write(data_table.data_manager_json) + if __name__ == "__main__": main()
--- a/data_manager_conf.xml Mon Sep 11 07:33:51 2017 -0400 +++ b/data_manager_conf.xml Tue Jul 03 10:29:48 2018 -0400 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <data_managers> - <data_manager tool_file="data_manager/data_manager_select_index_by_path.xml" id="data_manager_select_index_by_path" version="0.0.2"> + <data_manager tool_file="data_manager/data_manager_select_index_by_path.xml" id="data_manager_select_index_by_path" version="0.0.3"> <data_table name="all_fasta"> <output> <column name="value" /> @@ -105,6 +105,15 @@ <column name="path" /> </output> </data_table> + <data_table name="rnastar_index2"> + <output> + <column name="value" /> + <column name="dbkey" /> + <column name="name" /> + <column name="path" /> + <column name="with-gtf" /> + </output> + </data_table> </data_manager> </data_managers>
--- a/test.json Mon Sep 11 07:33:51 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -{"data_tables": {"all_fasta": [{"path": "test-data/EboVir3.fa", "dbkey": "EboVir3", "name": "EboVir3", "value": "EboVir3"}]}}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/rnastar_index2.loc.sample Tue Jul 03 10:29:48 2018 -0400 @@ -0,0 +1,23 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of rna-star indexed sequences data files. You will +#need to create these data files and then create a rnastar_index2.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The rnastar_index2.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> <with-gtf> +# +#The <with-gtf> column should be 1 or 0, indicating whether the index was made +#with an annotation (i.e., --sjdbGTFfile and --sjdbOverhang were used) or not, +#respectively. +# +#Note that STAR indices can become quite large. Consequently, it is only +#advisable to create indices with annotations if it's known ahead of time that +#(A) the annotations won't be frequently updated and (B) the read lengths used +#will also rarely vary. If either of these is not the case, it's advisable to +#create indices without annotations and then specify an annotation file and +#maximum read length (minus 1) when running STAR. +# +#hg19 hg19 hg19 full /mnt/galaxyIndices/genomes/hg19/rnastar 0 +#hg19Ensembl hg19Ensembl hg19 full with Ensembl annotation /mnt/galaxyIndices/genomes/hg19Ensembl/rnastar 1 +