changeset 0:d220209e47f4 draft

Upload First set of files.
author trinity_ctat
date Tue, 12 Dec 2017 14:51:18 -0500
parents
children e071b1d24f24
files data_manager/add_ctat_ref_lib.py data_manager/add_ctat_ref_lib.xml data_manager_conf.xml tool-data/ctat_genome_ref_libs.loc.sample tool_data_table_conf.xml.sample
diffstat 5 files changed, 186 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_ref_lib.py	Tue Dec 12 14:51:18 2017 -0500
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
+
+# Rewritten by H.E. Cicada Brokaw Dennis from source downloaded from the toolshed.
+# Eventually this should be modified to allow downloading of more than just the one library,
+# to let the user select what library/location to download, but that would require the
+# download tool to generate the list of libraries to download on the fly. Currently
+# we are only using the one library.
+# Users can create other ones locally and use this tool to add them if they don't want
+# to add them by hand.
+
+import argparse
+import os
+import tarfile
+import urllib
+
+from galaxy.util.json import from_json_string, to_json_string
+
+# The following was used by prior program to get input parameters from the json.
+# Just leaving here for reference.
+#def get_reference_id_name(params):
+#    genome_id = params['param_dict']['genome_id']
+#    genome_name = params['param_dict']['genome_name']
+#    return genome_id, genome_name
+#
+#def get_url(params):
+#    trained_url = params['param_dict']['trained_url']
+#    return trained_url
+
+def download_from_BroadInst(destination):
+    ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz'
+    # FIX - Check that the download directory is empty if it exists. Also, can we check if there is enough space on the device as well?
+    # FIX - Also we want to make sure that destination is absolute fully specified path.
+    os.mkdir(destination)
+    full_filepath = os.path.join(destination, 'GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz')
+
+    #Download ref: https://dzone.com/articles/how-download-file-python
+    #f = urllib2.urlopen(ctat_resource_lib)
+    #data = f.read()
+    #with open(filepath, 'wb') as code:
+    #    code.write(data)
+
+    urllib.urlretrieve(url=ctat_resource_lib, filename=full_filepath)
+    # Put the following into a try statement, so that if there is a failure something can be printed about it before reraising exception.
+    tarfile.open(full_filepath, mode='r:*').extractall()
+    # FIX - There is additional processing that needs to happen for gmap-fusion to work.
+    # Get the root filename of the extracted file. 
+    # That directory is the actual destination that needs to be set as the ctat_genome_resource_library
+
+def main():
+    #Parse Command Line
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--download', action="store_true", \
+        help='Do not use if you already have the CTAT Resource Library that this program downloads.')
+    parser.add_argument('-g', '--genome_name', default="GRCh38_gencode_v26", \
+        help='Is used as the selector text of the entry in the data table.')
+    parser.add_argument('-p', '--destination_path', \
+        help='Full path of the CTAT Resource Library location or destination.')
+    parser.add_argument('-o', '--output_filename', \
+        help='Name of the output file, where the json dictionary will be written.')
+    args = parser.parse_args()
+
+    # All of the input parameters are written by default to the output file prior to
+    # this program being called.
+    # But I do not get input values from the json file, but rather from command line.
+    # Just leaving the following code as a comment, in case it might be useful to someone later.
+    # params = from_json_string(open(filename).read())
+    # target_directory = params['output_data'][0]['extra_files_path']
+    # os.mkdir(target_directory)
+
+    if args.download:
+        ctat_genome_resource_lib_path = download_from_BroadInst(destination=args.destination_path)
+    else:
+        # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
+        ctat_genome_resource_lib_path = args.destination_path
+
+    if (args.genome_name is None) or (args.genome_name == ""):
+        genome_name = "GRCh38_gencode_v26"
+    else:
+        genome_name = args.genome_name
+
+    data_manager_dict = {}
+    data_manager_dict['data_tables'] = {}
+    data_manager_dict['data_tables']['ctat_genome_ref_libs'] = []
+    data_table_entry = dict(value="CTAT_RESOURCE_LIB", name=genome_name, path=ctat_genome_resource_lib_path)
+    data_manager_dict['data_tables']['ctat_genome_ref_libs'].append(data_table_entry)
+
+    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
+    # which then puts it into the correct .loc file (I think).
+    open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
+
+# Run the data manager only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_ref_lib.xml	Tue Dec 12 14:51:18 2017 -0500
@@ -0,0 +1,35 @@
+<tool id="ctat_genome_ref_lib_data_manager" 
+    name="CTAT Genome Reference Library Data Manager" 
+    version="1.0.0" tool_type="manage_data">
+    <description>Retrieve, and/or Specify the location of, a CTAT Genome Reference Library. 
+    </description>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <!-- The script is addressed through $__tool_directory__ because the command does not
+         run from the directory that holds this file. The option names below are the ones
+         defined by the argument parser in add_ctat_ref_lib.py. -->
+    <command detect_errors="default">
+        <![CDATA[
+        python '$__tool_directory__/add_ctat_ref_lib.py' ${download}
+            --genome_name "${genome_name}"
+            --destination_path "${destination}" 
+            -o "${out_file}"
+        ]]>
+    </command>
+    <inputs>
+        <param name="download" type="boolean" checked="false"
+             truevalue="--download" falsevalue="" label="Need to Download? (yes/no)" />
+        <param name="genome_name" type="text" label="Reference Genome name" />
+        <param name="destination" type="text" label="Local Destination (full path)" />
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <help>
+        Retrieve, and/or specify the location of, a CTAT Genome Reference Library.
+        When download is true, the file retrieved and processed is https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz.
+        Specify the Full Path of the location where the CTAT Reference Library should be placed.
+        You will need approximately 30GB of space for this library.
+        If you already have the library, specify the full path of the location where it exists and leave the download box unchecked.
+        The Reference Genome name may be left empty if downloading. The name will be used as the selector text of the entry in the data table.
+        For more information on CTAT Genome Reference Libraries, see <a href="https://github.com/FusionFilter/FusionFilter/wiki">FusionFilter</a>
+    </help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue Dec 12 14:51:18 2017 -0500
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/add_ctat_ref_lib.xml" id="ctat_genome_ref_lib_data_manager"> 
+        <data_table name="ctat_genome_ref_libs">
+            <output>
+                <column name="value" />
+                    <!-- value is used to uniquely identify this entry in the table.
+                    For now id is also the name of the environment variable that is used within tools to
+                    access a CTAT Resource Library. 
+                    FIX - Need to get rid of that and use command line params...
+                    -->
+                <column name="name" />
+                    <!-- name is used as the selector in the pull down lists for items in this table.
+                    -->
+                <!-- The path column is self-closing: its only would-be children (the move and
+                value_translation elements below) are commented out, as is their closing tag.
+                -->
+                <column name="path" />
+                    <!-- path is the absolute path of the corresponding CTAT Genome Reference Library.
+                    -->
+                <!-- <column name="path" output_ref="out_file"> -->
+                    <!-- It is typical to move the data file, but because our tool gets the destination
+                    location from the user, we do not want to move the data from that location.
+                    The full path of the CTAT Resource library is returned in location. 
+                    So no need to change the value either.
+                    -->
+                    <!-- <move type="file" relativize_symlinks="False"> -->
+                        <!--<source>${path}</source> -->
+                        <!--<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ctat_genome_lib_build_dir</target> -->
+                    <!--</move> -->
+                    <!--
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/ctat_genome_lib_build_dir
+                    </value_translation>
+                    -->
+                    <!-- The location returned by the tool should already be an absolute path.
+                    <value_translation type="function">abspath</value_translation>
+                    -->
+                <!--</column> -->
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ctat_genome_ref_libs.loc.sample	Tue Dec 12 14:51:18 2017 -0500
@@ -0,0 +1,12 @@
+# This file lists the locations of CTAT Genome Reference Libraries
+# Usually there will only be one library, but it is conceivable
+# that there could be multiple libraries.
+# This file format is as follows
+# (white space characters are TAB characters):
+#
+#<unique_id>    <display_name>  <file_path>
+#
+#ctat_genome_ref_libs.loc could look like:
+#
+#CTAT_RESOURCE_LIB  GRCh38_gencode_v26   /ctat/genome/resource/lib/path
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Dec 12 14:51:18 2017 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <!-- Declares the ctat_genome_ref_libs table: three columns (value, name, path)
+         backed by tool-data/ctat_genome_ref_libs.loc, with '#' as the comment character. -->
+    <table name="ctat_genome_ref_libs" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ctat_genome_ref_libs.loc" />
+    </table>
+</tables>