Mercurial > repos > trinity_ctat > ctat_genome_ref_lib_data_manager_test2
changeset 0:d220209e47f4 draft
Upload First set of files.
author | trinity_ctat |
---|---|
date | Tue, 12 Dec 2017 14:51:18 -0500 |
parents | |
children | e071b1d24f24 |
files | data_manager/add_ctat_ref_lib.py data_manager/add_ctat_ref_lib.xml data_manager_conf.xml tool-data/ctat_genome_ref_libs.loc.sample tool_data_table_conf.xml.sample |
diffstat | 5 files changed, 186 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# File: data_manager/add_ctat_ref_lib.py
# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/

# Rewritten by H.E. Cicada Brokaw Dennis from source downloaded from the toolshed.
# Eventually this should be modified to allow downloading of more than just the one library,
# to let the user select what library/location to download, but that would require the
# download tool to generate the list of libraries to download on the fly. Currently
# we are only using the one library.
# Users can create other ones locally and use this tool to add them if they don't want
# to add them by hand.

"""Galaxy data manager script that registers a CTAT Genome Reference Library.

The script optionally downloads and unpacks the library, then writes a JSON
dictionary describing one ``ctat_genome_ref_libs`` data table entry to the
output file named on the command line.

All inputs are taken from the command line; the parameter JSON that Galaxy
pre-writes into the output file is not read.
"""

import argparse
import json
import os
import tarfile

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2.7, the version requested by the tool XML
    from urllib import urlretrieve

CTAT_RESOURCE_LIB_URL = (
    'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
    'GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz'
)
DEFAULT_GENOME_NAME = "GRCh38_gencode_v26"
DATA_TABLE_NAME = 'ctat_genome_ref_libs'
DATA_TABLE_VALUE = "CTAT_RESOURCE_LIB"
_FALLBACK_ARCHIVE_NAME = 'ctat_genome_ref_lib.tar.gz'


def _safe_members(archive, destination):
    """Yield the members of *archive*, rejecting names that escape *destination*."""
    root = os.path.realpath(destination)
    root_prefix = os.path.join(root, '')
    for member in archive:
        target = os.path.realpath(os.path.join(root, member.name))
        if target != root and not target.startswith(root_prefix):
            raise ValueError(
                "Refusing to extract %r outside of %r" % (member.name, destination))
        yield member


def _extract_archive(archive_path, destination):
    """Unpack *archive_path* into *destination*; return its sorted top-level names."""
    extract_kwargs = {}
    if hasattr(tarfile, 'data_filter'):
        # Newer Pythons provide extraction filters; use the restrictive one.
        extract_kwargs['filter'] = 'data'
    with tarfile.open(archive_path, mode='r:*') as archive:
        archive.extractall(path=destination,
                           members=_safe_members(archive, destination),
                           **extract_kwargs)
        # The archive has been fully iterated, so this uses the cached listing.
        names = archive.getnames()
    top_level = set()
    for name in names:
        first_part = os.path.normpath(name).split(os.sep)[0]
        if first_part not in ('', '.'):
            top_level.add(first_part)
    return sorted(top_level)


def download_from_BroadInst(destination, url=CTAT_RESOURCE_LIB_URL):
    """Download the CTAT resource library archive and unpack it.

    Args:
        destination: Directory that receives the archive and its extracted
            contents. It is created if missing; an existing directory is
            accepted. Relative paths are made absolute.
        url: Location of the ``tar`` archive to fetch. Defaults to the
            GRCh38 gencode v26 plug-n-play library.

    Returns:
        The absolute path of the extracted library: the archive's single
        top-level directory inside *destination*, or *destination* itself when
        the archive does not have exactly one top-level entry.

    Raises:
        ValueError: If *destination* is empty or an archive member would be
            written outside of *destination*.
    """
    if not destination:
        raise ValueError("A destination directory is required to download the library.")
    destination = os.path.abspath(destination)
    if not os.path.isdir(destination):
        os.makedirs(destination)
    archive_name = url.rstrip('/').rsplit('/', 1)[-1] or _FALLBACK_ARCHIVE_NAME
    full_filepath = os.path.join(destination, archive_name)

    urlretrieve(url, full_filepath)
    # Extract next to the archive rather than into the current working directory.
    top_level = _extract_archive(full_filepath, destination)
    # FIX - There is additional processing that needs to happen for gmap-fusion to work.
    if len(top_level) == 1:
        return os.path.join(destination, top_level[0])
    return destination


def main(argv=None):
    """Parse the command line and write the data manager JSON file.

    Args:
        argv: Optional list of command-line arguments; ``sys.argv[1:]`` is
            used when omitted.

    Exits through argparse (``SystemExit``) when ``--destination_path`` or
    ``--output_filename`` is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--download', action="store_true",
        help='Do not use if you already have the CTAT Resource Library that this program downloads.')
    # --ref_genome is accepted as an alias because the tool XML has used that spelling.
    parser.add_argument('-g', '--genome_name', '--ref_genome', dest='genome_name',
        default=DEFAULT_GENOME_NAME,
        help='Is used as the selector text of the entry in the data table.')
    parser.add_argument('-p', '--destination_path', required=True,
        help='Full path of the CTAT Resource Library location or destination.')
    parser.add_argument('-o', '--output_filename', required=True,
        help='Name of the output file, where the json dictionary will be written.')
    args = parser.parse_args(argv)

    if args.download:
        ctat_genome_resource_lib_path = download_from_BroadInst(destination=args.destination_path)
    else:
        # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
        ctat_genome_resource_lib_path = os.path.abspath(args.destination_path)

    # An empty name (the tool form allows it) falls back to the default.
    genome_name = args.genome_name or DEFAULT_GENOME_NAME

    data_table_entry = dict(value=DATA_TABLE_VALUE, name=genome_name,
                            path=ctat_genome_resource_lib_path)
    data_manager_dict = {'data_tables': {DATA_TABLE_NAME: [data_table_entry]}}

    # Save info to json file. This is used to transfer data from the DataManager tool,
    # to the data manager, which then puts it into the correct .loc file (I think).
    with open(args.output_filename, 'w') as handle:
        handle.write(json.dumps(data_manager_dict))


if __name__ == "__main__":
    main()
<!-- data_manager/add_ctat_ref_lib.xml -->
<tool id="ctat_genome_ref_lib_data_manager"
    name="CTAT Genome Reference Library Data Manager"
    version="1.0.0" tool_type="manage_data">
    <description>Retrieve, and/or Specify the location of, a CTAT Genome Reference Library.
    </description>
    <requirements>
        <requirement type="package" version="2.7">python</requirement>
    </requirements>
    <command detect_errors="default">
        <!-- The script is addressed through the tool directory, and the genome
             name is passed with the option name the script defines (genome_name).
        -->
        <![CDATA[
        python '$__tool_directory__/add_ctat_ref_lib.py' ${download}
            --genome_name "${genome_name}"
            --destination_path "${destination}"
            -o "${out_file}"
        ]]>
    </command>
    <inputs>
        <param name="download" type="boolean" checked="false"
            truevalue="--download" falsevalue="" label="Need to Download? (yes/no)" />
        <param name="genome_name" type="text" label="Reference Genome name" />
        <param name="destination" type="text" label="Local Destination (full path)" />
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <help>
        Retrieve, and/or specify the location of, a CTAT Genome Reference Library.
        When download is true, the file retrieved and processed is https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz.
        Specify the Full Path of the location where the CTAT Reference Library should be placed.
        You will need approximately 30GB of space for this library.
        If you already have the library, specify the full path of the location where it exists and leave the download box unchecked.
        The Reference Genome name may be left empty if downloading. The name will be used as the selector text of the entry in the data table.
        For more information on CTAT Genome Reference Libraries, see the FusionFilter wiki: https://github.com/FusionFilter/FusionFilter/wiki
    </help>
</tool>
<?xml version="1.0"?>
<!-- data_manager_conf.xml -->
<data_managers>
    <data_manager tool_file="data_manager/add_ctat_ref_lib.xml" id="ctat_genome_ref_lib_data_manager">
        <data_table name="ctat_genome_ref_libs">
            <output>
                <column name="value" />
                <!-- value is used to uniquely identify this entry in the table.
                     For now id is also the name of the environment variable that is used within tools to
                     access a CTAT Resource Library.
                     FIX - Need to get rid of that and use command line params...
                -->
                <column name="name" />
                <!-- name is used as the selector in the pull down lists for items in this table.
                -->
                <column name="path">
                    <!-- path is the absolute path of the corresponding CTAT Genome Reference Library.
                    -->
                    <!-- <column name="path" output_ref="out_file"> -->
                    <!-- It is typical to move the data file, but because our tool gets the destination
                         location from the user, we do not want to move the data from that location.
                         The full path of the CTAT Resource library is returned in location.
                         So no need to change the value either.
                    -->
                    <!-- <move type="file" relativize_symlinks="False"> -->
                    <!--<source>${path}</source> -->
                    <!--<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ctat_genome_lib_build_dir</target> -->
                    <!--</move> -->
                    <!--
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/ctat_genome_lib_build_dir
                    </value_translation>
                    -->
                    <!-- The location returned by the tool should already be an absolute path.
                    <value_translation type="function">abspath</value_translation>
                    -->
                <!-- The closing tag below must stay active: with it commented out the
                     path column is never closed and the document is not well-formed.
                -->
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ctat_genome_ref_libs.loc.sample Tue Dec 12 14:51:18 2017 -0500 @@ -0,0 +1,12 @@ +# This file lists the locations of CTAT Genome Reference Libraries +# Usually there will only be one library, but it is conceivable +# that there could be multiple libraries. +# This file format is as follows +# (white space characters are TAB characters): +# +#<unique_id> <display_name> <file_path> +# +#ctat_genome_ref_libs.loc could look like: +# +#CTAT_RESOURCE_LIB GRCh38_gencode_v26 /ctat/genome/resource/lib/path +#
<!-- tool_data_table_conf.xml.sample -->
<!-- Declares the ctat_genome_ref_libs data table: three columns (value, name, path)
     read from tool-data/ctat_genome_ref_libs.loc, with '#' as the comment character
     and duplicate entries disallowed.
-->
<tables>
    <table name="ctat_genome_ref_libs" comment_char="#" allow_duplicate_entries="False">
        <columns>value, name, path</columns>
        <file path="tool-data/ctat_genome_ref_libs.loc" />
    </table>
</tables>