annotate data_manager/add_ctat_ref_lib.py @ 14:3bb91cebec5c draft

Uploaded
author trinity_ctat
date Fri, 15 Dec 2017 15:53:28 -0500
parents d220209e47f4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
3
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
4 # Rewritten by H.E. Cicada Brokaw Dennis from source downloaded from the toolshed.
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
5 # Eventually this should be modified to allow downloading of more than just the one library,
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
6 # to let the user select what library/location to download, but that would require the
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
7 # download tool to generate the list of libraries to download on the fly. Currently
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
8 # we are only using the one library.
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
9 # Users can create other ones locally and use this tool to add them if they don't want
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
10 # to add them by hand.
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
11
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
12 import argparse
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
13 import os
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
14 import tarfile
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
15 import urllib
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
16
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
17 from galaxy.util.json import from_json_string, to_json_string
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
18
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
19 # The following was used by prior program to get input parameters from the json.
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
20 # Just leaving here for reference.
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
21 #def get_reference_id_name(params):
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
22 # genome_id = params['param_dict']['genome_id']
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
23 # genome_name = params['param_dict']['genome_name']
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
24 # return genome_id, genome_name
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
25 #
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
26 #def get_url(params):
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
27 # trained_url = params['param_dict']['trained_url']
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
28 # return trained_url
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
29
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
def download_from_BroadInst(destination, url=None):
    """Download a CTAT Resource Library archive and unpack it into *destination*.

    Args:
        destination: Directory that receives the downloaded archive and its
            unpacked content. It is created when missing; when it already
            exists it must be empty. Relative paths are made absolute.
        url: Location of the tar archive to fetch. Defaults to the Broad
            Institute GRCh38 gencode v26 plug-n-play library.

    Returns:
        The absolute path of the unpacked library: the single top-level
        directory the archive produced inside *destination*, or *destination*
        itself when the archive did not unpack into exactly one directory.

    Raises:
        OSError: *destination* exists and is not empty, or cannot be created.
        tarfile.TarError, IOError, OSError: unpacking failed; a message is
            written to stderr before the exception is re-raised.
    """
    import sys
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    default_archive_name = 'GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz'
    if url is None:
        url = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + default_archive_name

    # The returned path ends up in a data table entry, so it must not depend
    # on the working directory of whoever reads it later.
    destination = os.path.abspath(destination)
    if os.path.isdir(destination):
        # Refuse to mix a fresh download with unrelated pre-existing files.
        if os.listdir(destination):
            raise OSError("Download directory is not empty: %s" % destination)
    else:
        os.makedirs(destination)
    # FIX - Can we check if there is enough space on the device as well?

    # Name the local copy after the remote file (ignoring any query string).
    archive_name = os.path.basename(url.split('?', 1)[0]) or default_archive_name
    full_filepath = os.path.join(destination, archive_name)
    urlretrieve(url, full_filepath)

    # NOTE(review): the archive content is trusted here; confirm whether member
    # paths need to be validated before extraction on the Python versions in use.
    try:
        with tarfile.open(full_filepath, mode='r:*') as archive:
            # Unpack next to the archive rather than into the current directory.
            archive.extractall(path=destination)
    except (tarfile.TarError, IOError, OSError) as err:
        sys.stderr.write("Failed to unpack %s into %s: %s\n"
                         % (full_filepath, destination, err))
        raise
    # FIX - There is additional processing that needs to happen for gmap-fusion to work.

    # The directory the archive unpacked into is the actual location that
    # needs to be recorded as the ctat_genome_resource_library.
    unpacked = [entry for entry in os.listdir(destination) if entry != archive_name]
    if len(unpacked) == 1:
        candidate = os.path.join(destination, unpacked[0])
        if os.path.isdir(candidate):
            return candidate
    return destination
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
49
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
def main():
    """Register a CTAT Genome Resource Library in the Galaxy data table.

    Reads its settings from the command line, optionally downloads the
    library first (``--download``), and writes a JSON dictionary describing
    one ``ctat_genome_ref_libs`` data table entry to ``--output_filename``.

    Command line:
        -d/--download          fetch the library into the destination path.
        -g/--genome_name       selector text of the data table entry.
        -p/--destination_path  library location or download destination (required).
        -o/--output_filename   file the JSON dictionary is written to (required).

    Raises:
        SystemExit: argparse rejects the command line (e.g. a required
            option is missing).
    """
    import json

    default_genome_name = "GRCh38_gencode_v26"

    # Parse Command Line
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--download', action="store_true",
        help='Do not use if you already have the CTAT Resource Library that this program downloads.')
    parser.add_argument('-g', '--genome_name', default=default_genome_name,
        help='Is used as the selector text of the entry in the data table.')
    # Both paths are used unconditionally below, so let argparse report a
    # missing one instead of failing later with an obscure TypeError.
    parser.add_argument('-p', '--destination_path', required=True,
        help='Full path of the CTAT Resource Library location or destination.')
    parser.add_argument('-o', '--output_filename', required=True,
        help='Name of the output file, where the json dictionary will be written.')
    args = parser.parse_args()

    # All of the input parameters are written by default to the output file
    # prior to this program being called, but the values used here come from
    # the command line rather than from that json file.

    if args.download:
        ctat_genome_resource_lib_path = download_from_BroadInst(destination=args.destination_path)
        if ctat_genome_resource_lib_path is None:
            # No unpacked location was reported; record the requested destination.
            ctat_genome_resource_lib_path = args.destination_path
    else:
        # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
        ctat_genome_resource_lib_path = args.destination_path

    # An explicitly empty name falls back to the default as well.
    genome_name = args.genome_name or default_genome_name

    # Set the value to the basename of the directory path minus the extension.
    # normpath drops a trailing separator, which would otherwise make the
    # basename (and therefore the table value) empty.
    # FIX - Need to make sure is unique. This is not good way to do it. Just doing it this way now for testing.
    table_entry_value = os.path.basename(
        os.path.normpath(ctat_genome_resource_lib_path)).split(".")[0]

    data_table_entry = dict(value=table_entry_value, name=genome_name,
                            path=ctat_genome_resource_lib_path)
    data_manager_dict = {'data_tables': {'ctat_genome_ref_libs': [data_table_entry]}}

    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
    # which then puts it into the correct .loc file (I think).
    # Text mode plus a context manager: the serialised JSON is text, and the
    # handle must be flushed and closed before the process exits.
    with open(args.output_filename, 'w') as output_file:
        output_file.write(json.dumps(data_manager_dict))
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
93
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
# Run the data manager only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
d220209e47f4 Upload First set of files.
trinity_ctat
parents:
diff changeset
96