annotate data_manager/add_ctat_centrifuge_index.py @ 15:367b0d693b0c draft default tip

Uploaded
author trinity_ctat
date Mon, 30 Apr 2018 16:17:37 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
3
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
5 # other example code on the web.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
6 # This allows downloading of a centrifuge index, or specification of its disk location.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
7 # This index is one of the input parameters needed by the ctat_metagenomics tool.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
8 # At the moment only one index is supported by the ctat_metagenomics tool:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
9 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
10
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
11 import argparse
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
12 import os
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
13 #import tarfile
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
14 #import urllib
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
15 import subprocess
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
16
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
17 # The following is used to generate a unique_id value
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
18 from datetime import *
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
19
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
20 # Remove the following line when testing without galaxy package:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
21 from galaxy.util.json import to_json_string
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
22 # Am not using the following:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
23 # from galaxy.util.json import from_json_string
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
24
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
25 # The FileListParser is used by get_ctat_centrifuge_index_locations(),
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
26 # which is called by the Data Manager interface (.xml file) to get
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
27 # the filenames that are available online at ccb.jhu.edu
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
28 # Not sure best way to do it.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
29 # This object uses HTMLParser to look through the html
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
30 # searching for the filenames within anchor tags.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
31 import urllib2
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
32 from HTMLParser import HTMLParser
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
33
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
# Web page that get_ctat_centrifuge_index_locations() fetches and scans
# for links to tar.gz archives.
_CTAT_CentrifugeIndexPage_URL = 'https://ccb.jhu.edu/software/centrifuge/'
# The one index archive that get_ctat_centrifuge_index_locations() offers
# as a download option.
_CTAT_CentrifugeDownload_URL = 'ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz'
# NOTE(review): presumably the name of the Galaxy data table that entries
# are added to -- its use is not visible in this part of the file.
_CTAT_CentrifugeIndexTableName = 'ctat_centrifuge_indexes'
# Display name of the download option returned by
# get_ctat_centrifuge_index_locations().
_CTAT_CentrifugeDir_Name = 'p_compressed+h+v'
# NOTE(review): presumably prepended to generated display names -- its use
# is not visible in this part of the file.
_CTAT_Centrifuge_DisplayNamePrefix = 'CTAT_CentrifugeIndex_'
# File extension main() looks for to decide that a directory holds an index.
_CentrifugeIndexFileExtension = 'cf'
# Minimum number of available bytes download_index() requires on the
# destination device before it starts a download.
_NumBytesNeededForIndex = 7400130287 # 6.9 GB
#_DownloadFileSize = 5790678746 # 5.4 Gigabytes.
# Name of the scratch file download_index() writes and removes to check
# that the destination directory is writable.
_Download_TestFile = 'write_testfile.txt'
# Name of the marker file download_index() creates after a successful
# download and checks for to avoid downloading again.
_DownloadSuccessFile = 'download_succeeded.txt'
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
44
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
class FileListParser(HTMLParser):
    """Collect the href values of anchor tags that point at tar.gz files.

    Feed HTML text to an instance with feed(); afterwards the attribute
    ``filenames`` is a set holding every anchor href that contains
    "tar.gz" and does not contain "md5".
    """

    def __init__(self):
        # Have to use a direct call to the super class rather than super():
        #     super(FileListParser, self).__init__()
        # because HTMLParser is an "old style" class and its inheritance
        # chain does not include object.
        HTMLParser.__init__(self)
        self.filenames = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag when it names a tar.gz file.

        tag is the lower-cased tag name; attrs is the list of
        (name, value) pairs the parser found on the tag.
        """
        if tag != "a":
            # Only anchor tags can carry the links we are looking for.
            return
        for name, value in attrs:
            # An attribute written without a value (e.g. <a href>) is
            # reported with value None; the substring tests below would
            # raise TypeError on it, so skip it.
            if name != "href" or value is None:
                continue
            # Does the href have a tar.gz in it (and is not a checksum)?
            if ("tar.gz" in value) and ("md5" not in value):
                self.filenames.add(value)
# End of class FileListParser
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
64
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
def get_ctat_centrifuge_index_locations():
    """Return the dynamic option list of downloadable Centrifuge indexes.

    For dynamic options an iterable of 3-item tuples must be returned:
      * item one is the display name put into the option list,
      * item two is the value put into the parameter associated with
        the option list,
      * item three is True or False, indicating whether the item is
        selected.

    The Centrifuge web page is fetched and the tar.gz links found on it
    are printed for information only. The returned list always holds the
    single supported index (_CTAT_CentrifugeDir_Name,
    _CTAT_CentrifugeDownload_URL, True).

    Any error raised while fetching the page propagates to the caller.
    """
    options = []
    # Open the url and retrieve the filenames of the files in the directory.
    resource = urllib2.urlopen(_CTAT_CentrifugeIndexPage_URL)
    try:
        theHTML = resource.read()
    finally:
        # Release the connection even if the read fails.
        resource.close()
    filelist_parser = FileListParser()
    filelist_parser.feed(theHTML)
    # This is what was returned on 2018-04-23
    #     ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz
    #     ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz
    #     ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz
    #     ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz
    # These could be hard coded as options, but only the one we want is
    # returned, which for now is assumed to be present.
    # For now, the list is printed, just to show what was found.
    print("FYI: The URL's that were found on Centrifuge's page are:")
    print("\t" + "\n\t".join(filelist_parser.filenames))
    # Instead of sending back the list of found URL's, send back the one
    # URL we want. Currently, only one of the options is supported.
    # (The list used to be created as "options" but filled and returned
    # as "vals", which raised NameError on every call.)
    options.append((_CTAT_CentrifugeDir_Name, _CTAT_CentrifugeDownload_URL, True))
    print("The items in vals are:")
    print(str(options))
    return options
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
96
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
97 # The following was used by the example program to get input parameters through the json.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
98 # Just leaving here for reference.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
99 # We are getting all of our parameter values through command line arguments.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
100 #def get_reference_id_name(params):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
101 # genome_id = params['param_dict']['genome_id']
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
102 # genome_name = params['param_dict']['genome_name']
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
103 # return genome_id, genome_name
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
104 #
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
105 #def get_url(params):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
106 # trained_url = params['param_dict']['trained_url']
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
107 # return trained_url
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
108
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
def _ensure_writable_directory(directory):
    """Create directory if it is missing and verify files can be written into it."""
    if os.path.exists(directory):
        if not os.path.isdir(directory):
            raise ValueError("The destination is not a directory: " +
                             "{:s}".format(directory))
        # else all is good. It is a directory.
    else:
        # We need to create it.
        try:
            os.makedirs(directory)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(directory))
            raise

    # Make sure the directory now exists and we can write to it.
    if not os.path.exists(directory):
        # It should have been created, but if it doesn't exist at this point
        # in the code, something is wrong. Raise an error.
        raise OSError("The destination directory could not be created: " +
                      "{:s}".format(directory))
    test_writing_file = os.path.join(directory, _Download_TestFile)
    try:
        # The with-block closes the handle even when the write fails.
        with open(test_writing_file, "w") as filehandle:
            filehandle.write("Testing writing to this file.")
        os.remove(test_writing_file)
    except IOError:
        print("The destination directory could not be written into: " +
              "{:s}".format(directory))
        raise


def _fetch_and_extract(src_location, directory):
    """Stream src_location through curl into tar, extracting into directory."""
    # Only used in messages; the processes are started without a shell so
    # that spaces or shell metacharacters in the arguments are harmless.
    command = "curl --silent {:s} | tar -xzf - -C {:s}".format(src_location, directory)
    # Make curl silent so progress is not printed to stderr.
    curl = subprocess.Popen(["curl", "--silent", src_location],
                            stdout=subprocess.PIPE)
    try:
        tar = subprocess.Popen(["tar", "-xzf", "-", "-C", directory],
                               stdin=curl.stdout)
    except OSError:
        # tar could not be started; do not leave curl running.
        curl.kill()
        curl.wait()
        raise
    finally:
        # tar holds its own copy of the pipe. Closing ours means curl is
        # told (SIGPIPE) when tar exits early instead of blocking forever.
        curl.stdout.close()
    tar_status = tar.wait()
    curl_status = curl.wait()
    # A shell pipeline only reports the status of its last command; check
    # both so that a failed transfer is never mistaken for success.
    failed_status = tar_status or curl_status
    if failed_status:
        print("ERROR: Trying to run the following command:\n\t{:s}".format(command))
        raise subprocess.CalledProcessError(failed_status, command)


def download_index(src_location, destination, force_download):
    """Download and extract a Centrifuge index unless it is already present.

    Parameters:
      src_location   - URL of the tar.gz archive holding the index.
      destination    - directory to place the index in. A subdirectory
                       named after the archive (the part of the URL after
                       the last '/' and before the first '.') is used,
                       unless destination already ends with that name.
      force_download - when true, download even if the success marker file
                       of a previous download is present.

    Returns the tuple
      (canonical destination directory, root index name, index_was_downloaded)
    where index_was_downloaded tells whether a download actually happened.

    Raises:
      ValueError - the destination exists but is not a directory, or fewer
                   than three files containing the root index name are in
                   the destination afterwards.
      OSError / IOError - the destination cannot be created or written to,
                   there is insufficient space on its device, or curl/tar
                   cannot be started.
      subprocess.CalledProcessError - curl or tar exited with an error.
    """
    # We do not know if the index has been downloaded already.
    # This function returns whether or not the index actually gets downloaded.
    index_was_downloaded = False
    # Get the root filename of the Genome Directory.
    # The part after the last '/' and before the first '.'
    root_index_dirname = src_location.split("/")[-1].split(".")[0]

    # We want to make sure that destination is absolute fully specified path.
    cannonical_destination = os.path.realpath(destination)
    if os.path.basename(cannonical_destination) != root_index_dirname:
        cannonical_destination = os.path.join(cannonical_destination, root_index_dirname)
    _ensure_writable_directory(cannonical_destination)

    # Get the list of files in the directory,
    # We use it to check for a previous download or extraction among other things.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))
    # See whether the file has been downloaded already.
    download_success_file_path = os.path.join(cannonical_destination, _DownloadSuccessFile)
    if (_DownloadSuccessFile not in orig_files_in_destdir) or force_download:
        # Check whether there is enough space on the device for the index.
        statvfs = os.statvfs(cannonical_destination)
        # f_bavail counts the free blocks that ordinary users are allowed
        # to use (excl. reserved space).
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        if num_avail_bytes < _NumBytesNeededForIndex:
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                          " on the device of the destination directory: " +
                          "{:s}".format(cannonical_destination))

        if _DownloadSuccessFile in orig_files_in_destdir:
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # We want to transfer and untar the file without storing the tar file,
        # because that adds all that much more space to the needed amount of
        # free space on the disk.
        _fetch_and_extract(src_location, cannonical_destination)
        index_was_downloaded = True

    # Some code to help us if errors occur.
    print("\n*******************************\nFinished download and extraction.")
    files_in_destdir = set(os.listdir(cannonical_destination))
    # Same listing "ls -lad <dir>/*" would give (hidden files excluded),
    # without a shell. Skipped when there is nothing to list, so that the
    # clearer ValueError below is the error that gets reported.
    visible_paths = [os.path.join(cannonical_destination, name)
                     for name in sorted(files_in_destdir)
                     if not name.startswith(".")]
    if visible_paths:
        subprocess.check_call(["ls", "-lad"] + visible_paths)

    # There should be three files, but some OS's might have created
    # other files in the directory, or maybe the user did.
    # Look for the index files.
    # The download files' names should start with the root_index_dirname
    found_filenames = set(name for name in files_in_destdir
                          if root_index_dirname in name)
    if len(found_filenames) >= 3:
        # FIX - we could md5 the files to make sure they are correct.
        # Or at least check their sizes, to see if the download completed ok.
        # Also we could check the names of the files.
        try:
            # Create a file to indicate that the download succeeded
            # (or refresh its timestamp when it already exists).
            with open(download_success_file_path, "a"):
                os.utime(download_success_file_path, None)
        except (IOError, OSError):
            print("The download_success file could not be created: " +
                  "{:s}".format(download_success_file_path))
            raise
    else:
        print("After download, the potential index files found are:\n\t{:s}".format(str(found_filenames)))
        raise ValueError("ERROR: Could not find the extracted index files " +
                         "in the destination directory:\n\t{:s}".format(cannonical_destination))

    return (cannonical_destination, root_index_dirname, index_was_downloaded)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
238
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
239 def main():
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
240 #Parse Command Line
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
241 # print "At start before parsing arguments."
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
242 parser = argparse.ArgumentParser()
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
243 parser.add_argument('-d', '--download_location', default="", \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
244 help='This is the download location of the centrifuge index.')
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
245 parser.add_argument('-n', '--display_name', default="", \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
246 help='Is used as the selector text for the entry of this Centrifuge Index in the data table.')
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
247 parser.add_argument('-p', '--destination_path', \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
248 help='Full path of the Centrifuge Index location or destination, either where it is, or where it will be placed.')
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
249 parser.add_argument('-o', '--output_filename', \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
250 help='Name of the output file, where the json dictionary will be written.')
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
251 parser.add_argument('-f', '--force_download',
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
252 help='Forces download of the Centrifuge Index, even if previously downloaded. ' + \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
253 'Requires download_location to be set in order to work.', action="store_true")
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
254 args = parser.parse_args()
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
255
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
256 # All of the input parameters are written by default to the output file prior to
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
257 # this program being called.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
258 # But I do not get input values from the json file, but rather from command line.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
259 # Just leaving the following code as a comment, in case it might be useful to someone later.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
260 # params = from_json_string(open(filename).read())
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
261 # target_directory = params['output_data'][0]['extra_files_path']
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
262 # os.mkdir(target_directory)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
263
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
264 # print "Arguments are parsed."
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
265 print "\ndownload_location is {:s}".format(str(args.download_location))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
266 print "display_name is {:s}".format(str(args.display_name))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
267 print "destination_path is {:s}\n".format(str(args.destination_path))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
268 root_index_dirname = None
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
269 # FIX - Prob don't need index_was_downloaded. Not doing anything with it.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
270 # But it indicates success downloading the index, so maybe should be checking it.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
271 index_was_downloaded = False
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
272 if (args.download_location != ""):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
273 index_directory, root_index_dirname, index_was_downloaded = \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
274 download_index(src_location=args.download_location, \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
275 destination=args.destination_path, \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
276 force_download=args.force_download)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
277 else:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
278 cannonical_destination = os.path.realpath(args.destination_path)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
279 if not os.path.exists(cannonical_destination):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
280 raise ValueError("Cannot find the Centrifuge Index.\n" + \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
281 "The directory does not exist:\n\t{:s}".format(index_directory))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
282 # If args.destination_path is a directory containing
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
283 # a subdirectory that contains the index files,
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
284 # then we need to set the index_directory to be that subdirectory.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
285 files_in_destination_path = os.listdir(cannonical_destination)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
286 if (len(files_in_destination_path) == 1):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
287 path_to_file = "{:s}/{:s}".format(cannonical_destination, files_in_destination_path[0])
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
288 if os.path.isdir(path_to_file):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
289 index_directory = path_to_file
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
290 else:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
291 index_directory = cannonical_destination
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
292 else:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
293 index_directory = cannonical_destination
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
294 # Get the root_index_dirname of the index from the index_directory name.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
295 root_index_dirname = index_directory.split("/")[-1].split(".")[0]
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
296
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
297 # Check if there is an actual Centrifuge Index file in the index_directory.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
298 print "\nThe location of the Centrifuge Index is {:s}.\n".format(index_directory)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
299 files_in_index_directory = set(os.listdir(index_directory))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
300 index_file_found = False
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
301 index_file_path = index_directory
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
302 for filename in files_in_index_directory:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
303 # The current index is split into 3 files.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
304 # filenames are in the form: index_root_name.#.cf,
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
305 # where # is a numeral (1, 2, or 3)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
306 # indicating the order of the files.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
307 if filename.split(".")[-1] == _CentrifugeIndexFileExtension:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
308 index_file_found = True
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
309 # The centrifuge program wants the root name of the files to be final part of the path.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
310 index_file_path = "{:s}/{:s}".format(index_directory, filename.split(".")[0])
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
311 if not index_file_found:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
312 raise ValueError("Cannot find any Centrifuge Index files.\n" + \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
313 "The contents of the directory {:s} are:\n\t".format(index_directory) + \
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
314 "\n\t".join(files_in_index_directory))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
315
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
316 # Set the display_name
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
317 if (args.display_name is None) or (args.display_name == ""):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
318 # Use the root_index_dirname.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
319 if (root_index_dirname != None) and (root_index_dirname != ""):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
320 display_name = _CTAT_Centrifuge_DisplayNamePrefix + root_index_dirname
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
321 else:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
322 display_name = _CTAT_Centrifuge_DisplayNamePrefix + _CTAT_CentrifugeDir_Name
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
323 print "WARNING: Did not set the display name. Using the default: {:s}".format(display_name_value)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
324 else:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
325 display_name = _CTAT_Centrifuge_DisplayNamePrefix + args.display_name
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
326 display_name = display_name.replace(" ","_")
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
327
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
328 # Set the unique_id
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
329 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
330 if (root_index_dirname != None) and (root_index_dirname != ""):
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
331 unique_id = root_index_dirname + datetime_stamp
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
332 else:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
333 unique_id = _CTAT_CentrifugeDir_Name + datetime_stamp
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
334
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
335 print "The Index's display_name will be set to: {:s}\n".format(display_name)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
336 print "Its unique_id will be set to: {:s}\n".format(unique_id)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
337 print "Its dir_path will be set to: {:s}\n".format(index_file_path)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
338
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
339 data_manager_dict = {}
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
340 data_manager_dict['data_tables'] = {}
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
341 data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName] = []
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
342 data_table_entry = dict(value=unique_id, name=display_name, path=index_file_path)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
343 data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName].append(data_table_entry)
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
344
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
345 # Temporarily the output file's dictionary is written for debugging:
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
346 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
347 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
348 # which then puts it into the correct .loc file (I think).
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
349 # Remove the following line when testing without galaxy package.
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
350 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
351
367b0d693b0c Uploaded
trinity_ctat
parents:
diff changeset
# Script entry point: call main() only when this file is executed directly
# (i.e. when __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()