44
|
1 #!/usr/bin/env python
|
|
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
|
|
3
|
|
4 # Rewritten by H.E. Cicada Brokaw Dennis from code downloaded from the toolshed and
|
|
5 # other example code on the web. It has however been extensively modified and augmented.
|
|
6 # This now allows downloading of a user selected library
|
|
7 # but only from the CTAT Genome Resource Library website.
|
|
8 # Ultimately we might want to allow the user to specify any location
|
|
9 # from which to download.
|
|
10 # Users can create or download other libraries and use this Data Manager to add them
|
|
11 # if they don't want to add them by hand.
|
|
12
|
|
13 import sys
|
45
|
14 # The many calls to sys.stdout.flush() are done in order to get the output to be synchronized.
|
44
|
15 import argparse
|
|
16 import os
|
|
17 import shutil
|
|
18 import tarfile
|
|
19 import hashlib
|
|
20 import urllib
|
|
21 import urlparse
|
|
22 import contextlib
|
|
23 import subprocess
|
|
24
|
|
25 # Comment out the following line when testing without galaxy package.
|
|
26 from galaxy.util.json import to_json_string
|
|
27 # The following is not being used, but leaving as info
|
|
28 # in case we ever want to get input values using json.
|
|
29 # from galaxy.util.json import from_json_string
|
|
30
|
|
31 # datetime.now() is used to create the unique_id
|
|
32 from datetime import datetime
|
|
33
|
|
34 # The Data Manager uses a subclass of HTMLParser to look through a web page's html
|
|
35 # searching for the filenames within anchor tags.
|
|
36 import urllib2
|
|
37 from HTMLParser import HTMLParser
|
|
38
|
|
# Web locations from which the CTAT archives are listed and downloaded.
_CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
_CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'

# Directory names, name prefixes and file names used by this Data Manager.
_CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
_CTAT_MutationLibDirname = 'ctat_mutation_lib'
_CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
_CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
_CTAT_RefGenome_Filename = 'ref_genome.fa'
_CTAT_MouseGenome_Prefix = 'Mouse'
_CTAT_HumanGenome_Prefix = 'GRCh'
_COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz'
_COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz'

# Disk space estimates, in bytes (written as a number of gigabytes times 1024 ** 3).
# FIX - The following numbers need to be checked and other numbers for gmap, etc. need to be determined.
# Values for each genome should be determined, so we can get more precise values for each genome.
# 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB.
_NumBytesNeededForSourceDataExtraction = 10 * 1024 ** 3
# 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB.
# Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB.
# Fix - check amount with gmap.
_NumBytesNeededForPlugNPlayExtraction = 45 * 1024 ** 3
# 62 Gigabytes. FIX - This might not be correct.
_NumBytesNeededForBuild = 62 * 1024 ** 3
# 4 Gigabytes. Actually need about 3.8GB.
# Once built the downloaded archive could be deleted to reduce the amount used, but with the archive
# there and the Cosmic files and the built ctat_mutation_library, 3.8GB is needed.
# If the archive files are deleted after the integration of the library, only 1.8GB would be used at that point.
_NumBytesNeededForMutationResources = 4 * 1024 ** 3

# Names (or name suffixes) of the marker files written by this Data Manager.
_Write_TestFile = 'write_testfile.txt'
_DownloadSuccessFile = 'download_succeeded.txt'
_ExtractionSuccessFile = 'extraction_succeeded.txt'
_LibBuiltSuccessFile = 'build_succeeded.txt'
_GmapSuccessFile = 'gmap_succeeded.txt'
_MutationDownloadSuccessFile = 'mutation_download_succeeded.txt'
_MutationIntegrationSuccessFile = 'mutation_integration_succeeded.txt'

# Library type strings as they appear in the archive filenames.
_LIBTYPE_SOURCE_DATA = 'source_data'
_LIBTYPE_PLUG_N_PLAY = 'plug-n-play'
|
|
72
|
|
class resumable_URL_opener(urllib.FancyURLopener):
    """URL opener used for downloads that can be resumed.

    It lets a download restart from the point where it left off after a
    partial download was interrupted. This class and the code using it
    was found online:
    http://code.activestate.com/recipes/83208-resuming-download-of-a-file/

    The sub-class exists only to override the handling of HTTP status 206.
    That status means a partial file is being sent, which is ok in this case.
    """

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        """Do nothing with a 206 response and return None."""
        return None
# End of class resumable_URL_opener
|
|
84
|
|
class FileListParser(HTMLParser):
    """Collect the hrefs of tar.gz links found in the anchor tags of an HTML page.

    The FileListParser object is used by get_ctat_genome_urls() and
    get_mutation_resource_urls(), which can be called by the Data Manager
    interface (.xml file) to get the filenames that are available online
    at broadinstitute.org.
    Apparently creating dynamic option lists this way is deprecated, but no
    other method exists by which I can get the options dynamically from the web.
    I believe that it is considered a security risk.

    After feed() has been called, self.urls is the set of href values that
    contain "tar.gz" and do not contain "md5". These are assumed to be files
    that can be downloaded.
    """

    def __init__(self):
        # Have to use a direct call to the super class rather than using super():
        #     super(FileListParser, self).__init__()
        # because HTMLParser is an "old style" class and its inheritance chain
        # does not include object.
        HTMLParser.__init__(self)
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Add the href of an anchor tag to self.urls if it names a tar.gz file.

        attrs is the list of (name, value) pairs given by HTMLParser.
        The value is None for an attribute written without a value
        (for example <a href>), so such attributes are skipped rather than
        being searched, which would raise a TypeError.
        """
        if tag != "a":
            return
        for name, value in attrs:
            if name != "href" or not value:
                continue
            # Does the href have a tar.gz in it? The .md5 checksum files are not wanted.
            if ("tar.gz" in value) and ("md5" not in value):
                self.urls.add(value)
# End of class FileListParser
|
|
114
|
|
def _fetch_archive_links(page_url, default_filenames):
    """Return the set of archive links found on the page at page_url.

    If the page cannot be read, or is empty, a set holding default_filenames
    is returned instead, so that a list can still be offered.
    """
    the_html = None
    try:
        with contextlib.closing(urllib2.urlopen(page_url)) as resource:
            the_html = resource.read()
    except (urllib2.URLError, IOError) as err:
        # urlopen() raises rather than returning None when the page is unreachable.
        print("Could not read {:s} ({:s}). Using the default list.".format(page_url, str(err)))
        sys.stdout.flush()
    if not the_html:
        return set(default_filenames)
    filelist_parser = FileListParser()
    filelist_parser.feed(the_html)
    return set(filelist_parser.urls)


def _build_options(links, base_url, accept_filename=None):
    """Turn links into the sorted list of option tuples used for dynamic options.

    Each tuple has 3 items:
        Item one is a string that is the display name put into the option list.
        Item two is the value that is put into the parameter associated with the option list.
        Item three is a True or False value, indicating whether the item is selected.
    A link that has no scheme is assumed to be relative to base_url.
    If accept_filename is given, only the links whose filename it accepts are kept.
    The first option of the sorted list is the one marked as selected.
    """
    entries = []
    for link in links:
        if urlparse.urlparse(link).scheme != "":
            full_url_path = link
        else:
            # Assume the path is relative to the page location.
            full_url_path = urlparse.urljoin(base_url, link)
        filename = os.path.basename(link)
        if (accept_filename is None) or accept_filename(filename):
            entries.append((filename, full_url_path))
    entries.sort()  # So the list will be in alphabetical order.
    return [(filename, full_url_path, index == 0)
            for index, (filename, full_url_path) in enumerate(entries)]


def get_ctat_genome_urls():
    """Return the dynamic option list of the CTAT genome resource library archives.

    The page at _CTAT_ResourceLib_URL is read and the tar.gz links on it are
    returned as a sorted list of (filename, url, selected) tuples.
    If we can't get the list, a default list is used.

    The urls should look like:
        https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
    But in actuality, they are coming in looking like:
        GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
    Both situations are handled, as is an ftp: url.
    Mouse genomes should work now (we hope) - FIX - still not tested.
    """
    # These are the filenames for what was there at least until 2018/10/09.
    default_filenames = (
        "GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz",
        "GRCh37_v19_CTAT_lib_Feb092018.source_data.tar.gz",
        "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
        "GRCh38_v27_CTAT_lib_Feb092018.source_data.tar.gz",
        "Mouse_M16_CTAT_lib_Feb202018.plug-n-play.tar.gz",
        "Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz",
    )
    links = _fetch_archive_links(_CTAT_ResourceLib_URL, default_filenames)
    options = _build_options(links, _CTAT_ResourceLib_URL)
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    sys.stdout.flush()
    return options


def get_mutation_resource_urls():
    """Return the dynamic option list of the CTAT mutation resource archives.

    The page at _CTAT_Mutation_URL is read and the tar.gz links on it whose
    filename starts with "mutation_lib." are returned as a sorted list of
    (filename, url, selected) tuples. If we can't get the list, a default
    list is used.

    On 2018/10/06, the following tar.gz files were present:
        mutation_lib.hg19.tar.gz
        mutation_lib.hg38.tar.gz
        mc-7.tar.gz
        ctat_mutation_demo.tar.gz

    FIX - Rather than letting user choose mutation resource url,
    download the correct one for the chosen library?
    Not sure about this.
    In that case don't provide a pull down interface for this.
    """
    # These are the filenames for what was there at least until 2018/10/09.
    default_filenames = (
        "mutation_lib.hg19.tar.gz",
        "mutation_lib.hg38.tar.gz",
    )
    links = _fetch_archive_links(_CTAT_Mutation_URL, default_filenames)
    # As of 2018_10_09, the only ones supported have mutation_lib as the first part of the name.
    options = _build_options(links, _CTAT_Mutation_URL,
                             accept_filename=lambda name: name.split(".")[0] == "mutation_lib")
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    sys.stdout.flush()
    return options
|
|
234
|
|
235 # The following was used by the example program to get input parameters through the json.
|
|
236 # Just leaving here for reference.
|
|
237 # We are getting all of our parameter values through command line arguments.
|
|
238 #def get_reference_id_name(params):
|
|
239 # genome_id = params['param_dict']['genome_id']
|
|
240 # genome_name = params['param_dict']['genome_name']
|
|
241 # return genome_id, genome_name
|
|
242 #
|
|
243 #def get_url(params):
|
|
244 # trained_url = params['param_dict']['trained_url']
|
|
245 # return trained_url
|
|
246
|
|
def print_directory_contents(dir_path, num_levels):
    """Print a long listing of dir_path and, recursively, of its sub-directories.

    This procedure is used to help with debugging and for user information.

    dir_path is the directory to list.
    num_levels is how many directory levels to print. A value of 1 prints
        only dir_path itself; a value of 0 or less prints nothing.

    Returns None. A path that does not exist or is not a directory is
    reported with a message rather than an error.
    """
    if num_levels <= 0:
        return
    if not os.path.isdir(dir_path):
        print("Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path))
        sys.stdout.flush()
        return
    print("\nDirectory {:s}:".format(dir_path))
    sys.stdout.flush()
    # The command is given as an argument list, so that the shell never
    # interprets dir_path (spaces and shell metacharacters are safe).
    # The "--" keeps a path that starts with "-" from being read as an option.
    subprocess.call(["ls", "-la", "--", dir_path], stderr=subprocess.STDOUT)
    if num_levels > 1:
        for filename in os.listdir(dir_path):
            filename_path = os.path.join(dir_path, filename)
            if os.path.isdir(filename_path):
                print_directory_contents(filename_path, num_levels - 1)
|
|
266
|
|
def which(file):
    """Return the path of the first entry named file found in the PATH, or None.

    This procedure is similar to the linux "which" command.
    It is used to find the location of an executable program that is in the PATH.
    However this implementation does not check whether the program's file is executable.
    """
    search_dirs = os.environ["PATH"].split(os.pathsep)
    candidates = (os.path.join(directory, file) for directory in search_dirs)
    return next((candidate for candidate in candidates if os.path.exists(candidate)), None)
|
|
275
|
|
def size_of_file_at(file_url):
    """Return the size of the file at file_url, as an int.

    We have to open the file, in order to find out how big it is.
    The size is taken from the Content-Length header of the response.
    """
    remote_file = resumable_URL_opener().open(file_url)
    try:
        return int(remote_file.headers['Content-Length'])
    finally:
        remote_file.close()
|
|
283
|
|
def md5sum_for(filename, blocksize=2**20):
    """Return the md5 sum of the file's contents as a string of hex digits.

    The file is read blocksize bytes at a time, so the whole file
    is never held in memory.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as infile:
        # read() returns an empty string at the end of the file.
        for chunk in iter(lambda: infile.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
|
296
|
|
def ctat_library_type(filepath):
    """Return the string indicating the library type of the file.

    The type is the second dot-separated part of the file's base name.
    If the filename indicates source_data, as opposed to plug-n-play,
    then the library will have to be built after it is downloaded.
    An IndexError is raised if the base name has no dot in it.
    """
    name_parts = os.path.basename(filepath).split(".")
    return name_parts[1]
|
|
306
|
|
def find_genome_name_in_path(path, raise_error=False):
    """Return the genome name found in path, or None if there is none.

    The form of the genome name in directory names (if present in the path) looks like:
        GRCh37_v19_CTAT_lib_Feb092018
        GRCh38_v27_CTAT_lib_Feb092018
        Mouse_M16_CTAT_lib_Feb202018
    A path element is taken to hold a genome name when it starts with
    _CTAT_MouseGenome_Prefix or _CTAT_HumanGenome_Prefix. The name is the part
    of the element before its first ".". If several elements match, the last
    one is used.

    path is split on os.sep. It may be None or empty.
    raise_error, if True, causes a ValueError to be raised when no genome
        name is found, instead of None being returned.
    """
    genome_name = None
    if path:
        genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
        for element in path.split(os.sep):
            if element.startswith(genome_prefixes):
                # Remove any extension that might be in the filename.
                genome_name = element.split(".")[0]
    if (not genome_name) and raise_error:
        # str() is used so that a path of None can be reported too.
        raise ValueError("Cannot find genome name in the given filename path:\n\t{:s}".format(str(path)))
    return genome_name
|
|
324
|
|
def bytes_needed_to_extract(archive_filepath):
    """Return the number of bytes estimated to be needed to extract the archive.

    FIX -- The following should be replaced by a series of statements that return the right value for each archive.
    The numbers used now are estimates for the human genome, and so are big enough for the mouse genome, so ok for now.
    But now we are also using this for the mutation resource files, so really need to FIX this.
    """
    if ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA:
        return _NumBytesNeededForSourceDataExtraction
    # Assume otherwise that it is a plug-n-play archive.
    return _NumBytesNeededForPlugNPlayExtraction
|
|
336
|
|
def bytes_needed_to_build(source_data_filepath):
    """Return the number of bytes estimated to be needed to build a library.

    source_data_filepath is not used at present; the same value is returned
    for every archive.
    FIX - The following should be replaced by a series of statements that return the right value for each archive.
    The number used now estimates the largest size needed. Also, it is probably not correct.
    """
    bytes_needed = _NumBytesNeededForBuild
    return bytes_needed
|
|
341
|
|
def create_success_file(full_file_path, contents=None):
    """Create the file used to indicate that some step succeeded.

    full_file_path is the path to the file to write.
        It should not exist before calling this function,
        but if it does, it will be overwritten.
    contents is some text that will be written into the file.
        It can be None, in which case the file is created empty.

    If the file cannot be written, a message is printed and the IOError is re-raised.
    """
    # Writing an empty string still creates the file, with nothing in it.
    text_to_write = "" if contents is None else contents
    try:
        with open(full_file_path, "w") as marker_file:
            marker_file.write(text_to_write)
    except IOError:
        print("The success indication file could not be created: " +
              "{:s}".format(full_file_path))
        sys.stdout.flush()
        raise
|
|
358
|
|
def download_file_from_url(file_url, dest_dir, resume_download=True):
    """Download the file at file_url into dest_dir and return the path of the downloaded file.

    Some of the code used in this procedure was downloaded and modified for our needs.
    That code was at: http://code.activestate.com/recipes/83208-resuming-download-of-a-file/

    file_url must specify a file to download, because the name of the
        destination file is taken from the end of the url.
    dest_dir is the directory into which the file is downloaded.
        It is best to fully specify dest_dir. Otherwise the dest_dir will be
        opened relative to whatever cwd is.
    resume_download, if True (the default), makes the function attempt to resume
        the download where it left off, if, for example, a previous download was
        interrupted. If the destination file already has the size of the source
        file, nothing is downloaded. If resume_download is False, any existing
        download of the file is overwritten by a new download.

    Raises an OSError if there is not enough space on the device of dest_dir
    for the part of the file that remains to be downloaded.
    Raises an IOError if the download fails, or if the size of the downloaded
    file differs from the size of the source file.
    """
    # DOWNLOAD_BLOCK_SIZE = 65536 # 64KB. Old number was 8192 or 8KB.
    DOWNLOAD_BLOCK_SIZE = 1048576  # 1 MB
    existing_size = 0
    bytes_read = 0
    file_retriever = resumable_URL_opener()
    dest_filename = os.path.basename(file_url)
    dest_fullpath = os.path.join(dest_dir, dest_filename)
    source_filesize = size_of_file_at(file_url)
    print("Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize))
    print("Destination file for the download is {:s}".format(dest_fullpath))
    sys.stdout.flush()

    # If the file exists and resume_download is requested, then only download the remainder.
    if resume_download and os.path.exists(dest_fullpath):
        existing_size = os.path.getsize(dest_fullpath)
        print("The destination file exists and is {:d} bytes in size.".format(existing_size))
        if source_filesize == existing_size:
            # We already have the whole thing, so there is no need to contact the server again.
            print("The file has already been completely downloaded:\n\t{:s}".format(dest_fullpath))
            sys.stdout.flush()
            return dest_fullpath
        range_value = "bytes={:d}-".format(existing_size)
        print("Adding header to resume download:\n\tRange: {:s}".format(range_value))
        file_retriever.addheader("Range", range_value)
        open_mode = "ab"
    else:
        if os.path.exists(dest_fullpath):
            print("The destination file exists:\n\t{:s}".format(dest_fullpath))
            print("However a new download has been requested.")
            print("The download will overwrite the existing file.")
        else:
            print("The destination file does not exist yet.")
        existing_size = 0
        open_mode = "wb"
    sys.stdout.flush()

    try:
        with open(dest_fullpath, open_mode) as output_file:
            # Check whether there is enough space on the device for the rest of the file to download.
            # This is done after the destination file is opened, so that the space
            # of a file that is being overwritten has been given back already.
            statvfs = os.statvfs(dest_dir)
            # num_avail_bytes is the number of free bytes that ordinary users
            # are allowed to use (excl. reserved space)
            num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
            # Perhaps should subtract some padding amount from num_avail_bytes
            # rather than raising only if there is less than exactly what is needed.
            if num_avail_bytes < (source_filesize - existing_size):
                raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                              " on the device of the destination directory for the download: " +
                              "{:s}".format(dest_dir))
            # closing() makes sure the connection is closed, even if reading or writing fails.
            with contextlib.closing(file_retriever.open(file_url)) as source_file:
                response_headers = source_file.headers
                # read() returns an empty string once all of the data has been read.
                for data in iter(lambda: source_file.read(DOWNLOAD_BLOCK_SIZE), b""):
                    output_file.write(data)
                    bytes_read = bytes_read + len(data)
    except IOError:
        print("Error while attempting to download {:s}".format(file_url))
        sys.stdout.flush()
        raise

    for k, v in response_headers.items():
        print("{0} = {1}".format(k, v))
    print("Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url)))
    dest_filesize = os.path.getsize(dest_fullpath)
    print("{:s} {:s}".format(str(dest_filesize), str(dest_fullpath)))
    sys.stdout.flush()
    if source_filesize != dest_filesize:
        raise IOError("Download error:\n\t" +
                      "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) +
                      "and the destination file\n\t\t{:d}\t{:s}\n\t".format(dest_filesize, dest_fullpath) +
                      "are different sizes.")
    return dest_fullpath
|
|
448
|
|
def ensure_we_can_write_numbytes_to(destination, numbytes):
    """Make sure numbytes bytes can be written into the destination directory.

    Attempts to create the destination directory if it does not exist.
    Tests whether a file can be written to that directory.
    Tests whether there is numbytes space on the device of the destination.
    Raises errors if it cannot do any of the above:
        ValueError if the destination exists but is not a directory,
        OSError if the directory cannot be created or has too little space,
        IOError if the test file cannot be written.

    Returns the full specification of the destination path
    (the result of os.path.realpath() on destination).
    """
    # We want to make sure that destination is an absolute fully specified path.
    real_destination = os.path.realpath(destination)
    if not os.path.exists(real_destination):
        # We need to create it since it does not exist.
        try:
            os.makedirs(real_destination)
        except OSError:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(real_destination))
            sys.stdout.flush()
            raise
    elif not os.path.isdir(real_destination):
        raise ValueError("The destination is not a directory: " +
                         "{:s}".format(real_destination))

    # Make sure the directory now exists and we can write to it.
    # It should have been created, but if it doesn't exist at this point
    # in the code, something is wrong. Raise an error.
    if not os.path.exists(real_destination):
        raise OSError("The destination directory could not be created: " +
                      "{:s}".format(real_destination))
    probe_filename = "{:s}.{:s}".format(os.path.basename(real_destination), _Write_TestFile)
    probe_filepath = os.path.join(real_destination, probe_filename)
    try:
        with open(probe_filepath, "w") as probe_file:
            probe_file.write("Testing writing to this file.")
        if os.path.exists(probe_filepath):
            os.remove(probe_filepath)
    except IOError:
        print("The destination directory could not be written into:\n\t" +
              "{:s}".format(real_destination))
        sys.stdout.flush()
        raise

    # Check whether there are numbytes available on the destination's device.
    # f_bavail counts the free blocks that ordinary users are allowed to use
    # (excl. reserved space), as opposed to f_bfree, the actual number of free blocks.
    filesystem_stats = os.statvfs(real_destination)
    num_avail_bytes = filesystem_stats.f_frsize * filesystem_stats.f_bavail
    if num_avail_bytes < numbytes:
        raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                      " on the device of the destination directory:\n\t" +
                      "{:s}\n\t{:d} bytes are needed.".format(real_destination, numbytes))

    return real_destination
|
|
502
|
|
def download_genome_archive(source_url, destination, force_new_download=False):
    """Download, but do not extract, the archive at source_url and check its md5 sum.

    This function can be called on a file whose download was interrupted, and if
    force_new_download is False, the download will proceed where it left off.
    A success file is written next to the archive once the download and the
    md5 check have succeeded. If that file exists already, nothing is downloaded
    unless force_new_download is True.

    Input Parameters
        source_url is the full URL of the file we want to download.
            It should look something like:
            https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
            If only the filename is given, it is assumed to reside at _CTAT_ResourceLib_URL.
        destination is the location (directory) where a copy of the source file will be placed.
            Relative paths are expanded using the current working directory, so within Galaxy,
            it is best to send in absolute fully specified path names so you know to where
            the source file is going to be copied.
        force_new_download if True, will cause a new download to occur, even if the file
            has been downloaded previously.

    Returns the canonical path to the file that was downloaded.

    Raises an IOError if the download does not succeed or the md5 sum does not match.
    Raises an OSError if there is not enough space at the destination for the archive.
    Raises a ValueError if source_url has no genome name in it, or if the
    destination is not a directory.
    """
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
        source_url = urlparse.urljoin(_CTAT_ResourceLib_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.

    print("Downloading:\n\t{:s}".format(str(source_url)))
    print("to:\n\t{:s}".format(destination))
    sys.stdout.flush()
    # The next is done so that if the source_url does not have a genome name in it, an error will be raised.
    find_genome_name_in_path(source_url, raise_error=True)
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url))

    # See whether the file has been downloaded already.
    download_success_filename = "{:s}.{:s}".format(source_filename, _DownloadSuccessFile)
    download_success_full_file_path = os.path.join(cannonical_destination, download_success_filename)
    previously_succeeded = download_success_filename in os.listdir(cannonical_destination)
    if force_new_download or not previously_succeeded:
        if previously_succeeded:
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_full_file_path)
        # The following raises an error if the download fails for some reason.
        dest_fullpath = download_file_from_url(source_url, cannonical_destination,
                                               resume_download=(not force_new_download))
        # Check the md5sum of the downloaded file to ensure the data in the file is correct.
        md5_url = "{:s}.md5".format(source_url)
        print("Checking the md5sum of the downloaded file.")
        try:
            # Only the url is passed to open(). A second argument would be sent
            # to the server as the data of a POST request.
            with contextlib.closing(resumable_URL_opener().open(md5_url)) as md5_file:
                md5_fields = md5_file.read().split()
            md5sum_from_file = md5sum_for(dest_fullpath)
        except IOError:
            print("Error while attempting to check the md5sum for {:s}".format(dest_fullpath))
            sys.stdout.flush()
            raise
        if not md5_fields:
            raise IOError("Download error:\n\t" +
                          "No md5 sum could be read from:\n\t\t{:s}".format(md5_url))
        # The md5 sum is the first field of the md5 file. hexdigest() gives lower case digits.
        md5sum_from_web = md5_fields[0].lower()
        if md5sum_from_web != md5sum_from_file:
            raise IOError("Download error:\n\t" +
                          "The md5 sum for\n\t\t({:s})\n\t".format(dest_fullpath) +
                          "does not match the value read from the web:\n\t\t" +
                          "({:s} != {:s})".format(md5sum_from_file, md5sum_from_web))
        print("Check of md5sum succeeded.")
        create_success_file(download_success_full_file_path,
                            "Download of:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(dest_fullpath))
    else:
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_full_file_path))
        print("Remove the file or set <Force New Download> if you want a new download to occur.")
        # The name is built the same way that download_file_from_url() builds it.
        dest_filename = os.path.basename(source_url)
        dest_fullpath = os.path.join(cannonical_destination, dest_filename)
    sys.stdout.flush()

    # Some code to help us if errors occur.
    print("\n*******************************")
    print("* Finished download.          *")
    sys.stdout.flush()
    print_directory_contents(cannonical_destination, 1)
    print("*******************************\n")
    sys.stdout.flush()

    return dest_fullpath
|
|
598
|
|
def extract_archive(archive_filepath, destination, force_new_extraction=False):
    """Extract a tar archive into destination unless it was extracted before.

    A marker file named "<archive basename>.<_ExtractionSuccessFile>" inside
    the destination signals a prior successful extraction. When the marker is
    present the extraction is skipped, unless force_new_extraction is True, in
    which case the marker is removed and the archive is extracted again.

    This function never writes the marker itself: some of the caller's error
    checking depends on the extracted files, so the calling procedure
    can/should write the marker after doing that checking.

    archive_filepath -- path of the archive; opened with tarfile mode "r:*".
    destination -- directory the archive is extracted into.
    force_new_extraction -- extract even when the marker file exists.

    Returns None.
    """
    dest_dir = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))

    # Name and location of the marker that records a prior successful extraction.
    marker_name = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    marker_path = os.path.join(dest_dir, marker_name)
    previously_extracted = marker_name in set(os.listdir(dest_dir))

    if previously_extracted and not force_new_extraction:
        # The archive was successfully extracted before so we do not do it again.
        print("The extraction success file exists, so no new extraction was attempted:")
        print("\t{:s}".format(marker_path))
        print("Remove the success file or set <force new extraction> if you want a new extraction to occur.")
    else:
        if previously_extracted:
            # We are redoing the extraction, so the marker must not exist
            # again until the new extraction has succeeded.
            os.remove(marker_path)
        # NOTE(review): extractall() writes wherever the member names point;
        # this presumes the archive comes from a trusted source - confirm.
        with tarfile.open(archive_filepath, mode="r:*") as archive:
            archive.extractall(path=dest_dir)
    sys.stdout.flush()

    # Some output to help us if errors occur.
    print("\n*******************************************************")
    print("* Finished extraction. Destination directory listing. *")
    sys.stdout.flush()
    print_directory_contents(dest_dir, 1)
    print("*******************************************************\n")
    sys.stdout.flush()
    return
|
|
641
|
|
def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
    """Extract a CTAT Genome Reference Library archive file.

    archive_filepath -- the archive to extract. It is best if this is an
        absolute, fully specified filepath, not a relative one.
    destination -- the directory to which the archive will be extracted.
    force_new_extraction -- cause extraction to occur even if the file was
        extracted before.
    keep_archive -- when False (the default) the archive file is removed once
        the extracted directory has been found.

    Returns extracted_directory, the full path of the top level directory
    that is created by the extraction of the files from the archive.

    Raises ValueError if the extracted directory cannot be found in the
    destination directory afterwards.
    """
    print("Extracting:\n\t {:s}".format(str(archive_filepath)))
    print("to:\n\t{:s}".format(destination))
    sys.stdout.flush()
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
    # Get the root filename of the Genome Directory from the source file's name.
    # That should also be the name of the extracted directory.
    genome_dirname = find_genome_name_in_path(archive_filepath, raise_error=True)

    # Snapshot the directory before and after so we can see what was created.
    orig_files_in_destination = set(os.listdir(cannonical_destination))
    extract_archive(archive_filepath, destination, force_new_extraction)
    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destination

    if genome_dirname not in newfiles_in_destdir:
        # Perhaps it has a different name than what we expect it to be.
        # It will be a sub-directory that was not in the directory
        # before we did the extraction.
        found_filename = None
        if len(newfiles_in_destdir) == 1:
            # newfiles_in_destdir is a set and cannot be indexed,
            # so take its only element through an iterator.
            found_filename = next(iter(newfiles_in_destdir))
        else:
            # In most cases, there will only be one new file, but some OS's
            # might have created other files in the directory.
            # The correct entry's name should be a substring of the name of
            # the archive file, and it must be a directory.
            archive_filename = os.path.basename(archive_filepath)
            for filename in newfiles_in_destdir:
                if (filename in archive_filename) and \
                        os.path.isdir(os.path.join(cannonical_destination, filename)):
                    found_filename = filename
        if found_filename is not None:
            genome_dirname = found_filename

    extracted_directory = os.path.join(cannonical_destination, genome_dirname)
    if not os.path.exists(extracted_directory):
        raise ValueError("ERROR: Could not find the extracted directory in the destination directory:" +
                         "\n\t{:s}".format(cannonical_destination))

    # The extraction is now known to be good, so write the file used to
    # indicate prior success of the file's extraction.
    extraction_success_filename = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    extraction_success_full_file_path = os.path.join(cannonical_destination, extraction_success_filename)
    create_success_file(extraction_success_full_file_path,
                        "Extraction of:\n\t{:s}\n".format(archive_filepath) +
                        "to:\n\t{:s}\nsucceeded.".format(extracted_directory))

    if not keep_archive:
        # We are done extracting, so remove the archive file.
        # If it does not exist it was removed previously; nothing to do.
        if os.path.exists(archive_filepath):
            print("Removing the archive file:\n\t{:s}".format(archive_filepath))
            sys.stdout.flush()
            os.remove(archive_filepath)
    return extracted_directory
|
|
703
|
|
def get_gmap_success_filename(genome_build_directory):
    """Return the name (not the path) of the gmap success marker file.

    The name is "<genome name>.<_GmapSuccessFile>". The genome name is
    whatever find_genome_name_in_path() reports for genome_build_directory;
    when that is None, the basename of genome_build_directory is used instead.
    """
    name_from_path = find_genome_name_in_path(genome_build_directory)
    if name_from_path is not None:
        prefix = name_from_path
    else:
        prefix = os.path.basename(genome_build_directory)
    return "{:s}.{:s}".format(prefix, _GmapSuccessFile)
|
|
709
|
|
def gmap_the_library(genome_build_directory, force_new_gmap=False):
    """Run gmap_build on the library, which gmap-fusion needs in order to work.

    genome_build_directory -- the library directory. It should normally be a
        fully specified path, though a relative one is passed through as is.
    force_new_gmap -- run gmap_build even if the gmap success file exists.

    A success file (see get_gmap_success_filename) is written into
    genome_build_directory once gmap_build has succeeded; when it already
    exists and force_new_gmap is False, nothing is run.

    Raises subprocess.CalledProcessError if gmap_build exits with an error.
    Returns None.
    """
    marker_name = get_gmap_success_filename(genome_build_directory)
    marker_path = os.path.join(genome_build_directory, marker_name)
    already_gmapped = marker_name in set(os.listdir(genome_build_directory))

    if already_gmapped and not force_new_gmap:
        print("The gmap success file exists, so no gmap is being attempted:")
        print("\t{:s}".format(marker_path))
        print("Remove the file or set <force new gmap> if you want a new gmap to occur.")
    else:
        if already_gmapped:
            # We are redoing the gmap, so the marker must not exist
            # again until the new gmap has succeeded.
            os.remove(marker_path)
        # gmap_build prints messages to stderr even when there is no error,
        # so stderr is routed to stdout. Otherwise, galaxy thinks an error occurred.
        command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format(
            genome_build_directory, genome_build_directory)
        try:
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
            sys.stdout.flush()
            raise
        finally:
            sys.stdout.flush()
            # Some output to help us if errors occur.
            print("\n*******************************\nAfter running gmap_build.")
            sys.stdout.flush()
            print_directory_contents(genome_build_directory, 2)
            print("*******************************\n")
            sys.stdout.flush()
        # Only reached when gmap_build did not raise.
        create_success_file(marker_path,
                            "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    sys.stdout.flush()
    return
|
|
756
|
|
757
|
|
def build_the_library(genome_source_directory,
                      genome_build_directory, force_new_build=False,
                      gmap_build=False, force_gmap_build=False):
    r"""Build a CTAT Genome Resource Library from its source data.

    genome_source_directory -- the location of the source_data needed to
        build the library. Normally it is fully specified, but it may be
        relative to the current working directory.
    genome_build_directory -- the location where the library will be built.
        It can be relative to the current working directory or absolute.
    force_new_build -- run prep_genome_lib.pl even if it was run before.
    gmap_build -- whether to run gmap_build or not.
    force_gmap_build -- passed to gmap_the_library() when the build itself is
        skipped but gmap_build is requested.

    A build success file is written into the source directory after a
    successful build. As a side effect of building, the process's working
    directory is changed to the source directory.

    Raises ValueError if the source directory does not exist, and
    subprocess.CalledProcessError if prep_genome_lib.pl fails.
    Returns None.

    Following was the old way to do it. Before FusionFilter 0.5.0.
        prep_genome_lib.pl \
            --genome_fa ref_genome.fa \
            --gtf ref_annot.gtf \
            --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
            --fusion_annot_lib fusion_lib.dat.gz
            --output_dir ctat_genome_lib_build_dir
        index_pfam_domain_info.pl \
            --pfam_domains PFAM.domtblout.dat.gz \
            --genome_lib_dir ctat_genome_lib_build_dir
        gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa
    """
    if (genome_source_directory is None) or (genome_source_directory == "") \
            or not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " +
                         "The source directory does not exist:\n\t{:s}".format(str(genome_source_directory)))
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory,
                                                             bytes_needed_to_build(genome_source_directory))
    print("Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory)))
    print("The Destination directory is at:\n\t{:s}".format(str(cannonical_destination)))
    sys.stdout.flush()

    # Get the root filename of the Genome Directory.
    src_filename = os.path.basename(genome_source_directory)
    # Resolve the source directory once, before any os.chdir(). A relative
    # path would otherwise be re-interpreted against the new working
    # directory by every filesystem call made after the chdir below.
    source_dir = os.path.abspath(genome_source_directory)
    # See whether the library has been built already. The success file is written into the source directory.
    files_in_sourcedir = set(os.listdir(source_dir))
    build_success_filename = "{:s}.{:s}".format(src_filename, _LibBuiltSuccessFile)
    build_success_file_path = os.path.join(source_dir, build_success_filename)
    if (build_success_filename not in files_in_sourcedir) or force_new_build:
        # The command below names its input files relative to the source directory.
        os.chdir(source_dir)
        if build_success_filename in files_in_sourcedir:
            # Since we are redoing the build,
            # the success file needs to be removed
            # until the build has succeeded.
            os.remove(build_success_file_path)
        # Create the command that builds the Genome Resource Library from the source data.
        # NOTE(review): presumes cannonical_destination is an absolute path,
        # since the command runs from the source directory - confirm.
        command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
                  "--pfam_db PFAM.domtblout.dat.gz " + \
                  "--output_dir {:s} ".format(cannonical_destination)
        filename_of_HumanFusionLib = None
        # Sorted so that, if there is more than one candidate,
        # the later one, alphabetically, is the one that is used.
        for filename in sorted(os.listdir(source_dir)):
            # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
            # We only check the prefix, in case other versions are used later.
            if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
                filename_of_HumanFusionLib = filename
        if filename_of_HumanFusionLib is not None:
            # The mouse genomes do not have a fusion_annot_lib
            # so only add the following for Human genomes.
            command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
                       "--annot_filter_rule AnnotFilterRule.pm "
        if gmap_build:
            command += "--gmap_build "
        # Send stderr of the command to stdout, because some functions may write to stderr,
        # even though no error has occurred. We will depend on error code return in order
        # to know if an error occurred.
        command += " 2>&1"
        print("About to run the following command:\n\t{:s}".format(command))
        sys.stdout.flush()
        try:  # to send the prep_genome_lib command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to run the prep_genome_lib.pl command " +
                  "on the CTAT Genome Resource Library:\n\t{:s}".format(command))
            raise
        finally:
            # Some output to help us if errors occur.
            print("\n*******************************")
            print("Contents of Genome Source Directory {:s}:".format(genome_source_directory))
            sys.stdout.flush()
            print_directory_contents(source_dir, 2)
            print("\nContents of Genome Build Directory {:s}:".format(cannonical_destination))
            sys.stdout.flush()
            print_directory_contents(cannonical_destination, 2)
            print("*******************************\n")
            sys.stdout.flush()
        # Only reached when prep_genome_lib.pl did not raise.
        create_success_file(build_success_file_path,
                            "Build of:\n\t{:s}\n".format(genome_source_directory) +
                            "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
        if gmap_build:
            # The build command included --gmap_build, so create the gmap success file.
            gmap_success_filename = get_gmap_success_filename(cannonical_destination)
            gmap_success_full_file_path = os.path.join(cannonical_destination, gmap_success_filename)
            create_success_file(gmap_success_full_file_path,
                                "gmap of:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    elif build_success_filename in files_in_sourcedir:
        print("The build success file exists, so no build is being attempted:")
        print("\t{:s}".format(build_success_file_path))
        print("Remove the file or set <force new build> if you want a new build to occur.")
        # We might still need to do a gmap_build.
        if gmap_build:
            print("Checking if we need to gmap the library.")
            sys.stdout.flush()
            gmap_the_library(cannonical_destination, force_gmap_build)
            sys.stdout.flush()
            # gmap_the_library creates a gmap success file if it succeeds.
    else:
        print("build_the_library(): This code should never be printed. Something is wrong.")
    sys.stdout.flush()
    return
    # End of build_the_library()
|
|
872
|
|
def find_path_to_mutation_lib_integration():
    """Return the expected path of ctat-mutation-lib-integration.py.

    We are assuming that we exist inside of a conda environment and that the
    directory that we want is in the share directory, one level up from the
    bin directory that contains the ctat_mutations command. The returned path
    is built from names only; its existence is not checked here.

    Raises ValueError if the ctat_mutations command cannot be found, or if no
    entry of the share directory has "ctat-mutations" in its name.
    """
    ctat_mutations_exe = which("ctat_mutations")
    if ctat_mutations_exe is None or ctat_mutations_exe == "":
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resource processing.")

    # <conda root>/bin/ctat_mutations  ->  <conda root>/share
    share_dir = os.path.join(os.path.dirname(os.path.dirname(ctat_mutations_exe)), "share")
    candidates = [entry for entry in os.listdir(share_dir) if "ctat-mutations" in entry]
    if not candidates:
        raise ValueError("Unable to find the home of ctat_mutations.\n" +
                         "It should be in the share directory:\n\t{:s}.".format(share_dir))
    # When several entries match, the last one listed is the one used.
    return os.path.join(share_dir,
                        candidates[-1],
                        "mutation_lib_prep",
                        "ctat-mutation-lib-integration.py")
|
|
895
|
|
def find_path_to_picard_home():
    """Return the value to use for PICARD_HOME.

    The ctat_mutations shell script defines PICARD_HOME, so the value is read
    out of that file (if several lines define it, the last one is used).
    If no value is found there, the share directory one level up from the
    directory holding ctat_mutations is searched for an entry whose name
    contains "picard".

    Raises ValueError if the ctat_mutations command cannot be found, or if
    PICARD_HOME cannot be determined either way.
    """
    picard_home = None
    path_to_ctat_mutations = which("ctat_mutations")
    if (path_to_ctat_mutations is None) or (path_to_ctat_mutations == ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resources processing.")
    # The with statement makes sure the script file gets closed again.
    with open(path_to_ctat_mutations, "r") as ctat_mutations_file:
        for line in ctat_mutations_file:
            if ("export" in line) and ("PICARD_HOME=" in line):
                # Get everything after the first equal sign (the value itself
                # may contain one) and strip off the newline at the end.
                # Then strip off quotes at begin and end if they are there.
                # And then strip off any other whitespace that might have been inside of the quotes.
                picard_home = line.split("=", 1)[1].strip().strip('"').strip()
    if (picard_home is None) or (picard_home == ""):
        # We didn't find it in the ctat_mutations file. Search for it.
        conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
        share_dir = os.path.join(conda_root_dir, "share")
        for filename in os.listdir(share_dir):
            if "picard" in filename:
                picard_home = os.path.join(share_dir, filename)
        if (picard_home is None) or (picard_home == ""):
            raise ValueError("Unable to find PICARD_HOME.\n" +
                             "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return picard_home
|
|
920
|
|
def download_and_integrate_mutation_resources(source_url, genome_build_directory, cosmic_resources_location=None,
                                              force_new_download=False, force_new_integration=False):
    r"""Download a mutation resources archive and integrate it into the library.

    source_url -- the url of the mutation resources archive to download.
        If it has no scheme (no leading https: or similar) it is taken to be
        a filename that exists at _CTAT_Mutation_URL.
    genome_build_directory -- the location where the archive will be placed.
        It is assumed that this is a valid genome build directory.
    cosmic_resources_location -- if set, the location where the Cosmic files
        are presumed to exist. If not set, the files are assumed to exist
        in genome_build_directory.
    force_new_download -- if True, then even if the archive has previously
        been downloaded, it will be downloaded again.
    force_new_integration -- if True, the integration is run again even if
        the integration success file exists.

    Raises IOError if the download fails or if either Cosmic file is missing,
    ValueError if ctat_mutations or PICARD_HOME cannot be located, and
    subprocess.CalledProcessError if the integration command fails.
    Returns None.

    From https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep

    Step 1 (after the CTAT Genome Resource Library is built)
        download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018
        or
        download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018
        or
        download mc-7.tar.gz into Mouse_M16_CTAT_lib_Feb202018
        (Need to ask about support for mouse, since there is no info about
        Cosmic mouse genome files in the instructions.)

    Step 2: Cosmic files download - the user must perform this step prior to
    running this code. We check if the files are present.
        Download the required COSMIC resources into this directory. Depending
        on the version of the genome, either COSMIC's hg38 or COSMIC's hg19.
        Two files are needed: COSMIC Mutation Data (CosmicMutantExport.tsv.gz)
        and the COSMIC Coding Mutation VCF File (CosmicCodingMuts.vcf.gz).
        The download requires registering and logging in to their service.

        TODO: Either let the user give their credentials through the Data
        Manager interface so the files can be downloaded programmatically, or
        have the interface carry instructions for the user to download the
        files and then specify the absolute path to where those files are.

    Step 3: Mutation lib integration
        Once CosmicMutantExport.tsv.gz AND CosmicCodingMuts.vcf.gz (hg38 or
        hg19) are downloaded, the integration step integrates the mutation
        resource with CTAT_GENOME_LIB (the directory from Step 1).

        # Keep Picard in the PICARD_HOME environment variable like so
        export PICARD_HOME=/path/to/picard

        # Integrate CTAT mutations lib with CTAT genome library
        python ctat-mutations/mutation_lib_prep/ctat-mutation-lib-integration.py \
            --CosmicMutantExport CosmicMutantExport.tsv.gz \
            --CosmicCodingMuts CosmicCodingMuts.vcf.gz \
            --genome_lib_dir GRCh37_v19_CTAT_lib_Feb092018/  # OR GRCh38_v27_CTAT_lib_Feb092018/

    Now you are all set to run the ctat-mutations pipeline.
    """
    print("\n***********************************")
    print("* Integrating Mutation Resources. *")
    print("***********************************\n")
    sys.stdout.flush()
    # It is assumed that this procedure is only called with a valid genome_build_directory.
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_Mutation_URL.
        source_url = urlparse.urljoin(_CTAT_Mutation_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, _NumBytesNeededForMutationResources)
    print("Download and Integrate a Mutation Resource Archive.")
    print("The source URL is:\n\t{:s}".format(str(source_url)))
    print("The destination is:\n\t{:s}".format(str(cannonical_destination)))
    sys.stdout.flush()
    # Get the list of files in the directory,
    # We use it to check for a previous download or integration among other things.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))

    # DOWNLOAD SECTION
    # See whether the archive has been downloaded already.
    download_success_file = "{:s}.{:s}".format(source_filename, _MutationDownloadSuccessFile)
    download_success_file_path = os.path.join(cannonical_destination, download_success_file)
    if (download_success_file not in orig_files_in_destdir) or force_new_download:
        # DO THE DOWNLOAD
        if download_success_file in orig_files_in_destdir:
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # The following raises an IOError if the download fails for some reason.
        download_file_from_url(source_url, cannonical_destination, resume_download=(not force_new_download))
        create_success_file(download_success_file_path,
                            "Download of the mutation resource archive:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    elif download_success_file in orig_files_in_destdir:
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_file_path))
        print("Remove the file or set <new_mutation_download> if you want a new download to occur.")
    else:
        print("download_and_integrate_mutation_resources() - Download: This code should never be printed. Something is wrong.")
    sys.stdout.flush()

    # INTEGRATION SECTION
    integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile)
    integration_success_file_path = os.path.join(cannonical_destination, integration_success_file)
    if (integration_success_file not in orig_files_in_destdir) or force_new_integration:
        # INTEGRATE THE LIBRARY
        if integration_success_file in orig_files_in_destdir:
            # Since we are redoing the integration,
            # the success file needs to be removed
            # until the integration has succeeded.
            os.remove(integration_success_file_path)
        mutation_lib_dirpath = os.path.join(cannonical_destination, _CTAT_MutationLibDirname)
        # If we do not remove the directory, then the old files will exist and a new integration does not occur.
        # Also, with the Cosmic files, when the integrated file is created, if there is a previous one, gzip
        # asks a question of the user, and this program is not prepared to respond to a question from a subprocess:
        # [bgzip] /path/to/ctat_mutation_lib/cosmic.vcf.gz already exists; do you wish to overwrite (y or n)?
        if os.path.exists(mutation_lib_dirpath):
            shutil.rmtree(mutation_lib_dirpath)
        # Check for Cosmic resources. User has to place these files into the correct location.
        if (cosmic_resources_location is None) or (cosmic_resources_location == ""):
            cosmic_resources_loc_full_path = cannonical_destination
            end_err_msg = "These files must be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
        else:
            cosmic_resources_loc_full_path = os.path.realpath(cosmic_resources_location)
            end_err_msg = "This function was told they would be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
        cosmic_mutant_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Mutant_Filename)
        cosmic_coding_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Coding_Filename)
        if not (os.path.exists(cosmic_mutant_full_path) and os.path.exists(cosmic_coding_full_path)):
            raise IOError("Either one or both of Cosmic Resources are missing:\n\t" +
                          "{:s}\nand/or\n\t{:s}\n".format(cosmic_mutant_full_path, cosmic_coding_full_path) +
                          "Unable to integrate mutation resources.\n{:s}".format(end_err_msg))
        # Create the integration command. We also must define PICARD_HOME for the command to work.
        picard_home = find_path_to_picard_home()
        integration_command = find_path_to_mutation_lib_integration()
        command = "export PICARD_HOME={:s} && python {:s} ".format(picard_home, integration_command) + \
                  "--CosmicMutantExport {:s} ".format(cosmic_mutant_full_path) + \
                  "--CosmicCodingMuts {:s} ".format(cosmic_coding_full_path) + \
                  "--genome_lib_dir {:s}".format(cannonical_destination)
        try:  # to send the ctat-mutation-lib-integration command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command))
            sys.stdout.flush()
            raise
        finally:
            # Some output to help us if errors occur.
            print("\n*********************************************************")
            print("* After download and integration of Mutation Resources. *")
            sys.stdout.flush()
            print_directory_contents(cannonical_destination, 2)
            print("*********************************************************\n")
            sys.stdout.flush()
        # Only reached when the integration command did not raise.
        create_success_file(integration_success_file_path,
                            "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    elif integration_success_file in orig_files_in_destdir:
        print("The mutation resources integration success file exists, so no integration is being attempted:")
        print("\t{:s}".format(integration_success_file_path))
        print("Remove the file or set <new_mutation_integration> if you want a new integration to occur.")
    else:
        print("download_and_integrate_mutation_resources() - Integration: This code should never be printed. Something is wrong.")
    sys.stdout.flush()
    return
|
|
1068
|
|
1069 def search_for_genome_build_dir(top_dir_path):
|
|
1070 # If we do not download the directory, the topdir_path could be the
|
|
1071 # location of the genome resource library, but we also want to allow the
|
|
1072 # user to give the same value for top_dir_path that they do when a
|
|
1073 # build happens, so we need to handle all three cases:
|
|
1074 # 1) Is the top_dir_path the build directory,
|
|
1075 # 2) or is it inside of the given directory,
|
|
1076 # 3) or is it inside a subdirectory of the given directory.
|
|
1077 # The source_data downloads are built to a directory named _CTAT_Build_dirname,
|
|
1078 # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
|
|
1079 # We also look for the genome name and return that, if we find it in the
|
|
1080 # directory name of the directory holding the build directory.
|
|
1081 top_dir_full_path = os.path.realpath(top_dir_path)
|
|
1082 genome_build_directory = None
|
|
1083 genome_name_from_dirname = None
|
|
1084 print_warning = False
|
|
1085
|
|
1086 if not os.path.exists(top_dir_full_path):
|
|
1087 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
|
|
1088 "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
|
|
1089 elif not os.path.isdir(top_dir_full_path):
|
|
1090 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
|
|
1091 "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
|
|
1092 if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
|
|
1093 print "Build directory is: {:s}".format(top_dir_full_path)
|
|
1094 sys.stdout.flush()
|
|
1095 # The top_dir_path is the path to the genome_build_directory.
|
|
1096 genome_build_directory = top_dir_full_path
|
|
1097 else:
|
|
1098 # Look for it inside of the top_dir_path directory.
|
|
1099 print "Looking inside of: {:s}".format(top_dir_full_path)
|
|
1100 sys.stdout.flush()
|
|
1101 top_dir_contents = os.listdir(top_dir_full_path)
|
|
1102 if (_CTAT_Build_dirname in top_dir_contents):
|
|
1103 # The genome_build_directory is inside of the top_dir_path directory.
|
|
1104 print "1. Found it."
|
|
1105 sys.stdout.flush()
|
|
1106 genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
|
|
1107 else:
|
|
1108 # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
|
|
1109 # Look down the directory tree two levels.
|
|
1110 build_dirs_in_subdirs = list()
|
|
1111 subdirs_with_genome_files = list()
|
|
1112 build_dirs_in_sub_subdirs = list()
|
|
1113 sub_subdirs_with_genome_files = list()
|
|
1114 subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
|
|
1115 for subdir in subdirs:
|
|
1116 subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
|
|
1117 subdir_path_contents = os.listdir(subdir_path)
|
|
1118 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
|
|
1119 if (_CTAT_Build_dirname in subdir_path_contents):
|
|
1120 # The genome_build_directory is inside of the subdir_path directory.
|
|
1121 print "2a, Found one."
|
|
1122 build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
|
|
1123 if (_CTAT_RefGenome_Filename in subdir_path_contents):
|
|
1124 subdirs_with_genome_files.append(subdir_path)
|
|
1125 # Since we are already looping, loop through all dirs one level deeper as well.
|
|
1126 sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
|
|
1127 for sub_subdir in sub_subdirs:
|
|
1128 sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
|
|
1129 sub_subdir_path_contents = os.listdir(sub_subdir_path)
|
|
1130 # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
|
|
1131 if (_CTAT_Build_dirname in sub_subdir_path_contents):
|
|
1132 # The genome_build_directory is inside of the sub_subdir_path directory.
|
|
1133 print "3a. Found one."
|
|
1134 build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
|
|
1135 if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
|
|
1136 sub_subdirs_with_genome_files.append(sub_subdir_path)
|
|
1137 # Hopefully there is one and only one found build directory.
|
|
1138 # If none are found we check for a directory containing the genome reference file,
|
|
1139 # but the build process sometimes causes more than one directory to have a copy,
|
|
1140 # so finding that file is not a sure thing.
|
|
1141 if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
|
|
1142 print "\n***************************************"
|
|
1143 print "Found multiple CTAT Genome Resource Libraries " + \
|
|
1144 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1145 sys.stdout.flush()
|
|
1146 print_directory_contents(top_dir_full_path, 2)
|
|
1147 print "***************************************\n"
|
|
1148 sys.stdout.flush()
|
|
1149 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
|
|
1150 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1151 elif len(build_dirs_in_subdirs) == 1:
|
|
1152 # The genome_build_directory is inside of the subdir_path directory.
|
|
1153 print "2b, Found it."
|
|
1154 sys.stdout.flush()
|
|
1155 genome_build_directory = build_dirs_in_subdirs[0]
|
|
1156 elif len(build_dirs_in_sub_subdirs) == 1:
|
|
1157 # The genome_build_directory is inside of the subdir_path directory.
|
|
1158 print "3b, Found it."
|
|
1159 sys.stdout.flush()
|
|
1160 genome_build_directory = build_dirs_in_sub_subdirs[0]
|
|
1161 elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
|
|
1162 print "\n***************************************"
|
|
1163 print "Unable to find CTAT Genome Resource Library " + \
|
|
1164 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1165 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
|
|
1166 sys.stdout.flush()
|
|
1167 print_directory_contents(top_dir_full_path, 2)
|
|
1168 print "***************************************\n"
|
|
1169 sys.stdout.flush()
|
|
1170 raise ValueError("Unable to find CTAT Genome Resource Library " + \
|
|
1171 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1172 elif (len(sub_subdirs_with_genome_files) == 1):
|
|
1173 print "3c, Maybe found it."
|
|
1174 sys.stdout.flush()
|
|
1175 genome_build_directory = sub_subdirs_with_genome_files[0]
|
|
1176 print_warning = True
|
|
1177 elif (len(subdirs_with_genome_files) == 1):
|
|
1178 print "2c, Maybe found it."
|
|
1179 sys.stdout.flush()
|
|
1180 genome_build_directory = subdirs_with_genome_files[0]
|
|
1181 print_warning = True
|
|
1182 elif (_CTAT_RefGenome_Filename in top_dir_contents):
|
|
1183 print "1c. Maybe found it."
|
|
1184 sys.stdout.flush()
|
|
1185 genome_build_directory = top_dir_full_path
|
|
1186 print_warning = True
|
|
1187 else:
|
|
1188 print "\n***************************************"
|
|
1189 print "Unable to find CTAT Genome Resource Library " + \
|
|
1190 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1191 sys.stdout.flush()
|
|
1192 print_directory_contents(top_dir_full_path, 2)
|
|
1193 print "***************************************\n"
|
|
1194 sys.stdout.flush()
|
|
1195 raise ValueError("Unable to find CTAT Genome Resource Library " + \
|
|
1196 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1197 # end else
|
|
1198 # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
|
|
1199 if (genome_build_directory is None):
|
|
1200 print "\n***************************************"
|
|
1201 print "Cannot find the CTAT Genome Resource Library " + \
|
|
1202 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1203 sys.stdout.flush()
|
|
1204 print_directory_contents(top_dir_full_path, 2)
|
|
1205 print "***************************************\n"
|
|
1206 sys.stdout.flush()
|
|
1207 raise ValueError("Cannot find the CTAT Genome Resource Library " + \
|
|
1208 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1209 else:
|
|
1210 if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
|
|
1211 print "\n***************************************"
|
|
1212 print "\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) + \
|
|
1213 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
|
|
1214 sys.stdout.flush()
|
|
1215 print_directory_contents(genome_build_directory, 2)
|
|
1216 print "***************************************\n"
|
|
1217 sys.stdout.flush()
|
|
1218 if print_warning and genome_build_directory:
|
|
1219 print "\n***************************************"
|
|
1220 print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
|
|
1221 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
|
|
1222 print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
|
|
1223 sys.stdout.flush()
|
|
1224 print_directory_contents(genome_build_directory, 2)
|
|
1225 print "***************************************\n"
|
|
1226 sys.stdout.flush()
|
|
1227 return genome_build_directory
|
|
1228
|
|
def build_directory_from_build_location(src_filename, build_location):
    """Return the directory in which the genome library should be built.

    The result depends on what the last component of build_location is:
      - the genome name found in src_filename: the build directory is
        <build_location>/<_CTAT_Build_dirname>.
      - _CTAT_Build_dirname itself: build_location already is the build
        directory and is returned unchanged.
      - anything else: the build directory is
        <build_location>/<genome name>/<_CTAT_Build_dirname>.

    src_filename is the path that is handed to find_genome_name_in_path()
    in order to obtain the genome name.
    build_location is the location requested by the user.

    If no genome name can be found in src_filename, the genome level is
    left out and <build_location>/<_CTAT_Build_dirname> is returned.
    """
    genome_dir_name = find_genome_name_in_path(src_filename)
    # os.path.basename() returns '' for a path that ends with a separator,
    # so normalize first. Otherwise "/x/ctat_genome_lib_build_dir/" would
    # not be recognized and a second, nested build directory would be made.
    location_basename = os.path.basename(os.path.normpath(build_location))
    if (genome_dir_name is not None) and (location_basename == genome_dir_name):
        return os.path.join(build_location, _CTAT_Build_dirname)
    if location_basename == _CTAT_Build_dirname:
        return build_location
    if genome_dir_name is None:
        # find_genome_name_in_path() found no genome name. Passing None to
        # os.path.join() would raise, so build directly under build_location.
        return os.path.join(build_location, _CTAT_Build_dirname)
    return os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
|
|
1239
|
|
def _make_argument_parser():
    """Create the argparse parser for this Data Manager's command line.

    There are three basic ways to use this tool:
      1) Download and Build the CTAT Genome Resource Library from an archive.
      2) Build the library from source data files that are already downloaded.
      3) Specify the location of an already built library.
    Any of these methods can incorporate or be followed by a gmap build.
    Any of these methods can be followed by a mutation resources download
    and/or integration.
    Choose arguments for only one method. Mixed usage is not handled.
    """
    parser = argparse.ArgumentParser()
    # Arguments for all methods:
    parser.add_argument(
        '-o', '--output_filename',
        help='Name of the output file, where the json dictionary will be written.')
    parser.add_argument(
        '-y', '--display_name',
        default='',
        help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
    parser.add_argument(
        '-g', '--gmap_build',
        help='Will do a gmap_build on the Genome Resource Library, if it has not previously been gmapped.',
        action='store_true')
    parser.add_argument(
        '-f', '--force_gmap_build',
        help='Will force gmap_build of the Genome Resource Library, even if previously gmapped.',
        action='store_true')
    parser.add_argument(
        '-m', '--download_mutation_resources_url',
        default='',
        help='Value should be the url of the zipped up mutation resources. '
             'These are located at: https://data.broadinstitute.org/Trinity/CTAT/mutation/. '
             'Will download mutation resources and integrate them into the Genome Resource Library. '
             'Cosmic resources must previously have been downloaded (https://cancer.sanger.ac.uk/cosmic/download). '
             'Cosmic resources can be placed directly into the Genome Resource Library '
             'or you can set the --cosmic_resources_location argument. '
             'See https://github.com/NCIP/ctat-mutations/tree/no_sciedpiper/mutation_lib_prep for more info. '
             'If a previous download and integration was not completed, '
             'calling with this option set will attempt to finish the integration.')
    parser.add_argument(
        '-l', '--new_mutation_download',
        help='Forces the mutation resources to be downloaded, '
             'even if previously downloaded into this Genome Resource Library.',
        action='store_true')
    parser.add_argument(
        '-i', '--new_mutation_integration',
        help='Forces the mutation resources to be integrated, '
             'even if previously integrated into this Genome Resource Library.',
        action='store_true')
    parser.add_argument(
        '-c', '--cosmic_resources_location',
        default='',
        help='Specify a non-default location where the Cosmic files reside. '
             'Normally they are assumed to reside in the build directory, '
             'but if that directory has not been created yet when this program '
             'is called, you can specify the full path to the directory where they reside.')

    # Method 1) arguments - Download and Build.
    # - One can optionally utilize --build_location argument with this group of arguments.
    download_and_build_args = parser.add_argument_group('Download and Build arguments')
    download_and_build_args.add_argument(
        '-u', '--download_url',
        default='',
        help='This is the url of an archive file containing the library files. '
             'These are located at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/. '
             'Works with both source-data and plug-n-play archives.')
    download_and_build_args.add_argument(
        '-d', '--download_location',
        default='',
        help='Full path of the CTAT Resource Library download location, where the download will be placed. '
             'If the archive file has already been successfully downloaded, '
             'it will only be downloaded again if --new_archive_download is selected. '
             'If --build_location is not set, then the archive will be built in place at the download_location. '
             'If a previous download and build was started but not completed at this or a specified build_location, '
             'calling with this and the previous option set, but not --new_archive_download, '
             'will attempt to finish the download and build.')
    download_and_build_args.add_argument(
        '-a', '--new_archive_download',
        help='Forces a new download (and build if needed) of the Genome Resource Library, '
             'even if previously downloaded and built.',
        action='store_true')
    download_and_build_args.add_argument(
        '-k', '--keep_archive',
        help='The archive will not be deleted after it is extracted.',
        action='store_true')

    # Method 2) arguments - Specify source and build locations.
    specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments')
    specify_source_and_build_args.add_argument(
        '-s', '--source_location',
        default='',
        help='Full path to the directory containing CTAT Resource Library source-data files '
             'or the full path to a CTAT Resource Library archive file (.tar.gz). '
             'If the --build_location option is not set, the reference library will be built in the source_location directory. '
             'If a previous download and build was started but not completed at this location, '
             'calling with this option set, but not --new_library_build, '
             'will attempt to finish the build.')
    specify_source_and_build_args.add_argument(
        '-r', '--new_library_build',
        help='Forces build of the CTAT Genome Resource Library, even if previously built. '
             'The --source_location must be a source-data archive or directory, or this is a no-op.',
        action='store_true')

    # Method 3) arguments - Specify the location of a built library.
    built_lib_location_arg = parser.add_argument_group('Specify location of built library arguments')
    built_lib_location_arg.add_argument(
        '-b', '--build_location',
        default='',
        help='Full path to the location of a built CTAT Genome Resource Library, '
             'either where it is, or where it will be placed.')
    return parser


def _find_library_genome_name(args, genome_build_directory, extracted_directory):
    """Return the genome name to use for the library's data table entry.

    The candidates are handed to find_genome_name_in_path() one at a time,
    in this order: the download url, the genome build directory, the
    extracted directory, the source location, the download location and
    the display name. The first name that is not None is returned.
    If none of them yields a name, a warning is printed and
    _CTAT_ResourceLib_DefaultGenome is returned.
    """
    candidates = (args.download_url,
                  genome_build_directory,
                  extracted_directory,
                  args.source_location,
                  args.download_location,
                  args.display_name)
    for candidate in candidates:
        genome_name = find_genome_name_in_path(candidate)
        if genome_name is not None:
            return genome_name
    print("WARNING: We could not find a genome name in any of the directory paths.")
    sys.stdout.flush()
    return _CTAT_ResourceLib_DefaultGenome


def main():
    """Download, build and/or register a CTAT Genome Resource Library.

    Parses the command line, works out where the library is or will be
    built, builds it when source data is available, optionally runs the
    gmap build and the mutation resources download and integration, and
    finally writes the data table entry as json to --output_filename.

    Raises ValueError when --download_url is combined with
    --source_location, when --download_url is given without
    --download_location, when the downloaded archive is neither
    source-data nor plug-n-play, or when none of --download_url,
    --source_location and --build_location is specified.
    """
    args = _make_argument_parser().parse_args()

    # All of the input parameters are written by default to the output file prior to
    # this program being called.
    # But I do not get input values from the json file, but rather from command line.
    # Just leaving the following code as a comment, in case it might be useful to someone later.
    # params = from_json_string(open(filename).read())
    # target_directory = params['output_data'][0]['extra_files_path']
    # os.mkdir(target_directory)

    lib_was_built = False
    extracted_directory = None
    source_data_directory = None
    genome_build_directory = None
    # An argument counts as set when it is neither None nor the empty string.
    download_url_is_set = bool(args.download_url)
    download_location_is_set = bool(args.download_location)
    source_location_is_set = bool(args.source_location)
    build_location_is_set = bool(args.build_location)

    if download_url_is_set:
        print("The value of download_url argument is:\n\t{:s}".format(str(args.download_url)))
        sys.stdout.flush()
        if source_location_is_set:
            raise ValueError("Argument --source_location cannot be used in combination with --download_url.")
        if not download_location_is_set:
            raise ValueError("Argument --download_url requires that --download_location be specified.")
        downloaded_filename_full_path = download_genome_archive(
            source_url=args.download_url,
            destination=args.download_location,
            force_new_download=args.new_archive_download)
        print("\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path)))
        sys.stdout.flush()

        # Determine the library type once; both branches below depend on it.
        library_type = ctat_library_type(downloaded_filename_full_path)
        if library_type == _LIBTYPE_SOURCE_DATA:
            print("It is source data.")
            sys.stdout.flush()
            # If it is source_data, extract to download_location (the directory where the download was placed).
            extracted_directory = extract_genome_file(
                archive_filepath=downloaded_filename_full_path,
                destination=args.download_location,
                force_new_extraction=args.new_archive_download,
                keep_archive=args.keep_archive)
            source_data_directory = extracted_directory
        elif library_type == _LIBTYPE_PLUG_N_PLAY:
            print("It is plug-n-play data.")
            sys.stdout.flush()
            # The library is already built, so extract to the build location
            # when one was given, and otherwise to the download location.
            if build_location_is_set:
                extraction_destination = args.build_location
            else:
                extraction_destination = args.download_location
            extracted_directory = extract_genome_file(
                archive_filepath=downloaded_filename_full_path,
                destination=extraction_destination,
                force_new_extraction=args.new_archive_download,
                keep_archive=args.keep_archive)
            # There is no source_data_directory, so its value stays as None.

            # Look for the build directory. It should be inside the extracted_directory
            extracted_contents = os.listdir(extracted_directory)
            if len(extracted_contents) == 1:
                # Then that one file is a subdirectory that should be the build_directory.
                # That is how the plug-n-play directories are structured.
                genome_build_directory = os.path.join(extracted_directory, extracted_contents[0])
            else:
                # We need to search for the build directory, since there is more than one file.
                genome_build_directory = search_for_genome_build_dir(extracted_directory)
        else:
            raise ValueError("Unexpected CTAT Library type. Neither plug-n-play nor source_data:\n\t" +
                             "{:s}".format(downloaded_filename_full_path))
    elif source_location_is_set:
        # Then the user wants to build the directory from the source data.
        source_data_directory = os.path.realpath(args.source_location)
        print("\nThe program is being told that the source data is in:\n\t{:s}.\n".format(str(source_data_directory)))
        sys.stdout.flush()
    elif build_location_is_set:
        genome_build_directory = args.build_location

    if source_data_directory is not None:
        # Source data (downloaded or given by the user) still has to be built.
        if build_location_is_set:
            genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
        else:
            # We will build within a subdirectory of the source_data_directory .
            # The name of the build directory will be the default _CTAT_Build_dirname.
            # This _CTAT_Build_dirname directory will not exist until the library is built.
            genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)

    if not genome_build_directory:
        raise ValueError("At least one of --download_url, --source_location, or --build_location must be specified.")

    print(("\nThe location where the CTAT Genome Resource Library exists "
           "or will be built is {:s}.\n").format(str(genome_build_directory)))
    sys.stdout.flush()

    # To take out builds for testing, comment out the lines that do the building.
    # The command that builds the ctat genome library also has an option for building the gmap indexes.
    # That is why the gmap_build values are sent to build_the_library(), but if we are not building the
    # library, the user might still be asking for a gmap_build. That is done after rechecking for the
    # genome_build_directory.
    if source_data_directory is not None:
        build_the_library(source_data_directory,
                          genome_build_directory,
                          args.new_library_build,
                          args.gmap_build,
                          args.force_gmap_build)
        lib_was_built = True

    # The following looks to see if the library actually exists after the build,
    # and raises an error if it cannot find the library files.
    # The reassignment of genome_build_directory can be superfluous,
    # since many times the genome_build_directory will already point to the correct directory.
    # There are cases, however, where a user specifies a location that contains the
    # genome_build_directory rather than is the genome_build_directory.
    genome_build_directory = search_for_genome_build_dir(genome_build_directory)

    if args.gmap_build and not lib_was_built:
        # If we did not build the genome resource library
        # the user might still be asking for a gmap_build.
        gmap_the_library(genome_build_directory, args.force_gmap_build)
    sys.stdout.flush()

    if args.download_mutation_resources_url != "":
        download_and_integrate_mutation_resources(
            source_url=args.download_mutation_resources_url,
            genome_build_directory=genome_build_directory,
            cosmic_resources_location=args.cosmic_resources_location,
            force_new_download=args.new_mutation_download,
            force_new_integration=args.new_mutation_integration)

    # Need to get the genome name.
    genome_name = _find_library_genome_name(args, genome_build_directory, extracted_directory)

    # Determine the display_name for the library.
    if not args.display_name:
        # Create the display_name from the genome_name.
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
    else:
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
    display_name = display_name.replace(" ", "_")

    # Create a unique_id for the library.
    datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
    unique_id = genome_name + "." + datetime_stamp

    print("The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name))
    print("Its unique_id will be set to: {:s}\n".format(unique_id))
    print("Its dir_path will be set to: {:s}\n".format(genome_build_directory))
    sys.stdout.flush()

    data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
    data_manager_dict = {'data_tables': {'ctat_genome_resource_libs': [data_table_entry]}}

    # Temporarily the output file's dictionary is written for debugging:
    print("The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict)))
    sys.stdout.flush()
    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
    # which then puts it into the correct .loc file (I think).
    # Comment out the following lines when testing without galaxy package.
    # The with statement makes sure the file is flushed and closed.
    with open(args.output_filename, 'wb') as output_file:
        output_file.write(to_json_string(data_manager_dict))
|
|
1514
|
|
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|