62
|
1 #!/usr/bin/env python
|
|
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
|
|
3
|
|
4 # Written by H.E. Cicada Brokaw Dennis of Indiana University for the Broad Institute.
|
|
5 # Initial starting point was some code downloaded from the toolshed and
|
|
6 # other example code on the web.
|
|
7 # That code has however been extensively modified and augmented.
|
|
8
|
|
9 # This is part of Data Manager code to be used within a Galaxy.
|
|
10 # This Data Manager allows users to add entries to the ctat_genome_resource_libs table.
|
|
11
|
|
12 # This code allows downloading of a user selected Genome Reference Library
|
|
13 # from the CTAT Genome Resource Library website.
|
|
14 # It also provides for building libraries from source, doing a gmap_build over,
|
|
15 # and/or integrating mutation resources with, a Genome Reference Library.
|
|
16 # For more information on CTAT Genome Resource Libraries,
|
|
17 # see https://github.com/FusionFilter/FusionFilter/wiki
|
|
# Users can create or download their own libraries and use this Data Manager to add them
|
|
19 # if they don't want to add them by hand.
|
|
20
|
|
21 import sys
|
|
22 # The many calls to sys.stdout.flush() are done in order to get the output to be synchronized.
|
|
23 # Otherwise output from subprocesses can get streamed to stdout in a disjunct manner from
|
|
24 # the output of the process running this code.
|
|
25 # This is particularly evident in the stdout stream when running within a Galaxy instance.
|
|
26 import argparse
|
|
27 import os
|
|
28 import shutil
|
|
29 import tarfile
|
|
30 import hashlib
|
|
31 import urllib
|
|
32 import urlparse
|
|
33 import contextlib
|
|
34 import subprocess
|
|
35
|
|
36 # One can comment out the following line when testing without galaxy package.
|
|
37 # In that case, also comment out the last line in main(). That is, the line that uses to_json_string.
|
|
38 from galaxy.util.json import to_json_string
|
|
39
|
|
40 # The following is not being used, but leaving here as info
|
|
41 # in case one ever wants to get input values using json.
|
|
42 # from galaxy.util.json import from_json_string
|
|
43 # However in this datamanager, the command line arguments are used instead.
|
|
44
|
|
45 # datetime.now() is used to create the unique_id
|
|
46 from datetime import datetime
|
|
47
|
|
48 # The Data Manager uses a subclass of HTMLParser to look through a web page's html
|
|
49 # searching for the filenames within anchor tags.
|
|
50 import urllib2
|
|
51 from HTMLParser import HTMLParser
|
|
52
|
|
53 _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
|
|
54 _CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'
|
|
55 _CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
|
|
56 _CTAT_MutationLibDirname = 'ctat_mutation_lib'
|
|
57 _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
|
|
58 _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
|
|
59 _CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
|
|
60 _CTAT_RefGenome_Filename = 'ref_genome.fa'
|
|
61 _CTAT_MouseGenome_Prefix = 'Mouse'
|
|
62 _CTAT_HumanGenome_Prefix = 'GRCh'
|
|
63 _COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz'
|
|
64 _COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz'
|
|
65
|
|
66 # FIX - The following numbers need to be checked and other numbers for gmap, etc. need to be determined.
|
|
67 # Values for each genome should be determined, so we can get more precise values for each genome.
|
|
68 _NumBytesNeededForSourceDataExtraction = 10737418240 # 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB.
|
|
69 _NumBytesNeededForPlugNPlayExtraction = 48318382080 # 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB.
|
|
70 # Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB.
|
|
71 # Fix - double check what amount needed when the library is gmap'ed.
|
|
72 _NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct.
|
|
73 _NumBytesNeededForMutationResources = 4294967296 # 4 Gigabytes. Actually need about 3.8GB.
|
|
74 # Once built the downloaded archive could be deleted to reduce the amount used, but with the archive
|
|
75 # there and the Cosmic files and the built ctat_mutation_library, 3.8GB is needed.
|
|
76 # If the archive files are deleted after the integration of the library, only 1.8GB would be used at that point.
|
|
77 # This program does not currently provide a method for deleting the mutation resource archive files.
|
|
78 _Write_TestFile = 'write_testfile.txt'
|
|
79 _DownloadSuccessFile = 'download_succeeded.txt'
|
|
80 _ExtractionSuccessFile = 'extraction_succeeded.txt'
|
|
81 _LibBuiltSuccessFile = 'build_succeeded.txt'
|
|
82 _GmapSuccessFile = 'gmap_succeeded.txt'
|
|
83 _MutationDownloadSuccessFile = 'mutation_download_succeeded.txt'
|
|
84 _MutationIntegrationSuccessFile = 'mutation_integration_succeeded.txt'
|
|
85 _LIBTYPE_SOURCE_DATA = 'source_data'
|
|
86 _LIBTYPE_PLUG_N_PLAY = 'plug-n-play'
|
|
87
|
|
class resumable_URL_opener(urllib.FancyURLopener):
    # This class is used to do downloads that can restart a download from
    # the point where it left off after a partial download was interrupted.
    # This class and code using it was found online:
    # http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
    # A sub-class is created in order to override error 206.
    # This error means a partial file is being sent, which is the expected
    # server response to a "Range" request header when resuming,
    # which is ok in this case. Do nothing with this error.
    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        # Deliberately ignore HTTP 206 (Partial Content) so a resumed
        # download is not treated as a failure.
        pass
# End of class resumable_URL_opener
|
|
99
|
|
class FileListParser(HTMLParser):
    # Scrapes the downloadable tar.gz archive links out of an HTML page.
    # Used by get_ctat_genome_urls() and get_mutation_resource_urls(), which
    # can be called by the Data Manager interface (.xml file) to get the
    # filenames that are available online at broadinstitute.org.
    # Apparently creating dynamic option lists this way is deprecated
    # (I believe it is considered a security risk), but no other method
    # exists by which the options can be obtained dynamically from the web.
    #
    # After feed()-ing the page's HTML to an instance, self.urls holds the
    # href of every anchor that references a tar.gz file, excluding the
    # accompanying md5 checksum files.
    def __init__(self):
        # HTMLParser is an "old style" class whose inheritance chain does not
        # include object, so super() cannot be used; call the base
        # initializer directly instead.
        HTMLParser.__init__(self)
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        # Collect the href of each anchor tag that points at a tar.gz
        # archive (skipping the md5 checksum files).
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name != "href":
                continue
            if ("tar.gz" in attr_value) and ("md5" not in attr_value):
                self.urls.add(attr_value)
# End of class FileListParser
|
|
129
|
|
130 def get_ctat_genome_urls():
|
|
131 # open the url and retrieve the urls of the files in the directory.
|
|
132 # If we can't get the list, send a default list.
|
|
133
|
|
134 build_default_list = False
|
|
135 default_url_filename = "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz"
|
|
136 resource = urllib2.urlopen(_CTAT_ResourceLib_URL)
|
|
137 if resource is None:
|
|
138 build_default_list = True
|
|
139 else:
|
|
140 theHTML = resource.read()
|
|
141 if (theHTML is None) or (theHTML == ""):
|
|
142 build_default_list = True
|
|
143 if build_default_list:
|
|
144 # These are the filenames for what was there at least until 2018/10/09.
|
|
145 urls_to_return = set()
|
|
146 urls_to_return.add("GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz")
|
|
147 urls_to_return.add("GRCh37_v19_CTAT_lib_Feb092018.source_data.tar.gz")
|
|
148 urls_to_return.add("GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz")
|
|
149 urls_to_return.add("GRCh38_v27_CTAT_lib_Feb092018.source_data.tar.gz")
|
|
150 urls_to_return.add("Mouse_M16_CTAT_lib_Feb202018.plug-n-play.tar.gz")
|
|
151 urls_to_return.add("Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz")
|
|
152 else:
|
|
153 filelist_parser = FileListParser()
|
|
154 filelist_parser.feed(theHTML)
|
|
155 urls_to_return = filelist_parser.urls
|
|
156
|
|
157 # For dynamic options need to return an itterable with contents that are tuples with 3 items.
|
|
158 # Item one is a string that is the display name put into the option list.
|
|
159 # Item two is the value that is put into the parameter associated with the option list.
|
|
160 # Item three is a True or False value, indicating whether the item is selected.
|
|
161 options = []
|
|
162 found_default_url = False
|
|
163 if len([item for item in urls_to_return if default_url_filename in item]) > 0:
|
|
164 found_default_url = True
|
|
165 for i, url in enumerate(filelist_parser.urls):
|
|
166 # The urls should look like:
|
|
167 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
|
|
168 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
|
|
169 # But in actuality, they are coming in looking like:
|
|
170 # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
|
|
171 # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
|
|
172 # Write code to handle both situations, or an ftp: url.
|
|
173 url_parts = urlparse.urlparse(url)
|
|
174 if (url_parts.scheme != ""):
|
|
175 full_url_path = url
|
|
176 else:
|
|
177 # Assume the path is relative to the page location.
|
|
178 full_url_path = os.path.join(_CTAT_ResourceLib_URL, url)
|
|
179 filename = os.path.basename(url)
|
|
180 if (found_default_url and (filename == default_url_filename)) or ((not found_default_url) and (i == 0)):
|
|
181 # This should be the default option chosen.
|
|
182 options.append((filename, full_url_path, True))
|
|
183 else:
|
|
184 options.append((filename, full_url_path, False))
|
|
185 options.sort() # So the list will be in alphabetical order.
|
|
186 # return a tuple of the urls
|
|
187 print "The list being returned as options is:"
|
|
188 print "{:s}\n".format(str(options))
|
|
189 sys.stdout.flush()
|
|
190 return options
|
|
191
|
|
192 def get_mutation_resource_urls():
|
|
193 # FIX - Perhaps rather than letting the user choose a mutation resource url,
|
|
194 # should we download the correct one for the chosen library?
|
|
195 # Not sure about this.
|
|
196 # In that case we wouldn't provide a pull down interface that would call this.
|
|
197 # FIX -
|
|
198 build_default_list = False
|
|
199 resource = urllib2.urlopen(_CTAT_Mutation_URL)
|
|
200 if resource is None:
|
|
201 build_default_list = True
|
|
202 else:
|
|
203 theHTML = resource.read()
|
|
204 if (theHTML is None) or (theHTML == ""):
|
|
205 build_default_list = True
|
|
206 if build_default_list:
|
|
207 # These are the filenames for what was there at least until 2018/10/09.
|
|
208 urls_to_return = set()
|
|
209 urls_to_return.add("mutation_lib.hg19.tar.gz")
|
|
210 urls_to_return.add("mutation_lib.hg38.tar.gz")
|
|
211 else:
|
|
212 filelist_parser = FileListParser()
|
|
213 filelist_parser.feed(theHTML)
|
|
214 urls_to_return = filelist_parser.urls
|
|
215
|
|
216 # For dynamic options need to return an itterable with contents that are tuples with 3 items.
|
|
217 # Item one is a string that is the display name put into the option list.
|
|
218 # Item two is the value that is put into the parameter associated with the option list.
|
|
219 # Item three is a True or False value, indicating whether the item is selected.
|
|
220 options = []
|
|
221 for i, url in enumerate(filelist_parser.urls):
|
|
222 # The urls should look like:
|
|
223 # https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz
|
|
224 # https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz
|
|
225 # But in actuality, they are coming in looking like:
|
|
226 # hg19.tar.gz
|
|
227 # mc7.tar.gz
|
|
228 #
|
|
229 # On 2018/10/06, the following tar.gz files were present:
|
|
230 # mutation_lib.hg19.tar.gz
|
|
231 # mutation_lib.hg38.tar.gz
|
|
232 # mc-7.tar.gz
|
|
233 # ctat_mutation_demo.tar.gz
|
|
234 #
|
|
235 # Write code to handle both situations, or an ftp: url.
|
|
236 url_parts = urlparse.urlparse(url)
|
|
237 if (url_parts.scheme != ""):
|
|
238 full_url_path = url
|
|
239 else:
|
|
240 # Assume the path is relative to the page location.
|
|
241 full_url_path = os.path.join(_CTAT_Mutation_URL, url)
|
|
242 filename = os.path.basename(url)
|
|
243 if (filename.split(".")[0] == "mutation_lib"):
|
|
244 # As of 2018_10_09, the only ones supported have mutation_lib as the first part of the name.
|
|
245 options.append((filename, full_url_path, i == 0))
|
|
246 options.sort() # So the list will be in alphabetical order.
|
|
247 # return a tuple of the urls
|
|
248 print "The list being returned as options is:"
|
|
249 print "{:s}\n".format(str(options))
|
|
250 sys.stdout.flush()
|
|
251 return options
|
|
252
|
|
253 # The following was used by the example program to get input parameters through the json.
|
|
254 # Just leaving here for reference.
|
|
255 # We are getting all of our parameter values through command line arguments.
|
|
256 #def get_reference_id_name(params):
|
|
257 # genome_id = params['param_dict']['genome_id']
|
|
258 # genome_name = params['param_dict']['genome_name']
|
|
259 # return genome_id, genome_name
|
|
260 #
|
|
261 #def get_url(params):
|
|
262 # trained_url = params['param_dict']['trained_url']
|
|
263 # return trained_url
|
|
264
|
|
265 def print_directory_contents(dir_path, num_levels):
|
|
266 # This procedure is used to help with debugging and for user information.
|
|
267 if num_levels > 0:
|
|
268 if os.path.exists(dir_path) and os.path.isdir(dir_path):
|
|
269 print "\nDirectory {:s}:".format(dir_path)
|
|
270 sys.stdout.flush()
|
|
271 subprocess.call("ls -la {:s} 2>&1".format(dir_path), shell=True)
|
|
272 else:
|
|
273 print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
|
|
274 sys.stdout.flush()
|
|
275 if num_levels > 1:
|
|
276 if os.path.exists(dir_path) and os.path.isdir(dir_path):
|
|
277 for filename in os.listdir(dir_path):
|
|
278 filename_path = os.path.join(dir_path, filename)
|
|
279 if os.path.exists(filename_path) and os.path.isdir(filename_path):
|
|
280 print_directory_contents(filename_path, num_levels-1)
|
|
281 else:
|
|
282 print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
|
|
283 sys.stdout.flush()
|
|
284
|
|
def which(file):
    # Rough equivalent of the linux "which" command: return the path of the
    # first directory in the PATH environment variable that contains `file`,
    # joined with the filename, or None if no PATH directory contains it.
    # Unlike the real which, this does not check that the file is executable.
    for directory in os.environ["PATH"].split(os.pathsep):
        candidate = os.path.join(directory, file)
        if os.path.exists(candidate):
            return candidate
    return None
|
|
293
|
|
def size_of_file_at(file_url):
    # Return the size in bytes of the file at file_url, read from the
    # Content-Length response header. The url has to be opened in order
    # to obtain the header.
    opener = resumable_URL_opener()
    filelike = opener.open(file_url)
    with contextlib.closing(filelike):
        return int(filelike.headers['Content-Length'])
|
|
301
|
|
def md5sum_for(filename, blocksize=2**20):
    # Compute the md5 hex digest of the file at `filename`, reading it in
    # chunks of blocksize bytes so arbitrarily large files can be hashed
    # without loading them whole into memory.
    digest = hashlib.md5()
    with open(filename, "rb") as infile:
        while True:
            chunk = infile.read(blocksize)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
|
|
314
|
|
def ctat_library_type(filepath):
    # Return the library-type field embedded in a CTAT archive's filename:
    # the second dot-separated piece of the basename, e.g. "plug-n-play" or
    # "source_data". Archives of type source_data have to be built after
    # they are downloaded.
    base_filename = os.path.basename(filepath)
    return base_filename.split(".")[1]
|
|
324
|
|
def find_genome_name_in_path(path, raise_error=False):
    # Search the elements of `path` for a CTAT genome name, recognized by the
    # Mouse/GRCh prefixes. The form of the genome name in directory names
    # (if present in the path) looks like:
    # GRCh37_v19_CTAT_lib_Feb092018
    # GRCh38_v27_CTAT_lib_Feb092018
    # Mouse_M16_CTAT_lib_Feb202018
    # Returns the last matching element with any filename extension removed,
    # or None when no element matches.
    # When raise_error is True, a ValueError is raised if there is no genome
    # name in the given path.
    genome_name = None
    if (path is not None) and (path != ""):
        for element in path.split(os.sep):
            if element.startswith(_CTAT_MouseGenome_Prefix) \
                or element.startswith(_CTAT_HumanGenome_Prefix):
                # Remove any extension that might be in the filename.
                genome_name = element.split(".")[0]
    if ((genome_name is None) or (genome_name == "")) and raise_error:
        # BUG FIX: the original message had no format placeholder ("...\n\t"
        # with .format(path)), so the offending path was never shown.
        raise ValueError("Cannot find genome name in the given filename path:\n\t{:s}".format(path))
    return genome_name
|
|
342
|
|
def bytes_needed_to_extract(archive_filepath):
    # Return the number of bytes of free space required to extract the given
    # archive, based on its library type (source_data vs plug-n-play).
    # FIX -- The following should be replaced by a series of statements that return the right value for each archive.
    # The numbers used now are estimates for the human genome, and so are big enough for the mouse genome, so ok for now.
    # FIX --
    if ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA:
        return _NumBytesNeededForSourceDataExtraction
    # Assume otherwise that it is a plug-n-play archive.
    return _NumBytesNeededForPlugNPlayExtraction
|
|
353
|
|
def bytes_needed_to_build(source_data_filepath):
    # Return the number of bytes of free space required to build a library
    # from the given source_data archive.
    # FIX - The following should be replaced by a series of statements that return the right value for each archive
    # (source_data_filepath is currently unused for that reason).
    # The number used now estimates the largest size needed. Also, it is probably not correct.
    return _NumBytesNeededForBuild
|
|
358
|
|
359 def create_success_file(full_file_path, contents=None):
|
|
360 # full_file_path is the path to the file to write.
|
|
361 # It should not exist before calling this function,
|
|
362 # but if it does, it will be overwritten.
|
|
363 # contents is some text that will be written into the file.
|
|
364 # It can be empty and nothing will be written.
|
|
365 try:
|
|
366 with open(full_file_path,"w") as success_file:
|
|
367 if contents is not None:
|
|
368 success_file.write(contents)
|
|
369 # else nothing is written into it, but we still will have created the file.
|
|
370 except IOError:
|
|
371 print "The success indication file could not be created: " + \
|
|
372 "{:s}".format(full_file_path)
|
|
373 sys.stdout.flush()
|
|
374 raise
|
|
375
|
|
376 def download_file_from_url(file_url, dest_dir, resume_download=True):
|
|
377 # Some of the code used in this procedure was downloaded and modified for our needs.
|
|
378 # That code was at: http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
|
|
379 # Given a file_url, downloads that file to dest_dir.
|
|
380 # The url must specify a file to download, so I can grab the filename from the end of the url's path.
|
|
381 # It is best to fully specify dest_dir. Otherwise the dest_dir will be opened relative to whatever cwd is.
|
|
382 # If resume_download is True (the default), the function will attempt to resume the download where it left off,
|
|
383 # if, for example, a previous download was interupted.
|
|
384 # If resume_download is False, any existing download of the file is deleted and a new download is started.
|
|
385
|
|
386 # DOWNLOAD_BLOCK_SIZE = 65536 # 64KB. Old number was 8192 or 8KB.
|
|
387 DOWNLOAD_BLOCK_SIZE = 1048576 # 1 MB
|
|
388 download_complete = False
|
|
389 existing_size = 0
|
|
390 bytes_read = 0
|
|
391 file_retriever = resumable_URL_opener()
|
|
392 dest_filename = os.path.basename(file_url)
|
|
393 dest_fullpath = os.path.join(dest_dir, dest_filename)
|
|
394 source_filesize = size_of_file_at(file_url)
|
|
395 print "Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize)
|
|
396 print "Destination file for the download is {:s}".format(dest_fullpath)
|
|
397 sys.stdout.flush()
|
|
398
|
|
399 # If the file exists and resume_download is requested, then only download the remainder
|
|
400 if resume_download and os.path.exists(dest_fullpath):
|
|
401 existing_size = os.path.getsize(dest_fullpath)
|
|
402 #If the file exists, but we already have the whole thing, don't download again
|
|
403 print "The destination file exists and is {:d} bytes in size.".format(existing_size)
|
|
404 if (source_filesize == existing_size):
|
|
405 print "The file has already been completely downloaded:\n\t{:s}".format(dest_fullpath)
|
|
406 download_complete = True
|
|
407 else:
|
|
408 header = "Range","bytes={:s}-".format(str(existing_size))
|
|
409 print "Adding header to resume download:\n\t{:s}".format(header)
|
|
410 file_retriever.addheader("Range","bytes={:s}-".format(str(existing_size)))
|
|
411 # We open even if download is complete, to avoid adding code to determine whether to close.
|
|
412 output_file = open(dest_fullpath,"ab")
|
|
413 else:
|
|
414 if os.path.exists(dest_fullpath):
|
|
415 print "The destination file exists:\n\t{:s}".format(dest_fullpath)
|
|
416 print "However a new download has been requested."
|
|
417 print "The download will overwrite the existing file."
|
|
418 else:
|
|
419 print "The destination file does not exist yet."
|
|
420 existing_size = 0
|
|
421 output_file = open(dest_fullpath,"wb")
|
|
422 sys.stdout.flush()
|
|
423
|
|
424 try:
|
|
425 # Check whether there is enough space on the device for the rest of the file to download.
|
|
426 statvfs = os.statvfs(dest_dir)
|
|
427 num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
|
|
428 # num_avail_bytes is the number of free bytes that ordinary users
|
|
429 # are allowed to use (excl. reserved space)
|
|
430 # Perhaps should subtract some padding amount from num_avail_bytes
|
|
431 # rather than raising only if there is less than exactly what is needed.
|
|
432 if (num_avail_bytes < (source_filesize-existing_size)):
|
|
433 raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
|
|
434 " on the device of the destination directory for the download: " + \
|
|
435 "{:s}".format(cannonical_destination))
|
|
436
|
|
437 source_file = file_retriever.open(file_url)
|
|
438 while not download_complete:
|
|
439 data = source_file.read(DOWNLOAD_BLOCK_SIZE)
|
|
440 if data:
|
|
441 output_file.write(data)
|
|
442 bytes_read = bytes_read + len(data)
|
|
443 else:
|
|
444 download_complete = True
|
|
445 source_file.close()
|
|
446 except IOError:
|
|
447 print "Error while attempting to download {:s}".format(file_url)
|
|
448 sys.stdout.flush()
|
|
449 raise
|
|
450 finally:
|
|
451 output_file.close()
|
|
452
|
|
453 for k,v in source_file.headers.items():
|
|
454 print k, "=",v
|
|
455 print "Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url))
|
|
456 dest_filesize = os.path.getsize(dest_fullpath)
|
|
457 print "{:s} {:s}".format(str(dest_filesize), str(dest_fullpath))
|
|
458 sys.stdout.flush()
|
|
459 if source_filesize != dest_filesize:
|
|
460 raise IOError("Download error:\n\t" + \
|
|
461 "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) + \
|
|
462 "and the destination file\n\t\t{:d}\t{:s}\n\t".format(dest_filesize, dest_fullpath) + \
|
|
463 "are different sizes.")
|
|
464 return dest_fullpath
|
|
465
|
|
def ensure_we_can_write_numbytes_to(destination, numbytes):
    # Attempts to create the destination directory if it does not exist.
    # Tests whether a file can be written to that directory.
    # Tests whether there is numbytes space on the device of the destination.
    # Raises errors (ValueError, OSError, IOError, os.error) if it cannot do any of the above.
    #
    # Returns the full specification of the destination path.
    # We want to make sure that destination is an absolute fully specified path.
    cannonical_destination = os.path.realpath(destination)
    if os.path.exists(cannonical_destination):
        if not os.path.isdir(cannonical_destination):
            raise ValueError("The destination is not a directory: " + \
                "{:s}".format(cannonical_destination))
        # else all is good. It is a directory.
    else:
        # We need to create it since it does not exist.
        try:
            os.makedirs(cannonical_destination)
        except os.error:
            print "ERROR: Trying to create the following directory path:"
            print "\t{:s}".format(cannonical_destination)
            sys.stdout.flush()
            raise
    # Make sure the directory now exists and we can write to it.
    if not os.path.exists(cannonical_destination):
        # It should have been created, but if it doesn't exist at this point
        # in the code, something is wrong. Raise an error.
        raise OSError("The destination directory could not be created: " + \
            "{:s}".format(cannonical_destination))
    # Prove writability by creating (and immediately removing) a small test file.
    test_writing_filename = "{:s}.{:s}".format(os.path.basename(cannonical_destination), _Write_TestFile)
    test_writing_filepath = os.path.join(cannonical_destination, test_writing_filename)
    try:
        with open(test_writing_filepath, "w") as test_writing_file:
            test_writing_file.write("Testing writing to this file.")
        if os.path.exists(test_writing_filepath):
            os.remove(test_writing_filepath)
    except IOError:
        print "The destination directory could not be written into:\n\t" + \
            "{:s}".format(cannonical_destination)
        sys.stdout.flush()
        raise
    # Check whether there are numbytes available on cannonical_destination's device.
    statvfs = os.statvfs(cannonical_destination)
    # fs_size = statvfs.f_frsize * statvfs.f_blocks # Size of filesystem in bytes
    # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree # Actual number of free bytes
    num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail # Number of free bytes that ordinary users
                                                          # are allowed to use (excl. reserved space)
    if (num_avail_bytes < numbytes):
        raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
            " on the device of the destination directory:\n\t" + \
            "{:s}\n\t{:d} bytes are needed.".format(cannonical_destination, numbytes))

    return cannonical_destination
|
|
519
|
|
def download_genome_archive(source_url, destination, force_new_download=False):
    # This function downloads but does not extract the archive at source_url.
    # This function can be called on a file whose download was interrupted, and if force_new_download
    # is False, the download will proceed where it left off.
    # If download does not succeed, an IOError is raised.
    # The function checks whether there is enough space at the destination for the expanded library.
    # It raises an OSError if not.
    # ValueError can also be raised by this function.

    # Input Parameters
    # source_url is the full URL of the file we want to download.
    # It should look something like:
    # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # If only the filename is given, it is assumed to reside at _CTAT_ResourceLib_URL.
    # destination is the location (directory) where a copy of the source file will be placed.
    # Relative paths are expanded using the current working directory, so within Galaxy,
    # it is best to send in absolute fully specified path names so you know to where
    # the source file is going to be copied.
    # force_new_download if True, will cause a new download to occur, even if the file has been downloaded previously.
    #
    # Returns the canonical path to the file that was downloaded.

    dest_fullpath = None
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
        source_url = urlparse.urljoin(_CTAT_ResourceLib_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.

    print "Downloading:\n\t{:s}".format(str(source_url))
    print "to:\n\t{:s}".format(destination)
    sys.stdout.flush()
    # The next is done so that if the source_url does not have a genome name in it, an error will be raised.
    find_genome_name_in_path(source_url, raise_error=True)
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url))

    # Get the list of files in the directory,
    # We use it to check for a previous download.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))
    # See whether the file has been downloaded already:
    # a "<archive>.download_succeeded.txt" marker file records a prior success.
    download_success_filename = "{:s}.{:s}".format(source_filename, _DownloadSuccessFile)
    download_success_full_file_path = os.path.join(cannonical_destination, download_success_filename)
    if ((download_success_filename not in orig_files_in_destdir) \
        or force_new_download):
        if (download_success_filename in orig_files_in_destdir):
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_full_file_path)
        # The following raises an error if the download fails for some reason.
        dest_fullpath = download_file_from_url(source_url, cannonical_destination, \
            resume_download=(not force_new_download))
        # Check the md5sum of the cannonical_destination file to ensure the data in the file is correct.
        # The server hosts a "<archive>.md5" companion file next to each archive.
        file_retriever = resumable_URL_opener()
        md5_url = "{:s}.md5".format(source_url)
        print "Checking the md5sum of the downloaded file."
        try:
            # NOTE(review): the second argument of FancyURLopener.open() is POST
            # data, not a file mode -- passing "r" here looks suspect; confirm.
            md5_file = file_retriever.open(md5_url, "r")
            # The first line of the md5 file is "<md5sum> <filename>"; keep the sum.
            md5sum_from_web = md5_file.readlines()[0].strip().split()[0]
            md5_file.close()
            md5sum_from_file = md5sum_for(dest_fullpath)
        except IOError:
            print "Error while attempting to check the md5sum for {:s}".format(dest_fullpath)
            sys.stdout.flush()
            raise
        if md5sum_from_web != md5sum_from_file:
            raise IOError("Download error:\n\t" + \
                "The md5 sum for\n\t\t({:s})\n\t".format(dest_fullpath) + \
                "does not match the value read from the web:\n\t\t" + \
                "({:s} != {:s})".format(md5sum_from_file, md5sum_from_web))
        print "Check of md5sum succeeded."
        create_success_file(download_success_full_file_path, \
            "Download of:\n\t{:s}\n".format(source_url) + \
            "to:\n\t{:s}\nsucceeded.".format(dest_fullpath))
    elif download_success_filename in orig_files_in_destdir:
        # A prior download completed successfully; reuse that file.
        print "The download success file exists, so no download is being attempted:"
        print "\t{:s}".format(download_success_full_file_path)
        print "Remove the file or set <Force New Download> if you want a new download to occur."
        dest_filename = os.path.basename(source_url)
        dest_fullpath = os.path.join(cannonical_destination, dest_filename)
    else:
        print "download_genome_archive(): This code should never be printed. Something is wrong."
    sys.stdout.flush()

    # Some code to help us if errors occur.
    print "\n*******************************"
    print "* Finished download. *"
    sys.stdout.flush()
    print_directory_contents(cannonical_destination, 1)
    print "*******************************\n"
    sys.stdout.flush()

    return dest_fullpath
|
|
615
|
|
616 def extract_archive(archive_filepath, destination, force_new_extraction=False):
|
|
617 # Generic function will use tarfile object to extract the given archive_filepath
|
|
618 # to the destination. If a file indicating a previous successful extraction exists
|
|
619 # the file is not extracted again unless force_new_extraction is True.
|
|
620 # This procedure does not write the extraction success file, because some error checking
|
|
621 # is dependant on the file being extracted. The calling procedure can/should write the
|
|
622 # success file after doing error checking.
|
|
623 cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
|
|
624
|
|
625 # Create the name of the file used to indicate prior success of the file's extraction.
|
|
626 extraction_success_filename = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
|
|
627 extraction_success_full_file_path = os.path.join(cannonical_destination, extraction_success_filename)
|
|
628 #print "extraction_success_filename is {:s}".format(extraction_success_filename)
|
|
629
|
|
630 orig_files_in_destination = set(os.listdir(cannonical_destination))
|
|
631 if ((extraction_success_filename not in orig_files_in_destination) \
|
|
632 or force_new_extraction):
|
|
633 # Do the extraction.
|
|
634 if (extraction_success_filename in orig_files_in_destination):
|
|
635 # Since we are redoing the extraction,
|
|
636 # the success file needs to be removed
|
|
637 # until the extraction has succeeded.
|
|
638 os.remove(extraction_success_full_file_path)
|
|
639 with tarfile.open(archive_filepath, mode="r:*") as archive_file:
|
|
640 archive_file.extractall(path=cannonical_destination)
|
|
641 elif (extraction_success_filename in orig_files_in_destination):
|
|
642 # The archive was successfully extracted before so we do not do it again.
|
|
643 print "The extraction success file exists, so no new extraction was attempted:"
|
|
644 print "\t{:s}".format(extraction_success_full_file_path)
|
|
645 print "Remove the success file or set <force new extraction> if you want a new extraction to occur."
|
|
646 else:
|
|
647 print "extract_archive(): This code should never be printed. Something is wrong."
|
|
648 sys.stdout.flush()
|
|
649
|
|
650 # Some code to help us if errors occur.
|
|
651 print "\n*******************************************************"
|
|
652 print "* Finished extraction. Destination directory listing. *"
|
|
653 sys.stdout.flush()
|
|
654 print_directory_contents(cannonical_destination, 1)
|
|
655 print "*******************************************************\n"
|
|
656 sys.stdout.flush()
|
|
657 return
|
|
658
|
|
659 def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
|
|
660 # Extract a CTAT Genome Reference Library archive file.
|
|
661 # It is best if archive_filepath is an absolute, fully specified filepath, not a relative one.
|
|
662 # destination is the directory to which the archive will be extracted.
|
|
663 # force_new_extraction can be used to cause extraction to occur, even if the file was extracted before.
|
|
664 #
|
|
665 # Returns extracted_directory
|
|
666 # The full path of the top level directory that is
|
|
667 # created by the extraction of the files from the archive.
|
|
668
|
|
669 print "Extracting:\n\t {:s}".format(str(archive_filepath))
|
|
670 print "to:\n\t{:s}".format(destination)
|
|
671 sys.stdout.flush()
|
|
672 cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
|
|
673 # Get the root filename of the Genome Directory from the source file's name.
|
|
674 # That should also be the name of the extracted directory.
|
|
675 genome_dirname = find_genome_name_in_path(archive_filepath, raise_error=True)
|
|
676
|
|
677 orig_files_in_destination = set(os.listdir(cannonical_destination))
|
|
678 extract_archive(archive_filepath, destination, force_new_extraction)
|
|
679 newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destination
|
|
680
|
|
681 if (genome_dirname not in newfiles_in_destdir):
|
|
682 # Perhaps it has a different name than what we expect it to be.
|
|
683 # It will be a sub-directory that was not in the directory
|
|
684 # before we did the download and extraction.
|
|
685 found_filename = None
|
|
686 if len(newfiles_in_destdir) == 1:
|
|
687 found_filename = newfiles_in_destdir[0]
|
|
688 else:
|
|
689 for filename in newfiles_in_destdir:
|
|
690 # In most cases, there will only be one new file, but some OS's might have created
|
|
691 # other files in the directory.
|
|
692 # Look for the directory that was downloaded and extracted.
|
|
693 # The correct file's name should be a substring of the tar file that was downloaded.
|
|
694 if filename in src_filename:
|
|
695 # make sure it is a directory
|
|
696 if os.path.isdir(os.path.join(cannonical_destination,filename)):
|
|
697 found_filename = filename
|
|
698 if found_filename is not None:
|
|
699 genome_dirname = found_filename
|
|
700
|
|
701 extracted_directory = os.path.join(cannonical_destination, genome_dirname)
|
|
702 if (os.path.exists(extracted_directory)):
|
|
703 # Create the name of the file used to indicate prior success of the file's extraction.
|
|
704 extraction_success_filename = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
|
|
705 extraction_success_full_file_path = os.path.join(cannonical_destination, extraction_success_filename)
|
|
706 create_success_file(extraction_success_full_file_path, \
|
|
707 "Extraction of:\n\t{:s}\n".format(archive_filepath) + \
|
|
708 "to:\n\t{:s}\nsucceeded.".format(extracted_directory))
|
|
709 else:
|
|
710 raise ValueError("ERROR: Could not find the extracted directory in the destination directory:" + \
|
|
711 "\n\t{:s}".format(cannonical_destination))
|
|
712 if not keep_archive:
|
|
713 # We are done extracting, so remove the archive file.
|
|
714 if os.path.exists(archive_filepath):
|
|
715 print "Removing the archive file:\n\t{:s}".format(archive_filepath)
|
|
716 sys.stdout.flush()
|
|
717 os.remove(archive_filepath)
|
|
718 # else: # It was removed previously, so we don't need to remove it again.
|
|
719 return extracted_directory
|
|
720
|
|
def get_gmap_success_filename(genome_build_directory):
    # Single place that builds the gmap success-marker filename, so the two
    # call sites that previously constructed it independently stay in sync.
    # FIX - A static prefix such as "gmap_build" might be more logical than the
    # genome name as the first part (it would not differ between libraries);
    # left as-is to avoid another round of testing.
    name = find_genome_name_in_path(genome_build_directory)
    if name is None:
        # Fall back to the build directory's own name when no genome name is found.
        name = os.path.basename(genome_build_directory)
    return "{:s}.{:s}".format(name, _GmapSuccessFile)
|
|
732
|
|
def gmap_the_library(genome_build_directory, force_new_gmap=False):
    # Run gmap_build over an existing CTAT Genome Resource Library; this is the
    # processing that needs to happen for the ctat_gmap_fusion tool to work.
    # genome_build_directory should normally be a fully specified path,
    # though this function should work even if it is relative.
    # force_new_gmap forces a re-run even when a success marker already exists.
    # The gmap_build command prints messages out to stderr, even when there is not an error,
    # so I route stderr to stdout.
    # Raises subprocess.CalledProcessError if gmap_build exits with a non-zero status.

    # Create the name of the file used to indicate prior success of gmap.
    gmap_success_filename = get_gmap_success_filename(genome_build_directory)
    gmap_success_full_file_path = os.path.join(genome_build_directory, gmap_success_filename)

    orig_files_in_build_dir = set(os.listdir(genome_build_directory))
    if ((gmap_success_filename not in orig_files_in_build_dir) \
        or force_new_gmap):
        # Do the gmap.
        if (gmap_success_filename in orig_files_in_build_dir):
            # Since we are redoing the gmap,
            # the success file needs to be removed
            # until the gmap has succeeded.
            os.remove(gmap_success_full_file_path)
        # The trailing 2>&1 merges gmap_build's stderr chatter into stdout.
        command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format( \
            genome_build_directory, genome_build_directory)
        print "Doing a gmap_build with the following command:\n\t{:s}\n".format(command)
        sys.stdout.flush()
        try: # to send the gmap_build command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print "ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command)
            sys.stdout.flush()
            raise
        finally:
            sys.stdout.flush()
            # Some code to help us if errors occur.
            print "\n*******************************\nAfter running gmap_build."
            sys.stdout.flush()
            print_directory_contents(genome_build_directory, 2)
            print "*******************************\n"
            sys.stdout.flush()
        # Only reached on success (a CalledProcessError above propagates),
        # so it is safe to record the success marker now.
        create_success_file(gmap_success_full_file_path, \
            "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    elif gmap_success_filename in orig_files_in_build_dir:
        print "The gmap success file exists, so no gmap is being attempted:"
        print "\t{:s}".format(gmap_success_full_file_path)
        print "Remove the file or set <force new gmap> if you want a new gmap to occur."
    else:
        print "gmap_the_library(): This code should never be printed. Something is wrong."
    sys.stdout.flush()
    return
|
|
781
|
|
782
|
|
def build_the_library(genome_source_directory, \
                      genome_build_directory, force_new_build=False, \
                      gmap_build=False, force_gmap_build=False):
    """ genome_source_directory is the location of the source_data needed to build the library.
            Normally it is fully specified, but could be relative.
        genome_build_directory is the location where the library will be built.
            It can be relative to the current working directory or an absolute path.
        build specifies whether to run prep_genome_lib.pl even if it was run before.
        gmap_build specifies whether to run gmap_build or not.
        The prep_genome_lib.pl command can send messages out to stderr, even when there is not an error,
        so I route stderr to stdout.

        Following was the old way to do it. Before FusionFilter 0.5.0.
        prep_genome_lib.pl \
            --genome_fa ref_genome.fa \
            --gtf ref_annot.gtf \
            --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
            --fusion_annot_lib fusion_lib.dat.gz
            --output_dir ctat_genome_lib_build_dir
        index_pfam_domain_info.pl \
            --pfam_domains PFAM.domtblout.dat.gz \
            --genome_lib_dir ctat_genome_lib_build_dir
        gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa"
    """

    # Validate the source directory before doing anything else.
    if (genome_source_directory is None) or (genome_source_directory == "" ) or not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " + \
            "The source directory does not exist:\n\t{:s}".format(str(genome_source_directory)))
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, \
                                 bytes_needed_to_build(genome_source_directory))
    print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory))
    print "The Destination directory is at:\n\t{:s}".format(str(cannonical_destination))
    sys.stdout.flush()

    # Get the root filename of the Genome Directory.
    src_filename = os.path.basename(genome_source_directory)
    # See whether the library has been built already. The success file is written into the source directory.
    files_in_sourcedir = set(os.listdir(genome_source_directory))
    build_success_filename = "{:s}.{:s}".format(src_filename, _LibBuiltSuccessFile)
    build_success_file_path = os.path.join(genome_source_directory, build_success_filename)
    if (build_success_filename not in files_in_sourcedir) or force_new_build:
        # prep_genome_lib.pl expects to run from inside the source directory,
        # where the input files (ref_genome.fa, ref_annot.gtf, ...) live.
        os.chdir(genome_source_directory)
        if (build_success_filename in files_in_sourcedir):
            # Since we are redoing the build,
            # the success file needs to be removed
            # until the build has succeeded.
            os.remove(build_success_file_path)
        # Create the command that builds the Genome Resource Library form the source data.
        command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
                  "--pfam_db PFAM.domtblout.dat.gz " + \
                  "--output_dir {:s} ".format(cannonical_destination)
        found_HumanFusionLib = False
        # NOTE(review): HumanFusionLib_filename appears unused below;
        # filename_of_HumanFusionLib is the variable actually consumed.
        HumanFusionLib_filename = "NoFileFound"
        for filename in os.listdir(genome_source_directory):
            # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
            # We only check the prefix, in case other versions are used later.
            # I assume there is only one in the directory, but if there are more than one,
            # the later one, alphabetically, will be used.
            if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
                found_HumanFusionLib = True
                filename_of_HumanFusionLib = filename
        if found_HumanFusionLib:
            # The mouse genomes do not have a fusion_annot_lib
            # so only add the following for Human genomes.
            command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
                       "--annot_filter_rule AnnotFilterRule.pm "
        if gmap_build:
            command += "--gmap_build "
        # Send stderr of the command to stdout, because some functions may write to stderr,
        # even though no error has occurred. We will depend on error code return in order
        # to know if an error occurred.
        command += " 2>&1"
        print "About to run the following command:\n\t{:s}".format(command)
        sys.stdout.flush()
        try: # to send the prep_genome_lib command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print "ERROR: While trying to run the prep_genome_lib.pl command " + \
                "on the CTAT Genome Resource Library:\n\t{:s}".format(command)
            raise
        finally:
            # Some code to help us if errors occur.
            print "\n*******************************"
            print "Contents of Genome Source Directory {:s}:".format(genome_source_directory)
            sys.stdout.flush()
            print_directory_contents(genome_source_directory, 2)
            print "\nContents of Genome Build Directory {:s}:".format(cannonical_destination)
            sys.stdout.flush()
            print_directory_contents(cannonical_destination, 2)
            print "*******************************\n"
            sys.stdout.flush()
        # Only reached when check_call succeeded (an exception above propagates).
        create_success_file(build_success_file_path, \
            "Build of:\n\t{:s}\n".format(genome_source_directory) + \
            "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
        if gmap_build:
            # Create the gmap success file.
            # prep_genome_lib.pl was passed --gmap_build above, so the gmap step
            # is already done; just record the marker here.
            gmap_success_filename = get_gmap_success_filename(cannonical_destination)
            gmap_success_full_file_path = os.path.join(cannonical_destination, gmap_success_filename)
            create_success_file(gmap_success_full_file_path, \
                "gmap of:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    elif (build_success_filename in files_in_sourcedir):
        print "The build success file exists, so no build is being attempted:"
        print "\t{:s}".format(build_success_file_path)
        print "Remove the file or set <force new build> if you want a new build to occur."
        # We might still need to do a gmap_build.
        if gmap_build:
            print "Checking if we need to gmap the library."
            sys.stdout.flush()
            gmap_the_library(cannonical_destination, force_gmap_build)
            sys.stdout.flush()
            # gmap_the_library creates a gmap success file if it succeeds.
    else:
        print "build_the_library(): This code should never be printed. Something is wrong."
    sys.stdout.flush()
    return
# End of build_the_library()
|
|
899
|
|
def find_path_to_mutation_lib_integration():
    # Locate the ctat-mutation-lib-integration.py script.
    # We are assuming we run inside a conda environment: the ctat-mutations
    # package lives in the share directory one level up from the bin directory
    # that contains the ctat_mutations command.
    ctat_mutations_path = which("ctat_mutations")
    if ctat_mutations_path in (None, ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resource processing.")
    conda_root = os.path.dirname(os.path.dirname(ctat_mutations_path))
    share_dir = os.path.join(conda_root, "share")
    mutations_dirname = None
    for entry in os.listdir(share_dir):
        # Keep scanning; if several entries match, a later one overwrites
        # an earlier one, matching the original behavior.
        if "ctat-mutations" in entry:
            mutations_dirname = entry
    if mutations_dirname in (None, ""):
        raise ValueError("Unable to find the home of ctat_mutations.\n" + \
                         "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return os.path.join(share_dir, \
                        mutations_dirname, \
                        "mutation_lib_prep", \
                        "ctat-mutation-lib-integration.py")
|
|
922
|
|
def find_path_to_picard_home():
    # Determine the PICARD_HOME value needed by the mutation integration step.
    # First parse the ctat_mutations wrapper script, which exports PICARD_HOME;
    # if that fails, fall back to searching the conda share directory for a
    # "picard" entry. Raises ValueError if ctat_mutations or PICARD_HOME
    # cannot be found.
    picard_home = None
    path_to_ctat_mutations = which("ctat_mutations")
    if (path_to_ctat_mutations is None) or (path_to_ctat_mutations == ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resources processing.")
    # The ctat_mutations shell script defines PICARD_HOME. We just need to get it out of that file.
    # FIX: use a with-statement so the file handle is always closed; the previous
    # code opened the file and never closed it.
    with open(path_to_ctat_mutations, "r") as ctat_mutations_file:
        for line in ctat_mutations_file:
            if ("export" in line) and ("PICARD_HOME=" in line):
                # Get the value after the equal sign and strip off the newline at the end of string.
                # Then strip off quotes at begin and end if they are there.
                # And then strip off any other whitespace that might have been inside of stripped off quotes.
                picard_home = line.split("=")[1].strip().strip('\"').strip()
    if (picard_home is None) or (picard_home == ""):
        # We didn't find it in the ctat_mutations file. Search for it.
        conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
        share_dir = os.path.join(conda_root_dir, "share")
        for filename in os.listdir(share_dir):
            # A later match overwrites an earlier one.
            if "picard" in filename:
                picard_home = os.path.join(share_dir, filename)
        if (picard_home is None) or (picard_home == ""):
            raise ValueError("Unable to find PICARD_HOME.\n" +
                             "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return picard_home
|
|
947
|
|
948 def download_and_integrate_mutation_resources(source_url, genome_build_directory, cosmic_resources_location=None, \
|
|
949 force_new_download=False, force_new_integration=False):
|
|
950 # source_url is the url of the mutation resources archive to download.
|
|
951 # genome_build_dir is the location where the archive will be placed.
|
|
952 # If cosmic_files_location is set, that is the location where the files are presumed to exist.
|
|
953 # If cosmic_files_location is not set, the files will be assumed to exist in genome_build_directory.
|
|
954 # If force_new_download is True, then even if the archive has previously been downloaded,
|
|
955 # it will be downloaded again.
|
|
956 # If force_new_integration is True, the resources will be integrated again, even if there has been a
|
|
957 # a previous successful integration.
|
|
958 # The ctat-mutation-lib-integration command may print messages out to stderr, even when there is not an error.
|
|
959 # FIX - However, I forgot to route stderr to stdout as I did with other commands.
|
|
960 # I have left it this way for now because I do not want to do another round of testing.
|
|
961 """
|
|
962 From https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep
|
|
963
|
|
964 Step 1 (after CTAT Genome Resource Library is built)
|
|
965 download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018
|
|
966 or
|
|
967 download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018
|
|
968 (mouse genome is not yet supported)
|
|
969
|
|
970 Step 2: Cosmic files download - User must perform this step prior to running this code. We check if files are present.
|
|
971
|
|
972 Next download COSMIC resources required in this directory. Depending on the version of genome you need you can install either COSMIC's hg38 or COSMIC's hg19. You will need to download 2 sets of files: COSMIC Mutation Data (CosmicMutantExport.tsv.gz) and COSMIC Coding Mutation VCF File (CosmicCodingMuts.vcf.gz). Please note, for download to succeed you will need to register and login to their service.
|
|
973
|
|
974 So is there a way the user can give their credentials through the Data Manager interface as a part of specifying Mutation parameters and then I can programatically use those credentials to download the file, or maybe instead, the interface needs to have the intructions for the user to download the files, then the use needs to specify the absolute path to where those files are.
|
|
975
|
|
976 Step 3: Mutation lib integration
|
|
977
|
|
978 Once you have downloaded CosmicMutantExport.tsv.gz AND CosmicCodingMuts.vcf.gz (hg38 or hg19), proceed with mutation lib integration step which will integrate the mutation resource with CTAT_GENOME_LIB (This corresponds to "GRCh37_v19_CTAT_lib_Feb092018" or "GRCh38_v27_CTAT_lib_Feb092018" downloaded in Step 1). You will find this script in ctat-mutations repo in 'src' directory.
|
|
979
|
|
980 #Keep Picard in PICARD_HOME environmental variable like so
|
|
981 export PICARD_HOME=/path/to/picard
|
|
982
|
|
983 #Integrate CTAT mutations lib with CTAT genome library
|
|
984 python ctat-mutations/mutation_lib_prep/ctat-mutation-lib-integration.py \
|
|
985 --CosmicMutantExport CosmicMutantExport.tsv.gz \
|
|
986 --CosmicCodingMuts CosmicCodingMuts.vcf.gz \
|
|
987 --genome_lib_dir GRCh37_v19_CTAT_lib_Feb092018/ # OR GRCh38_v27_CTAT_lib_Feb092018/
|
|
988
|
|
989 Now you are all set to run the ctat-mutations pipeline
|
|
990 """
|
|
991 print "\n***********************************"
|
|
992 print "* Integrating Mutation Resources. *"
|
|
993 print "***********************************\n"
|
|
994 sys.stdout.flush()
|
|
995 # It is assumed that this procedure is only called with a valid genome_build_directory.
|
|
996 url_parts = urlparse.urlparse(source_url)
|
|
997 source_filename = os.path.basename(url_parts.path)
|
|
998 if url_parts.scheme == "":
|
|
999 # Then we were given a source_url without a leading https: or similar.
|
|
1000 # Assume we only were given the filename and that it exists at _CTAT_Mutation_URL.
|
|
1001 source_url = urlparse.urljoin(_CTAT_Mutation_URL, source_url)
|
|
1002 # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.
|
|
1003 cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, _NumBytesNeededForMutationResources)
|
|
1004 print "Download and Integrate a Mutation Resource Archive."
|
|
1005 print "The source URL is:\n\t{:s}".format(str(source_url))
|
|
1006 print "The destination is:\n\t{:s}".format(str(cannonical_destination))
|
|
1007 sys.stdout.flush()
|
|
1008 # Get the list of files in the directory,
|
|
1009 # We use it to check for a previous download or extraction among other things.
|
|
1010 orig_files_in_destdir = set(os.listdir(cannonical_destination))
|
|
1011
|
|
1012 # DOWNLOAD SECTION
|
|
1013 # See whether the index file has been downloaded already.
|
|
1014 download_success_file = "{:s}.{:s}".format(source_filename, _MutationDownloadSuccessFile)
|
|
1015 download_success_file_path = os.path.join(cannonical_destination, download_success_file)
|
|
1016 if ((download_success_file not in orig_files_in_destdir) or force_new_download):
|
|
1017 # DO THE DOWNLOAD
|
|
1018 if (download_success_file in orig_files_in_destdir):
|
|
1019 # Since we are redoing the download,
|
|
1020 # the success file needs to be removed
|
|
1021 # until the download has succeeded.
|
|
1022 os.remove(download_success_file_path)
|
|
1023 # The following raises an IOError if the download fails for some reason.
|
|
1024 archive_fullpath = download_file_from_url(source_url, cannonical_destination, resume_download=(not force_new_download))
|
|
1025 create_success_file(download_success_file_path, \
|
|
1026 "Download of the mutation resource archive:\n\t{:s}\n".format(source_url) + \
|
|
1027 "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
|
|
1028 elif (download_success_file in orig_files_in_destdir):
|
|
1029 print "The download success file exists, so no download is being attempted:"
|
|
1030 print "\t{:s}".format(download_success_file_path)
|
|
1031 print "Remove the file or set <new_mutation_download> if you want a new download to occur."
|
|
1032 else:
|
|
1033 print "download_and_integrate_mutation_resources() - Download: This code should never be printed. Something is wrong."
|
|
1034 sys.stdout.flush()
|
|
1035
|
|
1036 # INTEGRATION SECTION
|
|
1037 integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile)
|
|
1038 integration_success_file_path = os.path.join(cannonical_destination, integration_success_file)
|
|
1039 if ((integration_success_file not in orig_files_in_destdir) or force_new_integration):
|
|
1040 # INTEGRATE THE LIBRARY
|
|
1041 if (integration_success_file in orig_files_in_destdir):
|
|
1042 # Since we are redoing the integration,
|
|
1043 # the success file needs to be removed
|
|
1044 # until the download has succeeded.
|
|
1045 os.remove(integration_success_file_path)
|
|
1046 mutation_lib_dirpath = os.path.join(cannonical_destination, _CTAT_MutationLibDirname)
|
|
1047 # If we do not remove the directory, then the old files will exist and a new integration does not occur.
|
|
1048 # Also, with the Cosmic files, when the integrated file is created, if there is a previous one, gzip
|
|
1049 # asks a question of the user, and this program is not prepared to respond to a question from a subprocess:
|
|
1050 # [bgzip] /path/to/ctat_mutation_lib/cosmic.vcf.gz already exists; do you wish to overwrite (y or n)?
|
|
1051 if os.path.exists(mutation_lib_dirpath):
|
|
1052 shutil.rmtree(mutation_lib_dirpath)
|
|
1053 # Check for Cosmic resources. User has to place these files into the correct location.
|
|
1054 if (cosmic_resources_location is None) or (cosmic_resources_location == ""):
|
|
1055 cosmic_resources_loc_full_path = cannonical_destination
|
|
1056 end_err_msg = "These files must be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
|
|
1057 else:
|
|
1058 cosmic_resources_loc_full_path = os.path.realpath(cosmic_resources_location)
|
|
1059 end_err_msg = "This function was told they would be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
|
|
1060 cosmic_mutant_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Mutant_Filename)
|
|
1061 cosmic_coding_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Coding_Filename)
|
|
1062 if not (os.path.exists(cosmic_mutant_full_path) and os.path.exists(cosmic_coding_full_path)):
|
|
1063 raise IOError("Either one or both of Cosmic Resources are missing:\n\t" + \
|
|
1064 "{:s}\nand/or\n\t{:s}\n".format(cosmic_mutant_full_path, cosmic_coding_full_path) + \
|
|
1065 "Unable to integrate mutation resources.\n{:s}".format(end_err_msg))
|
|
1066 # Create the integration command. We also must define PICARD_HOME for the command to work.
|
|
1067 picard_home = find_path_to_picard_home()
|
|
1068 integration_command = find_path_to_mutation_lib_integration()
|
|
1069 command = "export PICARD_HOME={:s} && python {:s} ".format(picard_home, integration_command) + \
|
|
1070 "--CosmicMutantExport {:s} ".format(cosmic_mutant_full_path) + \
|
|
1071 "--CosmicCodingMuts {:s} ".format(cosmic_coding_full_path) + \
|
|
1072 "--genome_lib_dir {:s}".format(cannonical_destination)
|
|
1073 try: # to send the ctat-mutation-lib-integration command.
|
|
1074 subprocess.check_call(command, shell=True)
|
|
1075 except subprocess.CalledProcessError:
|
|
1076 print "ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command)
|
|
1077 sys.stdout.flush()
|
|
1078 raise
|
|
1079 finally:
|
|
1080 # Some code to help us if errors occur.
|
|
1081 print "/n*********************************************************"
|
|
1082 print "* After download and integration of Mutation Resources. *"
|
|
1083 sys.stdout.flush()
|
|
1084 print_directory_contents(cannonical_destination, 2)
|
|
1085 print "*********************************************************\n"
|
|
1086 sys.stdout.flush()
|
|
1087 create_success_file(integration_success_file_path, \
|
|
1088 "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) + \
|
|
1089 "to:\n\t{:s}\nsucceeded.".format(genome_build_directory))
|
|
1090 elif (integration_success_file in orig_files_in_destdir):
|
|
1091 print "The mutation resources integration success file exists, so no integration is being attempted:"
|
|
1092 print "\t{:s}".format(integration_success_file_path)
|
|
1093 print "Remove the file or set <new_mutation_integration> if you want a new integration to occur."
|
|
1094 else:
|
|
1095 print "download_and_integrate_mutation_resources() - Integration: This code should never be printed. Something is wrong."
|
|
1096 sys.stdout.flush()
|
|
1097 return
|
|
1098
|
|
1099 def search_for_genome_build_dir(top_dir_path):
|
|
1100 # If we do not download the directory, the topdir_path could be the
|
|
1101 # location of the genome resource library, but we also want to allow the
|
|
1102 # user to give the same value for top_dir_path that they do when a
|
|
1103 # build happens, so we need to handle all three cases:
|
|
1104 # 1) Is the top_dir_path the build directory,
|
|
1105 # 2) or is it inside of the given directory,
|
|
1106 # 3) or is it inside a subdirectory of the given directory.
|
|
1107 # The source_data downloads are built to a directory named _CTAT_Build_dirname,
|
|
1108 # and the plug-n-play downloads contain a directory with a single sub-directory named _CTAT_Build_dirname.
|
|
1109 # So the conventional structure has all the library files in .../GenomeName/_CTAT_Build_dirname
|
|
1110
|
|
1111 top_dir_full_path = os.path.realpath(top_dir_path)
|
|
1112 genome_build_directory = None
|
|
1113 genome_name_from_dirname = None
|
|
1114 print_warning = False
|
|
1115
|
|
1116 if not os.path.exists(top_dir_full_path):
|
|
1117 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
|
|
1118 "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
|
|
1119 elif not os.path.isdir(top_dir_full_path):
|
|
1120 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
|
|
1121 "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
|
|
1122 if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
|
|
1123 print "Build directory is: {:s}".format(top_dir_full_path)
|
|
1124 sys.stdout.flush()
|
|
1125 # The top_dir_path is the path to the genome_build_directory.
|
|
1126 genome_build_directory = top_dir_full_path
|
|
1127 else:
|
|
1128 # Look for it inside of the top_dir_path directory.
|
|
1129 print "Looking inside of: {:s}".format(top_dir_full_path)
|
|
1130 sys.stdout.flush()
|
|
1131 top_dir_contents = os.listdir(top_dir_full_path)
|
|
1132 if (_CTAT_Build_dirname in top_dir_contents):
|
|
1133 # The genome_build_directory is inside of the top_dir_path directory.
|
|
1134 print "1. Found it."
|
|
1135 sys.stdout.flush()
|
|
1136 genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
|
|
1137 else:
|
|
1138 # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
|
|
1139 # Look down the directory tree two levels.
|
|
1140 build_dirs_in_subdirs = list()
|
|
1141 subdirs_with_genome_files = list()
|
|
1142 build_dirs_in_sub_subdirs = list()
|
|
1143 sub_subdirs_with_genome_files = list()
|
|
1144 subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
|
|
1145 for subdir in subdirs:
|
|
1146 subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
|
|
1147 subdir_path_contents = os.listdir(subdir_path)
|
|
1148 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
|
|
1149 if (_CTAT_Build_dirname in subdir_path_contents):
|
|
1150 # The genome_build_directory is inside of the subdir_path directory.
|
|
1151 print "2a, Found one."
|
|
1152 build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
|
|
1153 if (_CTAT_RefGenome_Filename in subdir_path_contents):
|
|
1154 subdirs_with_genome_files.append(subdir_path)
|
|
1155 # Since we are already looping, loop through all dirs one level deeper as well.
|
|
1156 sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
|
|
1157 for sub_subdir in sub_subdirs:
|
|
1158 sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
|
|
1159 sub_subdir_path_contents = os.listdir(sub_subdir_path)
|
|
1160 # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
|
|
1161 if (_CTAT_Build_dirname in sub_subdir_path_contents):
|
|
1162 # The genome_build_directory is inside of the sub_subdir_path directory.
|
|
1163 print "3a. Found one."
|
|
1164 build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
|
|
1165 if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
|
|
1166 sub_subdirs_with_genome_files.append(sub_subdir_path)
|
|
1167 # Hopefully there is one and only one found build directory.
|
|
1168 # If none are found we check for a directory containing the genome reference file,
|
|
1169 # but the build process sometimes causes more than one directory to have a copy,
|
|
1170 # so finding that file is not a sure thing.
|
|
1171 if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
|
|
1172 print "\n***************************************"
|
|
1173 print "Found multiple CTAT Genome Resource Libraries " + \
|
|
1174 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1175 sys.stdout.flush()
|
|
1176 print_directory_contents(top_dir_full_path, 2)
|
|
1177 print "***************************************\n"
|
|
1178 sys.stdout.flush()
|
|
1179 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
|
|
1180 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1181 elif len(build_dirs_in_subdirs) == 1:
|
|
1182 # The genome_build_directory is inside of the subdir_path directory.
|
|
1183 print "2b, Found it."
|
|
1184 sys.stdout.flush()
|
|
1185 genome_build_directory = build_dirs_in_subdirs[0]
|
|
1186 elif len(build_dirs_in_sub_subdirs) == 1:
|
|
1187 # The genome_build_directory is inside of the subdir_path directory.
|
|
1188 print "3b, Found it."
|
|
1189 sys.stdout.flush()
|
|
1190 genome_build_directory = build_dirs_in_sub_subdirs[0]
|
|
1191 elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
|
|
1192 print "\n***************************************"
|
|
1193 print "Unable to find CTAT Genome Resource Library " + \
|
|
1194 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1195 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
|
|
1196 sys.stdout.flush()
|
|
1197 print_directory_contents(top_dir_full_path, 2)
|
|
1198 print "***************************************\n"
|
|
1199 sys.stdout.flush()
|
|
1200 raise ValueError("Unable to find CTAT Genome Resource Library " + \
|
|
1201 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1202 elif (len(sub_subdirs_with_genome_files) == 1):
|
|
1203 print "3c, Maybe found it."
|
|
1204 sys.stdout.flush()
|
|
1205 genome_build_directory = sub_subdirs_with_genome_files[0]
|
|
1206 print_warning = True
|
|
1207 elif (len(subdirs_with_genome_files) == 1):
|
|
1208 print "2c, Maybe found it."
|
|
1209 sys.stdout.flush()
|
|
1210 genome_build_directory = subdirs_with_genome_files[0]
|
|
1211 print_warning = True
|
|
1212 elif (_CTAT_RefGenome_Filename in top_dir_contents):
|
|
1213 print "1c. Maybe found it."
|
|
1214 sys.stdout.flush()
|
|
1215 genome_build_directory = top_dir_full_path
|
|
1216 print_warning = True
|
|
1217 else:
|
|
1218 print "\n***************************************"
|
|
1219 print "Unable to find CTAT Genome Resource Library " + \
|
|
1220 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1221 sys.stdout.flush()
|
|
1222 print_directory_contents(top_dir_full_path, 2)
|
|
1223 print "***************************************\n"
|
|
1224 sys.stdout.flush()
|
|
1225 raise ValueError("Unable to find CTAT Genome Resource Library " + \
|
|
1226 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1227 # end else
|
|
1228 # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
|
|
1229 if (genome_build_directory is None):
|
|
1230 print "\n***************************************"
|
|
1231 print "Cannot find the CTAT Genome Resource Library " + \
|
|
1232 "in the given directory:\n\t{:s}".format(top_dir_full_path)
|
|
1233 sys.stdout.flush()
|
|
1234 print_directory_contents(top_dir_full_path, 2)
|
|
1235 print "***************************************\n"
|
|
1236 sys.stdout.flush()
|
|
1237 raise ValueError("Cannot find the CTAT Genome Resource Library " + \
|
|
1238 "in the given directory:\n\t{:s}".format(top_dir_full_path))
|
|
1239 else:
|
|
1240 if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
|
|
1241 print "\n***************************************"
|
|
1242 print "\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) + \
|
|
1243 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
|
|
1244 sys.stdout.flush()
|
|
1245 print_directory_contents(genome_build_directory, 2)
|
|
1246 print "***************************************\n"
|
|
1247 sys.stdout.flush()
|
|
1248 if print_warning and genome_build_directory:
|
|
1249 print "\n***************************************"
|
|
1250 print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
|
|
1251 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
|
|
1252 print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
|
|
1253 sys.stdout.flush()
|
|
1254 print_directory_contents(genome_build_directory, 2)
|
|
1255 print "***************************************\n"
|
|
1256 sys.stdout.flush()
|
|
1257 return genome_build_directory
|
|
1258
|
|
def build_directory_from_build_location(src_filename, build_location):
    # Compute the conventional build-directory path for a build rooted at
    # build_location. Builds follow the convention of placing the library in a
    # directory named _CTAT_Build_dirname, normally inside of a directory named
    # for the genome. However, if the user passes a build_location that is
    # itself named _CTAT_Build_dirname, that directory is used directly,
    # regardless of the name of the enclosing directory.
    genome_dir_name = find_genome_name_in_path(src_filename)
    if (genome_dir_name is None) or (genome_dir_name == ""):
        # The source path held no genome name; maybe the build_location does.
        genome_dir_name = find_genome_name_in_path(build_location)
    location_basename = os.path.basename(build_location)
    if location_basename == genome_dir_name:
        # build_location is already the genome-named directory; build inside it.
        return os.path.join(build_location, _CTAT_Build_dirname)
    if location_basename == _CTAT_Build_dirname:
        # The user pointed directly at a build directory; use it as given.
        return build_location
    if genome_dir_name is None:
        # No genome name found in either path (e.g. src_filename has no
        # genome-named directory); build directly under build_location.
        return os.path.join(build_location, _CTAT_Build_dirname)
    # Conventional layout: build_location/GenomeName/_CTAT_Build_dirname
    return os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
|
|
1279
|
|
1280 def main():
|
|
1281 # Regarding the command line, there are three basic ways to use this tool:
|
|
1282 # 1) Download and Build the CTAT Genome Resource Library from an archive;
|
|
1283 # 2) Build the library from source data files that are already downloaded;
|
|
1284 # 3) Specify the location of an already built library.
|
|
1285 # Any of these methods can incorporate or be followed by a gmap build.
|
|
1286 # Any of these methods can be followed by a mutation resources download and/or integration.
|
|
1287 # Choose arguments for only one method.
|
|
1288 # Do not use arguments in a mixed manner. I am not writing code to handle that at this time.
|
|
1289 parser = argparse.ArgumentParser()
|
|
1290 # Arguments for all methods:
|
|
1291 parser.add_argument('-o', '--output_filename', \
|
|
1292 help='Name of the output file, where the json dictionary will be written.')
|
|
1293 parser.add_argument('-y', '--display_name',
|
|
1294 default='', \
|
|
1295 help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
|
|
1296 parser.add_argument('-g', '--gmap_build', \
|
|
1297 help='Will do a gmap_build on the Genome Resource Library, if it has not previously been gmapped.',
|
|
1298 action='store_true')
|
|
1299 parser.add_argument('-f', '--force_gmap_build', \
|
|
1300 help='Will force gmap_build of the Genome Resource Library, even if previously gmapped.',
|
|
1301 action='store_true')
|
|
1302 parser.add_argument('-m', '--download_mutation_resources_url',
|
|
1303 default='', \
|
|
1304 help='Value should be the url of the zipped up mutation resources. ' + \
|
|
1305 'These are located at: https://data.broadinstitute.org/Trinity/CTAT/mutation/.' + \
|
|
1306 'Will download mutation resources and integrate them into the Genome Resource Library.' + \
|
|
1307 'Cosmic resources must previously have beeen downloaded (https://cancer.sanger.ac.uk/cosmic/download).' + \
|
|
1308 'Cosmic resources can be placed directly into the Genome Resource Library ' + \
|
|
1309 'or you can set the --cosmic_resources_location argument.' + \
|
|
1310 'See https://github.com/NCIP/ctat-mutations/tree/no_sciedpiper/mutation_lib_prep for more info. ' + \
|
|
1311 'If a previous download and integration was not completed, ' + \
|
|
1312 'calling with this option set will attempt to finish the integration.')
|
|
1313 parser.add_argument('-l', '--new_mutation_download', \
|
|
1314 help='Forces the mutation resources to be downloaded, ' + \
|
|
1315 'even if previously downloaded into this Genome Resource Library.',
|
|
1316 action='store_true')
|
|
1317 parser.add_argument('-i', '--new_mutation_integration', \
|
|
1318 help='Forces the mutation resources to be integrated, ' + \
|
|
1319 'even if previously integrated into this Genome Resource Library.',
|
|
1320 action='store_true')
|
|
1321 parser.add_argument('-c', '--cosmic_resources_location',
|
|
1322 default='', \
|
|
1323 help='Specify a non-default location where the Cosmic files reside. ' + \
|
|
1324 'Normally they are assumed to reside in the build directory, ' + \
|
|
1325 'but if that directory has not been created yet when this program ' + \
|
|
1326 'is called, you can specify the full path to the directory where they reside.')
|
|
1327 parser.add_argument('-t', '--cravat_tissues_filepath',
|
|
1328 default='', \
|
|
1329 help='Specify a non-default location where the Cosmic files reside. ' + \
|
|
1330 'Normally they are assumed to reside in the build directory, ' + \
|
|
1331 'but if that directory has not been created yet when this program ' + \
|
|
1332 'is called, you can specify the full path to the directory where they reside.')
|
|
1333 # Method 1) arguments - Download and Build.
|
|
1334 # - One can optionally utilize --build_location argument with this group of arguments.
|
|
1335 download_and_build_args = parser.add_argument_group('Download and Build arguments')
|
|
1336 download_and_build_args.add_argument('-u', '--download_url',
|
|
1337 default='', \
|
|
1338 help='This is the url of an archive file containing the library files. ' + \
|
|
1339 'These are located at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/. ' + \
|
|
1340 'Works with both source-data and plug-n-play archives.')
|
|
1341 download_and_build_args.add_argument('-d', '--download_location',
|
|
1342 default='', \
|
|
1343 help='Full path of the CTAT Resource Library download location, where the download will be placed. ' + \
|
|
1344 'If the archive file has already had been successfully downloaded, ' + \
|
|
1345 'it will only be downloaded again if --new_archive_download is selected. ' + \
|
|
1346 'If --build_location is not set, then the archive will be built in place at the download_location. ' + \
|
|
1347 'If a previous download and build was started but not completed at this or a specified build_location, ' + \
|
|
1348 'calling with this and the previous option set, but not --new_archive_download, ' + \
|
|
1349 'will attempt to finish the download and build.')
|
|
1350 download_and_build_args.add_argument('-a', '--new_archive_download', \
|
|
1351 help='Forces a new download (and build if needed) of the Genome Resource Library, ' + \
|
|
1352 'even if previously downloaded and built.',
|
|
1353 action='store_true')
|
|
1354 download_and_build_args.add_argument('-k', '--keep_archive', \
|
|
1355 help='The archive will not be deleted after it is extracted.',
|
|
1356 action='store_true')
|
|
1357 # Method 2) arguments - Specify source and build locations.
|
|
1358 # - One can optionally utilize --build_location argument with this group of arguments.
|
|
1359 specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments')
|
|
1360 specify_source_and_build_args.add_argument('-s', '--source_location',
|
|
1361 default='', \
|
|
1362 help='Full path to the directory containing CTAT Resource Library source-data files ' + \
|
|
1363 'or the full path to a CTAT Resource Library archive file (.tar.gz). ' + \
|
|
1364 'If the --build_location option is not set, the reference library will be built in the source_location directory.' + \
|
|
1365 'If a previous download and build was started but not completed at this location, ' + \
|
|
1366 'calling with this option set, but not --new_library_build, ' + \
|
|
1367 'will attempt to finish the build.')
|
|
1368 specify_source_and_build_args.add_argument('-r', '--new_library_build', \
|
|
1369 help='Forces build of the CTAT Genome Resource Library, even if previously built. ' + \
|
|
1370 'The --source_location must be a source-data archive or directory, or this is a no-op.',
|
|
1371 action='store_true')
|
|
1372 # Method 3) arguments - Specify the location of a built library.
|
|
1373 built_lib_location_arg = parser.add_argument_group('Specify location of built library arguments')
|
|
1374 built_lib_location_arg.add_argument('-b', '--build_location',
|
|
1375 default='', \
|
|
1376 help='Full path to the location of a built CTAT Genome Resource Library, ' + \
|
|
1377 'either where it is, or where it will be placed.')
|
|
1378
|
|
1379 args = parser.parse_args()
|
|
1380
|
|
1381 # Apparently, Galaxy writes all of the input parameters to the output file prior to
|
|
1382 # this program being called.
|
|
1383 # But I do not get input values from the json file, but rather from command line.
|
|
1384 # Just leaving the following code as a comment, in case it might be useful to someone later.
|
|
1385 # params = from_json_string(open(filename).read())
|
|
1386 # target_directory = params['output_data'][0]['extra_files_path']
|
|
1387 # os.mkdir(target_directory)
|
|
1388
|
|
1389 lib_was_built = False
|
|
1390 extracted_directory = None
|
|
1391 source_data_directory = None
|
|
1392 genome_build_directory = None
|
|
1393 download_url_is_set = (args.download_url is not None) and (args.download_url != "")
|
|
1394 download_location_is_set = (args.download_location is not None) and (args.download_location != "")
|
|
1395 source_location_is_set = (args.source_location is not None) and (args.source_location != "")
|
|
1396 build_location_is_set = (args.build_location is not None) and (args.build_location != "")
|
|
1397 mutation_url_is_set = (args.download_mutation_resources_url is not None) \
|
|
1398 and (args.download_mutation_resources_url != "")
|
|
1399
|
|
1400 if download_url_is_set:
|
|
1401 print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url))
|
|
1402 sys.stdout.flush()
|
|
1403 if source_location_is_set:
|
|
1404 raise ValueError("Argument --source_location cannot be used in combination with --download_url.")
|
|
1405 if not download_location_is_set:
|
|
1406 raise ValueError("Argument --download_url requires that --download_location be specified.")
|
|
1407 downloaded_filename_full_path = \
|
|
1408 download_genome_archive(source_url=args.download_url, \
|
|
1409 destination=args.download_location, \
|
|
1410 force_new_download=args.new_archive_download)
|
|
1411 print "\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path))
|
|
1412 sys.stdout.flush()
|
|
1413
|
|
1414 if ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_SOURCE_DATA:
|
|
1415 print "It is source data."
|
|
1416 sys.stdout.flush()
|
|
1417 # If it is source_data, extract to download_location (the directory where the download was placed).
|
|
1418 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
|
|
1419 destination=args.download_location, \
|
|
1420 force_new_extraction=args.new_archive_download, \
|
|
1421 keep_archive=args.keep_archive)
|
|
1422 source_data_directory = extracted_directory
|
|
1423 if build_location_is_set:
|
|
1424 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
|
|
1425 else:
|
|
1426 # We will build within a subdirectory of the source_data_directory .
|
|
1427 # The name of the build directory will be the default _CTAT_Build_dirname.
|
|
1428 # This _CTAT_Build_dirname directory will not exist until the library is built.
|
|
1429 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
|
|
1430
|
|
1431 elif ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_PLUG_N_PLAY:
|
|
1432 print "It is plug-n-play data."
|
|
1433 sys.stdout.flush()
|
|
1434 if build_location_is_set:
|
|
1435 # Extract to the build location. The library is already built.
|
|
1436 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
|
|
1437 destination=args.build_location, \
|
|
1438 force_new_extraction=args.new_archive_download, \
|
|
1439 keep_archive=args.keep_archive)
|
|
1440 else:
|
|
1441 # Extract to the download location.
|
|
1442 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
|
|
1443 destination=args.download_location, \
|
|
1444 force_new_extraction=args.new_archive_download, \
|
|
1445 keep_archive=args.keep_archive)
|
|
1446 # There is no source_data_directory, so its value stays as None.
|
|
1447
|
|
1448 # Look for the build directory. It should be inside the extracted_directory
|
|
1449 if len(os.listdir(extracted_directory)) == 1:
|
|
1450 # Then that one file is a subdirectory that should be the build_directory.
|
|
1451 # That is how the plug-n-play directories are structured.
|
|
1452 subdir_filename = os.listdir(extracted_directory)[0]
|
|
1453 genome_build_directory = os.path.join(extracted_directory, subdir_filename)
|
|
1454 else:
|
|
1455 # We need to search for the build directory, since there is more than one file.
|
|
1456 genome_build_directory = search_for_genome_build_dir(extracted_directory)
|
|
1457 else:
|
|
1458 raise ValueError("Unexpected CTAT Library type. Neither plug-n-play nor source_data:\n\t" + \
|
|
1459 "{:s}".format(downloaded_filename_full_path))
|
|
1460 elif source_location_is_set:
|
|
1461 # Then the user wants to build the directory from the source data.
|
|
1462 source_data_directory = os.path.realpath(args.source_location)
|
|
1463 print "\nThe program is being told that the source data is in:\n\t{:s}.\n".format(str(source_data_directory))
|
|
1464 sys.stdout.flush()
|
|
1465 if build_location_is_set:
|
|
1466 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
|
|
1467 else:
|
|
1468 # We will build within a subdirectory of the source_data_directory .
|
|
1469 # The name of the build directory will be the default _CTAT_Build_dirname.
|
|
1470 # This _CTAT_Build_dirname directory will not exist until the library is built.
|
|
1471 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
|
|
1472 elif build_location_is_set:
|
|
1473 genome_build_directory = args.build_location
|
|
1474
|
|
1475 if (genome_build_directory is None) or (genome_build_directory == ""):
|
|
1476 raise ValueError("At least one of --download_url, --source_location, or --build_location must be specified.")
|
|
1477
|
|
1478 print "\nThe location where the CTAT Genome Resource Library exists " + \
|
|
1479 "or will be built is {:s}.\n".format(str(genome_build_directory))
|
|
1480 sys.stdout.flush()
|
|
1481
|
|
1482 # To take out builds for testing, comment out the lines that do the building.
|
|
1483 # The command that builds the ctat genome library also has an option for building the gmap indexes.
|
|
1484 # That is why the gmap_build values are sent to build_the_library(), but if we are not building the
|
|
1485 # library, the user might still be asking for a gmap_build. That is done after rechecking for the
|
|
1486 # genome_build_directory.
|
|
1487 if source_data_directory is not None:
|
|
1488 build_the_library(source_data_directory, \
|
|
1489 genome_build_directory, \
|
|
1490 args.new_library_build, \
|
|
1491 args.gmap_build, \
|
|
1492 args.force_gmap_build)
|
|
1493 lib_was_built = True
|
|
1494
|
|
1495 # The following looks to see if the library actually exists after the build,
|
|
1496 # and raises an error if it cannot find the library files.
|
|
1497 # The reassignment of genome_build_directory can be superfluous,
|
|
1498 # since many times the genome_build_directory will already point to the correct directory.
|
|
1499 # There are cases, however, where a user specifies a location that contains the
|
|
1500 # genome_build_directory rather than is the genome_build_directory.
|
|
1501 genome_build_directory = search_for_genome_build_dir(genome_build_directory)
|
|
1502
|
|
1503 if (args.gmap_build and not lib_was_built):
|
|
1504 # If we did not build the genome resource library
|
|
1505 # the user might still be asking for a gmap_build.
|
|
1506 gmap_the_library(genome_build_directory, args.force_gmap_build)
|
|
1507 sys.stdout.flush()
|
|
1508
|
|
1509 if mutation_url_is_set:
|
|
1510 download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \
|
|
1511 genome_build_directory=genome_build_directory, \
|
|
1512 cosmic_resources_location=args.cosmic_resources_location, \
|
|
1513 force_new_download=args.new_mutation_download, \
|
|
1514 force_new_integration=args.new_mutation_integration)
|
|
1515
|
|
1516 # Need to get the genome name.
|
|
1517 genome_name = find_genome_name_in_path(args.download_url)
|
|
1518 if genome_name is None:
|
|
1519 genome_name = find_genome_name_in_path(genome_build_directory)
|
|
1520 if genome_name is None:
|
|
1521 genome_name = find_genome_name_in_path(extracted_directory)
|
|
1522 if genome_name is None:
|
|
1523 genome_name = find_genome_name_in_path(args.source_location)
|
|
1524 if genome_name is None:
|
|
1525 genome_name = find_genome_name_in_path(args.download_location)
|
|
1526 if genome_name is None:
|
|
1527 genome_name = find_genome_name_in_path(args.display_name)
|
|
1528 if genome_name is None:
|
|
1529 genome_name = _CTAT_ResourceLib_DefaultGenome
|
|
1530 print "WARNING: We could not find a genome name in any of the directory paths."
|
|
1531 sys.stdout.flush()
|
|
1532
|
|
1533 # Determine the display_name for the library.
|
|
1534 if (args.display_name is None) or (args.display_name == ""):
|
|
1535 # Create the display_name from the genome_name.
|
|
1536 display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
|
|
1537 else:
|
|
1538 display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
|
|
1539 display_name = display_name.replace(" ","_")
|
|
1540
|
|
1541 # Create a unique_id for the library.
|
|
1542 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
|
|
1543 unique_id = genome_name + "." + datetime_stamp
|
|
1544
|
|
1545 print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
|
|
1546 print "Its unique_id will be set to: {:s}\n".format(unique_id)
|
|
1547 print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
|
|
1548 sys.stdout.flush()
|
|
1549
|
|
1550 data_manager_dict = {}
|
|
1551 data_manager_dict['data_tables'] = {}
|
|
1552 data_manager_dict['data_tables']['ctat_genome_resource_libs'] = []
|
|
1553 data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
|
|
1554 data_manager_dict['data_tables']['ctat_genome_resource_libs'].append(data_table_entry)
|
|
1555
|
|
1556 # Create the data table for the cravat_tissues, if the file is given:
|
|
1557 print "The cravat tissues file is: {:s}".format(str(args.cravat_tissues_filepath))
|
|
1558 if (args.cravat_tissues_filepath is not None) and (args.cravat_tissues_filepath != ""):
|
|
1559 data_manager_dict['data_tables']['ctat_cravat_tissues'] = []
|
|
1560 cravat_file = open(args.cravat_tissues_filepath, 'r')
|
|
1561 for line in cravat_file:
|
|
1562 # print line
|
|
1563 if line[0] != '#':
|
|
1564 # The line is not a comment, so parse it.
|
|
1565 items = [item.strip() for item in line.split("\t")]
|
|
1566 print items
|
|
1567 data_table_entry = dict(value=items[0], name=items[1], code=items[2], date=items[3])
|
|
1568 data_manager_dict['data_tables']['ctat_cravat_tissues'].append(data_table_entry)
|
|
1569
|
|
1570 # Temporarily the output file's dictionary is written for debugging:
|
|
1571 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
|
|
1572 sys.stdout.flush()
|
|
1573 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
|
|
1574 # which then puts it into the correct .loc file (I think).
|
|
1575 # One can comment out the following line when testing without galaxy package.
|
|
1576 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
|
|
1577
|
|
# Run the data manager when this file is executed as a script (as Galaxy does).
if __name__ == "__main__":
    main()
|