# HG changeset patch # User trinity_ctat # Date 1526069207 14400 # Node ID f22a133787507ad5443aef5bb3c71f84c4588bbf # Parent be27617454003a825eb320334cb817c69eb13ed8 Uploaded diff -r be2761745400 -r f22a13378750 data_manager/add_ctat_resource_lib.py --- a/data_manager/add_ctat_resource_lib.py Fri May 04 13:19:47 2018 -0400 +++ b/data_manager/add_ctat_resource_lib.py Fri May 11 16:06:47 2018 -0400 @@ -40,6 +40,8 @@ _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome' _CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib' _CTAT_RefGenome_Filename = 'ref_genome.fa' +_CTAT_MouseGenome_Prefix = 'Mouse' +_CTAT_HumanGenome_Prefix = 'GRCh' _NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct. _Download_TestFile = "write_testfile.txt" _DownloadSuccessFile = 'download_succeeded.txt' @@ -90,7 +92,7 @@ full_url_path = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, url) filename = url.split("/")[-1] - if filename.split("_")[0] != "Mouse": + if filename.split("_")[0] != _CTAT_MouseGenome_Prefix: # Take out the mouse genome options for now. # The mouse genome option is not handled correctly yet options.append((filename, full_url_path, i == 0)) @@ -151,6 +153,10 @@ # lib_was_downloaded # Since it doesn't always do the download, the function returns whether download occurred. lib_was_downloaded = False + if len(source.split(":")) == 1: + # Might want to check that it is one of "http", "ftp", "file" or other accepted url starts. + # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL. + source = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, source) print "In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source)) @@ -160,6 +166,8 @@ # If the src_filename indicates it is a source file, as opposed to plug-n-play, # then we may need to do some post processing on it. type_of_download = src_filename.split(".")[1] + print "The file to be extracted is {:s}".format(src_filename) + print "The type of download is {:s}".format(type_of_download) download_has_source_data = (type_of_download == "source_data") # We want to make sure that destination is absolute fully specified path. @@ -184,7 +192,7 @@ # in the code, something is wrong. Raise an error. raise OSError("The destination directory could not be created: " + \ "{:s}".format(cannonical_destination)) - test_writing_file = "{:s}/{:s}".format(cannonical_destination, _Download_TestFile) + test_writing_file = "{:s}/{:s}.{:s}".format(cannonical_destination, root_genome_dirname, _Download_TestFile) try: filehandle = open(test_writing_file, "w") filehandle.write("Testing writing to this file.") @@ -199,8 +207,9 @@ # We use it to check for a previous download or extraction among other things. orig_files_in_destdir = set(os.listdir(cannonical_destination)) # See whether the file has been downloaded already. - download_success_file_path = "{:s}/{:s}".format(cannonical_destination, _DownloadSuccessFile) - if ((_DownloadSuccessFile not in orig_files_in_destdir) \ + download_success_file = "{:s}.{:s}".format(root_genome_dirname, _DownloadSuccessFile) + download_success_file_path = "{:s}/{:s}".format(cannonical_destination, download_success_file) + if ((download_success_file not in orig_files_in_destdir) \ or (root_genome_dirname not in orig_files_in_destdir) \ or force_download): # Check whether there is enough space on the device for the library. @@ -231,7 +240,7 @@ #try: # tarfile.open(full_filepath, mode='r:*').extractall() - if (_DownloadSuccessFile in orig_files_in_destdir): + if (download_success_file in orig_files_in_destdir): # Since we are redoing the download, # the success file needs to be removed # until the download has succeeded. @@ -340,6 +349,8 @@ --genome_lib_dir ctat_genome_lib_build_dir gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa" """ + + print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(genome_source_directory) if (genome_source_directory != "" ) and build: if os.path.exists(genome_source_directory): os.chdir(genome_source_directory) @@ -398,27 +409,31 @@ # 3) or is it inside a subdirectory of the given directory. # The source_data downloads are built to a directory named _CTAT_Build_dirname, # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname. + # We also look for the genome name and return that, if we find it in the + # directory name of the directory holding the build directory. + top_dir_full_path = os.path.realpath(top_dir_path) genome_build_directory = None + genome_name_from_dirname = None print_warning = False - if not os.path.exists(top_dir_path): + if not os.path.exists(top_dir_full_path): raise ValueError("Cannot find the CTAT Genome Resource Library. " + \ - "The given directory does not exist:\n\t{:s}".format(top_dir_path)) - elif not os.path.isdir(top_dir_path): + "The given directory does not exist:\n\t{:s}".format(top_dir_full_path)) + elif not os.path.isdir(top_dir_full_path): raise ValueError("Cannot find the CTAT Genome Resource Library. " + \ - "The given directory is not a directory:\n\t{:s}".format(top_dir_path)) - if top_dir_path.split("/")[-1] == _CTAT_Build_dirname: - print "Build directory is: {:s}".format(top_dir_path) + "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path)) + if top_dir_full_path.split("/")[-1] == _CTAT_Build_dirname: + print "Build directory is: {:s}".format(top_dir_full_path) # The top_dir_path is the path to the genome_build_directory. - genome_build_directory = top_dir_path + genome_build_directory = top_dir_full_path else: # Look for it inside of the top_dir_path directory. - print "Looking inside of: {:s}".format(top_dir_path) - top_dir_contents = os.listdir(top_dir_path) + print "Looking inside of: {:s}".format(top_dir_full_path) + top_dir_contents = os.listdir(top_dir_full_path) if (_CTAT_Build_dirname in top_dir_contents): # The genome_build_directory is inside of the top_dir_path directory. print "1. Found it." - genome_build_directory = "{:s}/{:s}".format(top_dir_path,_CTAT_Build_dirname) + genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname) else: # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename. # Look down the directory tree two levels. @@ -426,9 +441,9 @@ subdirs_with_genome_files = list() build_dirs_in_sub_subdirs = list() sub_subdirs_with_genome_files = list() - subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_path,entry)))] + subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))] for subdir in subdirs: - subdir_path = "{:s}/{:s}".format(top_dir_path, subdir) + subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir) subdir_path_contents = os.listdir(subdir_path) # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents) if (_CTAT_Build_dirname in subdir_path_contents): @@ -456,11 +471,11 @@ if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1: print "\n***************************************" print "Found multiple CTAT Genome Resource Libraries " + \ - "in the given directory:\n\t{:s}".format(top_dir_path) - print_directory_contents(top_dir_path, 2) + "in the given directory:\n\t{:s}".format(top_dir_full_path) + print_directory_contents(top_dir_full_path, 2) print "***************************************\n" raise ValueError("Found multiple CTAT Genome Resource Libraries " + \ - "in the given directory:\n\t{:s}".format(top_dir_path)) + "in the given directory:\n\t{:s}".format(top_dir_full_path)) elif len(build_dirs_in_subdirs) == 1: # The genome_build_directory is inside of the subdir_path directory. print "2b, Found it." @@ -472,12 +487,12 @@ elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1: print "\n***************************************" print "Unable to find CTAT Genome Resource Library " + \ - "in the given directory:\n\t{:s}".format(top_dir_path) + "in the given directory:\n\t{:s}".format(top_dir_full_path) print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename) - print_directory_contents(top_dir_path, 2) + print_directory_contents(top_dir_full_path, 2) print "***************************************\n" raise ValueError("Unable to find CTAT Genome Resource Library " + \ - "in the given directory:\n\t{:s}".format(top_dir_path)) + "in the given directory:\n\t{:s}".format(top_dir_full_path)) elif (len(sub_subdirs_with_genome_files) == 1): print "3c, Maybe found it." genome_build_directory = sub_subdirs_with_genome_files[0] @@ -488,41 +503,56 @@ print_warning = True elif (_CTAT_RefGenome_Filename in top_dir_contents): print "1c. Maybe found it." - genome_build_directory = top_dir_path + genome_build_directory = top_dir_full_path print_warning = True else: print "\n***************************************" print "Unable to find CTAT Genome Resource Library " + \ - "in the given directory:\n\t{:s}".format(top_dir_path) - print_directory_contents(top_dir_path, 2) + "in the given directory:\n\t{:s}".format(top_dir_full_path) + print_directory_contents(top_dir_full_path, 2) print "***************************************\n" raise ValueError("Unable to find CTAT Genome Resource Library " + \ - "in the given directory:\n\t{:s}".format(top_dir_path)) + "in the given directory:\n\t{:s}".format(top_dir_full_path)) # end else # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa). if (genome_build_directory is None): print "\n***************************************" print "Cannot find the CTAT Genome Resource Library " + \ - "in the given directory:\n\t{:s}".format(top_dir_path) - print_directory_contents(top_dir_path, 2) + "in the given directory:\n\t{:s}".format(top_dir_full_path) + print_directory_contents(top_dir_full_path, 2) print "***************************************\n" raise ValueError("Cannot find the CTAT Genome Resource Library " + \ - "in the given directory:\n\t{:s}".format(top_dir_path)) - elif (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)): - print "\n***************************************" - print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \ - "in the genome build directory:\n\t{:s}".format(genome_build_directory) - print_directory_contents(genome_build_directory, 2) - print "***************************************\n" - if print_warning and genome_build_directory: - print "\n***************************************" - print "\nWARNING: Cannot find the CTAT Genome Resource Library," + \ - "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename) - print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory) - print_directory_contents(genome_build_directory, 2) - print "***************************************\n" + "in the given directory:\n\t{:s}".format(top_dir_full_path)) + else: + if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)): + print "\n***************************************" + print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \ + "in the genome build directory:\n\t{:s}".format(genome_build_directory) + print_directory_contents(genome_build_directory, 2) + print "***************************************\n" + if print_warning and genome_build_directory: + print "\n***************************************" + print "\nWARNING: Cannot find the CTAT Genome Resource Library," + \ + "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename) + print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory) + print_directory_contents(genome_build_directory, 2) + print "***************************************\n" return genome_build_directory +def find_genome_name_in_path(path): + # The form of the genome name in directory names (if present in the path) looks like: + # GRCh37_v19_CTAT_lib_Feb092018 + # Mouse_M16_CTAT_lib_Feb202018 + genome_name = None + if (path is not None) and (path != ""): + for element in path.split("/"): + # print "Looking for genome name in {:s}.".format(element) + if (element[0:len(_CTAT_MouseGenome_Prefix)] == _CTAT_MouseGenome_Prefix) \ + or (element[0:len(_CTAT_HumanGenome_Prefix)] == _CTAT_HumanGenome_Prefix): + # Remove any extension that might be in the filename. + genome_name = element.split(".")[0] + return genome_name + def main(): #Parse Command Line parser = argparse.ArgumentParser() @@ -576,8 +606,8 @@ # FIX - We should leave a file indicating build success the same way we do for download success. # To take out builds for testing, coment out the next four lines. - if (download_has_source_data or args.build or args.gmap_build) : - build_the_library(downloaded_directory, genome_build_directory, args.build, args.gmap_build) + if (download_has_source_data or args.build or args.gmap_build): + build_the_library(downloaded_directory, genome_build_directory, True, args.gmap_build) elif (args.gmap_build): gmap_the_library(genome_build_directory) @@ -585,35 +615,33 @@ # and raises an error if it cannot find the library files. # The reassignment of genome_build_directory should be superfluous, # unless I made a mistake in the build code. - # FIX - need to get the genome name from the directory name, if there was no download. - #genome_build_directory, genome_name_from_dirname = search_for_genome_build_dir(genome_build_directory) genome_build_directory = search_for_genome_build_dir(genome_build_directory) - source_filename_root = None - if (args.source_url != None) and (args.source_url != ""): - # Get the name out of the source's filename. - source_filename_root = args.source_url.split("/")[-1].split(".")[0] + # Need to get the genome name. + genome_name = find_genome_name_in_path(args.source_url) + if genome_name is None: + genome_name = find_genome_name_in_path(genome_build_directory) + if genome_name is None: + genome_name = find_genome_name_in_path(downloaded_directory) + if genome_name is None: + genome_name = find_genome_name_in_path(args.destination_path) + if genome_name is None: + genome_name = find_genome_name_in_path(args.display_name) + if genome_name is None: + genome_name = _CTAT_ResourceLib_DefaultGenome + print "WARNING: We could not find a genome name in any of the directory paths." # Determine the display_name for the library. if (args.display_name is None) or (args.display_name == ""): - if (source_filename_root != None) and (source_filename_root != ""): - # Create the display_name from the source_filename_root. - display_name = _CTAT_ResourceLib_DisplayNamePrefix + source_filename_root - else: - display_name = _CTAT_ResourceLib_DisplayNamePrefix + _CTAT_ResourceLib_DefaultGenome - print "WARNING: We do not have a genome name." + # Create the display_name from the genome_name. + display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name else: display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name display_name = display_name.replace(" ","_") # Create a unique_id for the library. datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f") - if (source_filename_root != None) and (source_filename_root != ""): - unique_id = source_filename_root + datetime_stamp - elif (downloaded_directory != None) and (downloaded_directory != ""): - unique_id = os.path.basename(downloaded_directory).split(".")[0] - else: - unique_id = _CTAT_ResourceLib_DefaultGenome + datetime_stamp + unique_id = genome_name + datetime_stamp print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name) print "Its unique_id will be set to: {:s}\n".format(unique_id) diff -r be2761745400 -r f22a13378750 data_manager/ctat_genome_resource_libs_data_manager.tar.gz Binary file data_manager/ctat_genome_resource_libs_data_manager.tar.gz has changed