Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3
changeset 6:be2761745400 draft
Uploaded
author | trinity_ctat |
---|---|
date | Fri, 04 May 2018 13:19:47 -0400 |
parents | 7f1257532b6f |
children | f22a13378750 |
files | data_manager/add_ctat_resource_lib.py data_manager/ctat_genome_resource_libs_data_manager.tar.gz |
diffstat | 2 files changed, 218 insertions(+), 80 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/add_ctat_resource_lib.py Tue May 01 15:40:08 2018 -0400 +++ b/data_manager/add_ctat_resource_lib.py Fri May 04 13:19:47 2018 -0400 @@ -35,9 +35,11 @@ from HTMLParser import HTMLParser _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' -_CTAT_BuildDir_Name = 'ctat_genome_lib_build_dir' +_CTAT_Build_dirname = 'ctat_genome_lib_build_dir' _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_' _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome' +_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib' +_CTAT_RefGenome_Filename = 'ref_genome.fa' _NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct. _Download_TestFile = "write_testfile.txt" _DownloadSuccessFile = 'download_succeeded.txt' @@ -110,6 +112,19 @@ # trained_url = params['param_dict']['trained_url'] # return trained_url +def print_directory_contents(dir_path, num_levels): + if num_levels > 0: + if os.path.exists(dir_path) and os.path.isdir(dir_path): + print "\nDirectory {:s}:".format(dir_path) + subprocess.call("ls -la {:s} 2>&1".format(dir_path), shell=True) + else: + print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path) + if num_levels > 1: + for filename in os.listdir(dir_path): + filename_path = "{:s}/{:s}".format(dir_path, filename) + if os.path.exists(filename_path) and os.path.isdir(filename_path): + print_directory_contents(filename_path, num_levels-1) + def download_from_BroadInst(source, destination, force_download): # Input Parameters # source is the full URL of the file we want to download. @@ -224,7 +239,7 @@ # We want to transfer and untar the file without storing the tar file, because that # adds all that much more space to the needed amount of free space on the disk. # Use subprocess to pipe the output of curl into tar. - command = "curl {:s} | tar -xzvf - -C {:s}".format(source, cannonical_destination) + command = "curl --silent {:s} | tar -xzf - -C {:s}".format(source, cannonical_destination) try: # to send the command that downloads and extracts the file. command_output = subprocess.check_output(command, shell=True) # FIX - not sure check_output is what we want to use. If we want to have an error raised on @@ -237,8 +252,8 @@ # Some code to help us if errors occur. print "\n*******************************\nFinished download and extraction." - subprocess.check_call("ls -lad {:s}/*".format(cannonical_destination), shell=True) - subprocess.check_call("ls -lad {:s}/*/*".format(cannonical_destination), shell=True) + print_directory_contents(cannonical_destination, 2) + print "*******************************\n" newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir if (root_genome_dirname not in newfiles_in_destdir): @@ -272,10 +287,14 @@ # Look for the build directory, or specify the path where it should be placed. if len(os.listdir(downloaded_directory)) == 1: # Then that one file is a subdirectory that should be the downloaded_directory. + # That is how the plug-n-play directories are structured. subdir_filename = os.listdir(downloaded_directory)[0] genome_build_directory = "{:s}/{:s}".format(downloaded_directory, subdir_filename) else: - genome_build_directory = "{:s}/{:s}".format(downloaded_directory, _CTAT_BuildDir_Name) + # In this case, we have source_data in the directory. The default will be to create + # the build directory in the downloaded_directory with the default _CTAT_Build_dirname. + # In this case, this directory will not exist yet until the library is built. + genome_build_directory = "{:s}/{:s}".format(downloaded_directory, _CTAT_Build_dirname) else: raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \ "\n\t{:s}".format(cannonical_destination)) @@ -285,8 +304,10 @@ def gmap_the_library(genome_build_directory): # This is the processing that needs to happen for gmap-fusion to work. # genome_build_directory should normally be a fully specified path, - # though it should work if it is relative. - command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \ + # though this function should work even if it is relative. + # The command prints messages out to stderr, even when there is not an error, + # so route stderr to stdout. Otherwise, galaxy thinks an error occurred. + command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format( \ genome_build_directory, genome_build_directory) try: # to send the gmap_build command. command_output = subprocess.check_output(command, shell=True) @@ -296,23 +317,8 @@ finally: # Some code to help us if errors occur. print "\n*******************************\nAfter running gmap_build." - if os.path.exists(genome_build_directory): - print "\nBuild Directory {:s}:".format(genome_build_directory) - subprocess.check_call("ls -la {:s}".format(genome_build_directory), shell=True) - dir_entries = os.listdir(genome_build_directory) - for entry in dir_entries: - entry_path = "{:s}/{:s}".format(genome_build_directory, entry) - print "\nDirectory {:s}:".format(entry_path) - subprocess.check_call("ls -la {:s}".format(entry_path), shell=True) - if os.path.isdir(entry_path): - subdir_entries = os.listdir(entry_path) - for subdir_entry in subdir_entries: - subdir_entry_path = "{:s}/{:s}".format(entry_path, subdir_entry) - print "\nDirectory {:s}:".format(subdir_entry_path) - subprocess.check_call("ls -la {:s}".format(subdir_entry_path), shell=True) - else: - print "Genome Build Directory does not exist:\n\t{:s}".format(genome_build_directory) - print "*******************************" + print_directory_contents(genome_build_directory, 2) + print "*******************************\n" def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build): """ genome_source_directory is the location of the source_data needed to build the library. @@ -337,14 +343,31 @@ if (genome_source_directory != "" ) and build: if os.path.exists(genome_source_directory): os.chdir(genome_source_directory) - # FIX - look for a fusion_annot_lib and include it, else omit it. + # Create the command that builds the Genome Resource Library form the source data. command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \ - "--fusion_annot_lib CTAT_HumanFusionLib.v0.1.0.dat.gz " + \ - "--annot_filter_rule AnnotFilterRule.pm " + \ "--pfam_db PFAM.domtblout.dat.gz " + \ - "--output_dir {:s} ".format(genome_build_directory) + "--output_dir {:s}".format(genome_build_directory) + found_HumanFusionLib = False + HumanFusionLib_filename = "NoFileFound" + for filename in os.listdir(genome_source_directory): + # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz + # We only check the prefix, in case other versions are used later. + # I assume there is only one in the directory, but if there are more than one, + # the later one, alphabetically, will be used. + if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix: + found_HumanFusionLib = True + filename_of_HumanFusionLib = filename + if found_HumanFusionLib: + # The mouse genomes do not have a fusion_annot_lib + # so only add the following for Human genomes. + command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \ + "--annot_filter_rule AnnotFilterRule.pm " if gmap_build: command += "--gmap_build " + # Send stderr of the command to stdout, because some functions may write to stderr, + # even though no error has occurred. We will depend on error code return in order + # to know if an error occurred. + command += " 2>&1" try: # to send the prep_genome_lib command. command_output = subprocess.check_call(command, shell=True) except subprocess.CalledProcessError: @@ -353,59 +376,174 @@ raise finally: # Some code to help us if errors occur. - print "*******************************" - if os.path.exists(genome_build_directory): - print "\nSource Directory {:s}:".format(genome_source_directory) - subprocess.check_call("ls -la {:s}".format(genome_source_directory), shell=True) - dir_entries = os.listdir(genome_source_directory) - for entry in dir_entries: - entry_path = "{:s}/{:s}".format(genome_source_directory, entry) - print "\nDirectory {:s}:".format(entry_path) - subprocess.check_call("ls -la {:s}".format(entry_path), shell=True) - else: - print "Genome Source Directory does not exist:\n\t{:s}".format(genome_source_directory) - if os.path.exists(genome_build_directory): - print "\nBuild Directory {:s}:".format(genome_build_directory) - subprocess.check_call("ls -la {:s}".format(genome_build_directory), shell=True) - dir_entries = os.listdir(genome_build_directory) - for entry in dir_entries: - entry_path = "{:s}/{:s}".format(genome_build_directory, entry) - print "\nDirectory {:s}:".format(entry_path) - subprocess.check_call("ls -la {:s}".format(entry_path), shell=True) - if os.path.isdir(entry_path): - subdir_entries = os.listdir(entry_path) - for subdir_entry in subdir_entries: - subdir_entry_path = "{:s}/{:s}".format(entry_path, subdir_entry) - print "\nDirectory {:s}:".format(subdir_entry_path) - subprocess.check_call("ls -la {:s}".format(subdir_entry_path), shell=True) - else: - print "Genome Build Directory does not exist:\n\t{:s}".format(genome_build_directory) - print "*******************************" + print "\n*******************************" + print "Contents of Genome Source Directory {:s}:".format(genome_source_directory) + print_directory_contents(genome_source_directory, 2) + print "\nContents of Genome Build Directory {:s}:".format(genome_build_directory) + print_directory_contents(genome_build_directory, 2) + print "*******************************\n" else: raise ValueError("Cannot build the CTAT Genome Resource Library. " + \ "The source directory does not exist:\n\t{:s}".format(genome_source_directory)) elif gmap_build: gmap_the_library(genome_build_directory) +def search_for_genome_build_dir(top_dir_path): + # If we do not download the directory, the topdir_path could be the + # location of the genome resource library, but we also want to allow the + # user to give the same value for top_dir_path that they do when a + # build happens, so we need to handle all three cases: + # 1) Is the top_dir_path the build directory, + # 2) or is it inside of the given directory, + # 3) or is it inside a subdirectory of the given directory. + # The source_data downloads are built to a directory named _CTAT_Build_dirname, + # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname. + genome_build_directory = None + print_warning = False + + if not os.path.exists(top_dir_path): + raise ValueError("Cannot find the CTAT Genome Resource Library. " + \ + "The given directory does not exist:\n\t{:s}".format(top_dir_path)) + elif not os.path.isdir(top_dir_path): + raise ValueError("Cannot find the CTAT Genome Resource Library. " + \ + "The given directory is not a directory:\n\t{:s}".format(top_dir_path)) + if top_dir_path.split("/")[-1] == _CTAT_Build_dirname: + print "Build directory is: {:s}".format(top_dir_path) + # The top_dir_path is the path to the genome_build_directory. + genome_build_directory = top_dir_path + else: + # Look for it inside of the top_dir_path directory. + print "Looking inside of: {:s}".format(top_dir_path) + top_dir_contents = os.listdir(top_dir_path) + if (_CTAT_Build_dirname in top_dir_contents): + # The genome_build_directory is inside of the top_dir_path directory. + print "1. Found it." + genome_build_directory = "{:s}/{:s}".format(top_dir_path,_CTAT_Build_dirname) + else: + # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename. + # Look down the directory tree two levels. + build_dirs_in_subdirs = list() + subdirs_with_genome_files = list() + build_dirs_in_sub_subdirs = list() + sub_subdirs_with_genome_files = list() + subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_path,entry)))] + for subdir in subdirs: + subdir_path = "{:s}/{:s}".format(top_dir_path, subdir) + subdir_path_contents = os.listdir(subdir_path) + # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents) + if (_CTAT_Build_dirname in subdir_path_contents): + # The genome_build_directory is inside of the subdir_path directory. + print "2a, Found one." + build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname)) + if (_CTAT_RefGenome_Filename in subdir_path_contents): + subdirs_with_genome_files.append(subdir_path) + # Since we are already looping, loop through all dirs one level deeper as well. + sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))] + for sub_subdir in sub_subdirs: + sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir) + sub_subdir_path_contents = os.listdir(sub_subdir_path) + # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents) + if (_CTAT_Build_dirname in sub_subdir_path_contents): + # The genome_build_directory is inside of the sub_subdir_path directory. + print "3a. Found one." + build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname)) + if (_CTAT_RefGenome_Filename in sub_subdir_path_contents): + sub_subdirs_with_genome_files.append(sub_subdir_path) + # Hopefully there is one and only one found build directory. + # If none are found we check for a directory containing the genome reference file, + # but the build process sometimes causes more than one directory to have a copy, + # so finding that file is not a sure thing. + if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1: + print "\n***************************************" + print "Found multiple CTAT Genome Resource Libraries " + \ + "in the given directory:\n\t{:s}".format(top_dir_path) + print_directory_contents(top_dir_path, 2) + print "***************************************\n" + raise ValueError("Found multiple CTAT Genome Resource Libraries " + \ + "in the given directory:\n\t{:s}".format(top_dir_path)) + elif len(build_dirs_in_subdirs) == 1: + # The genome_build_directory is inside of the subdir_path directory. + print "2b, Found it." + genome_build_directory = build_dirs_in_subdirs[0] + elif len(build_dirs_in_sub_subdirs) == 1: + # The genome_build_directory is inside of the subdir_path directory. + print "3b, Found it." + genome_build_directory = build_dirs_in_sub_subdirs[0] + elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1: + print "\n***************************************" + print "Unable to find CTAT Genome Resource Library " + \ + "in the given directory:\n\t{:s}".format(top_dir_path) + print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename) + print_directory_contents(top_dir_path, 2) + print "***************************************\n" + raise ValueError("Unable to find CTAT Genome Resource Library " + \ + "in the given directory:\n\t{:s}".format(top_dir_path)) + elif (len(sub_subdirs_with_genome_files) == 1): + print "3c, Maybe found it." + genome_build_directory = sub_subdirs_with_genome_files[0] + print_warning = True + elif (len(subdirs_with_genome_files) == 1): + print "2c, Maybe found it." + genome_build_directory = subdirs_with_genome_files[0] + print_warning = True + elif (_CTAT_RefGenome_Filename in top_dir_contents): + print "1c. Maybe found it." + genome_build_directory = top_dir_path + print_warning = True + else: + print "\n***************************************" + print "Unable to find CTAT Genome Resource Library " + \ + "in the given directory:\n\t{:s}".format(top_dir_path) + print_directory_contents(top_dir_path, 2) + print "***************************************\n" + raise ValueError("Unable to find CTAT Genome Resource Library " + \ + "in the given directory:\n\t{:s}".format(top_dir_path)) + # end else + # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa). + if (genome_build_directory is None): + print "\n***************************************" + print "Cannot find the CTAT Genome Resource Library " + \ + "in the given directory:\n\t{:s}".format(top_dir_path) + print_directory_contents(top_dir_path, 2) + print "***************************************\n" + raise ValueError("Cannot find the CTAT Genome Resource Library " + \ + "in the given directory:\n\t{:s}".format(top_dir_path)) + elif (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)): + print "\n***************************************" + print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \ + "in the genome build directory:\n\t{:s}".format(genome_build_directory) + print_directory_contents(genome_build_directory, 2) + print "***************************************\n" + if print_warning and genome_build_directory: + print "\n***************************************" + print "\nWARNING: Cannot find the CTAT Genome Resource Library," + \ + "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename) + print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory) + print_directory_contents(genome_build_directory, 2) + print "***************************************\n" + return genome_build_directory + def main(): #Parse Command Line parser = argparse.ArgumentParser() - parser.add_argument('-s', '--source_url', default="", \ - help='This is the url of a file with the data. They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.') - parser.add_argument('-n', '--display_name', default="", \ + parser.add_argument('-s', '--source_url', default='', \ + help='This is the url of a file with the data. ' + \ + 'They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.') + parser.add_argument('-n', '--display_name', default='', \ help='Is used as the display name for the entry of this Genome Resource Library in the data table.') - parser.add_argument('-p', '--destination_path', \ - help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.') parser.add_argument('-o', '--output_filename', \ help='Name of the output file, where the json dictionary will be written.') parser.add_argument('-f', '--force_download', - help='Forces download of the Genome Resource Library, even if previously downloaded.', action="store_true") + help='Forces download of the Genome Resource Library, even if previously downloaded.', action='store_true') parser.add_argument('-b', '--build', help='Forces build/rebuild the Genome Resource Library, even if previously built. ' + \ - 'Must have downloaded source_data for this to work.', action="store_true") + 'Must have downloaded source_data for this to work.', action='store_true') parser.add_argument('-m', '--gmap_build', help='Must be selected if you want the library to be gmapped. ' + \ - 'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action="store_true") + 'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action='store_true') + requiredNamed = parser.add_argument_group('required named arguments') + requiredNamed.add_argument('-p', '--destination_path', required=True, \ + help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.') args = parser.parse_args() # All of the input parameters are written by default to the output file prior to @@ -418,39 +556,40 @@ print "The value of source_url argument is:\n\t{:s}".format(str(args.source_url)) - # FIX - not sure the lib_was_downloaded actually serves a purpose... + # FIX - not sure lib_was_downloaded actually serves a purpose... lib_was_downloaded = False download_has_source_data = False - # If we do not download the directory, the destination_path should be the - # location of the genome resource library. downloaded_directory = None - # FIX - look inside of the args.destination_path to see if the build directory is inside it or is it. genome_build_directory = None # FIX - need to make sure we are handling all "possible" combinations of arguments. # Probably would be good if we could simplify/remove some of them. + # But I think the current interface is using them all. if (args.source_url != ""): downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded = \ download_from_BroadInst(source=args.source_url, \ destination=args.destination_path, \ force_download=args.force_download) else: - genome_build_directory = args.destination_path - if not os.path.exists(genome_build_directory): - raise ValueError("Cannot find the CTAT Genome Resource Library. " + \ - "The directory does not exist:\n\t{:s}".format(genome_build_directory)) - # else: - # FIX - Check if there is an actual CTAT Genome Resource Lib there. - # _CTAT_BuildDir_Name + genome_build_directory = search_for_genome_build_dir(args.destination_path) print "\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory) - # Take out builds for testing. # FIX - We should leave a file indicating build success the same way we do for download success. + # To take out builds for testing, coment out the next four lines. if (download_has_source_data or args.build or args.gmap_build) : build_the_library(downloaded_directory, genome_build_directory, args.build, args.gmap_build) elif (args.gmap_build): gmap_the_library(genome_build_directory) + # The following looks to see if the library actually exists after the build, + # and raises an error if it cannot find the library files. + # The reassignment of genome_build_directory should be superfluous, + # unless I made a mistake in the build code. + # FIX - need to get the genome name from the directory name, if there was no download. + #genome_build_directory, genome_name_from_dirname = search_for_genome_build_dir(genome_build_directory) + genome_build_directory = search_for_genome_build_dir(genome_build_directory) + + source_filename_root = None if (args.source_url != None) and (args.source_url != ""): # Get the name out of the source's filename. source_filename_root = args.source_url.split("/")[-1].split(".")[0] @@ -458,15 +597,14 @@ # Determine the display_name for the library. if (args.display_name is None) or (args.display_name == ""): if (source_filename_root != None) and (source_filename_root != ""): - # Get the name out of the source filename. + # Create the display_name from the source_filename_root. display_name = _CTAT_ResourceLib_DisplayNamePrefix + source_filename_root else: display_name = _CTAT_ResourceLib_DisplayNamePrefix + _CTAT_ResourceLib_DefaultGenome - print "WARNING: We do not have a genome name. Using a default name, that might not be correct." + print "WARNING: We do not have a genome name." else: display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name display_name = display_name.replace(" ","_") - print "The Genome Name will be set to: {:s}\n".format(display_name) # Create a unique_id for the library. datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f") @@ -477,7 +615,7 @@ else: unique_id = _CTAT_ResourceLib_DefaultGenome + datetime_stamp - print "The Resource Lib's display_name will be set to: {:s}\n".format(display_name) + print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name) print "Its unique_id will be set to: {:s}\n".format(unique_id) print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)