Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3
diff data_manager/add_ctat_resource_lib.py @ 8:b2e6ed40840a draft
Uploaded
author | trinity_ctat |
---|---|
date | Sat, 23 Jun 2018 15:40:54 -0400 |
parents | f22a13378750 |
children | 1717c42112ed |
line wrap: on
line diff
--- a/data_manager/add_ctat_resource_lib.py Fri May 11 16:06:47 2018 -0400 +++ b/data_manager/add_ctat_resource_lib.py Sat Jun 23 15:40:54 2018 -0400 @@ -35,6 +35,7 @@ from HTMLParser import HTMLParser _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' +_CTAT_MutationIndex_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/' _CTAT_Build_dirname = 'ctat_genome_lib_build_dir' _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_' _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome' @@ -42,9 +43,12 @@ _CTAT_RefGenome_Filename = 'ref_genome.fa' _CTAT_MouseGenome_Prefix = 'Mouse' _CTAT_HumanGenome_Prefix = 'GRCh' -_NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct. +_NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct. +_NumBytesNeededForIndexes = 21474836480 # 20 Gigabytes. FIX - This might not be correct. _Download_TestFile = "write_testfile.txt" _DownloadSuccessFile = 'download_succeeded.txt' +_LibBuiltSuccessFile = 'build_succeeded.txt' +_MutationDownloadSuccessFile = 'mutation_index_download_succeeded.txt' class FileListParser(HTMLParser): def __init__(self): @@ -81,7 +85,7 @@ # The urls should look like: # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz - # But is actuality, they are coming in looking like: + # But in actuality, they are coming in looking like: # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz # Write code to handle both situations, or an ftp: url. @@ -91,11 +95,44 @@ # Assume the path is relative to the page location. 
full_url_path = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, url) filename = url.split("/")[-1] + # if filename.split("_")[0] != _CTAT_MouseGenome_Prefix: + # # Don't put in the mouse genome options for now. + # # The mouse genome option is not handled correctly yet + # options.append((filename, full_url_path, i == 0)) + # Mouse genomes should work now (we hope) - FIX - still not tested. + options.append((filename, full_url_path, i == 0)) + options.sort() # So the list will be in alphabetical order. + # return a tuple of the urls + print "The list being returned as options is:" + print "{:s}\n".format(str(options)) + return options - if filename.split("_")[0] != _CTAT_MouseGenome_Prefix: - # Take out the mouse genome options for now. - # The mouse genome option is not handled correctly yet - options.append((filename, full_url_path, i == 0)) +def get_mutation_index_urls(): + # open the url and retrieve the urls of the files in the directory. + resource = urllib2.urlopen(_CTAT_MutationIndex_URL) + theHTML = resource.read() + filelist_parser = FileListParser() + filelist_parser.feed(theHTML) + # For dynamic options need to return an interable with contents that are tuples with 3 items. + # Item one is a string that is the display name put into the option list. + # Item two is the value that is put into the parameter associated with the option list. + # Item three is a True or False value, indicating whether the item is selected. + options = [] + for i, url in enumerate(filelist_parser.urls): + # The urls should look like: + # https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz + # https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz + # But in actuality, they are coming in looking like: + # hg19.tar.gz + # mc7.tar.gz + # Write code to handle both situations, or an ftp: url. 
+ if (url.split(":")[0] == "http") or (url.split(":")[0] == "https") or (url.split(":")[0] == "ftp"): + full_url_path = url + else: + # Assume the path is relative to the page location. + full_url_path = "{:s}/{:s}".format(_CTAT_MutationIndex_URL, url) + filename = url.split("/")[-1] + options.append((filename, full_url_path, i == 0)) options.sort() # So the list will be in alphabetical order. # return a tuple of the urls print "The list being returned as options is:" @@ -114,6 +151,7 @@ # trained_url = params['param_dict']['trained_url'] # return trained_url +# The following procedure is used to help with debugging and for user information. def print_directory_contents(dir_path, num_levels): if num_levels > 0: if os.path.exists(dir_path) and os.path.isdir(dir_path): @@ -122,10 +160,13 @@ else: print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path) if num_levels > 1: - for filename in os.listdir(dir_path): - filename_path = "{:s}/{:s}".format(dir_path, filename) - if os.path.exists(filename_path) and os.path.isdir(filename_path): - print_directory_contents(filename_path, num_levels-1) + if os.path.exists(dir_path) and os.path.isdir(dir_path): + for filename in os.listdir(dir_path): + filename_path = "{:s}/{:s}".format(dir_path, filename) + if os.path.exists(filename_path) and os.path.isdir(filename_path): + print_directory_contents(filename_path, num_levels-1) + else: + print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path) def download_from_BroadInst(source, destination, force_download): # Input Parameters @@ -154,9 +195,10 @@ # Since it doesn't always do the download, the function returns whether download occurred. lib_was_downloaded = False if len(source.split(":")) == 1: - # Might want to check that it is one of "http", "ftp", "file" or other accepted url starts. + # Then we were given a source_url without a leading https: or similar. 
# Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL. source = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, source) + # else we might want to check that it is one of "http", "ftp", "file" or other accepted url starts. print "In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source)) @@ -207,6 +249,10 @@ # We use it to check for a previous download or extraction among other things. orig_files_in_destdir = set(os.listdir(cannonical_destination)) # See whether the file has been downloaded already. + # FIX - Try looking one or two directories above, as well as current directory, + # and maybe one directory below, + # for the download success file? + # Not sure about this though... download_success_file = "{:s}.{:s}".format(root_genome_dirname, _DownloadSuccessFile) download_success_file_path = "{:s}/{:s}".format(cannonical_destination, download_success_file) if ((download_success_file not in orig_files_in_destdir) \ @@ -329,6 +375,74 @@ print_directory_contents(genome_build_directory, 2) print "*******************************\n" +def download_mutation_indexes(source_url, genome_build_directory, force_download): + print "\n*****************************************************************" + print "* The real mutation indexes have not yet been created. Just testing. *" + print "*****************************************************************\n" + # It is assumed that this procedure is only called with a valid genome_build_directory. + # No checks are made to see whether it exists, whether we can write to it, etc. + index_was_downloaded = False + if len(source_url.split(":")) == 1: + # Then we were given a source_url without a leading https: or similar. + # Assume we only were given the filename and that it exists at _CTAT_MutationIndex_URL. + source_url = "{:s}/{:s}".format(_CTAT_MutationIndex_URL, source_url) + + print "In download_mutation_indexes(). 
The source_url is:\n\t{:s}".format(str(source_url)) + + # Get the root filename of the Genome Directory. + src_filename = source_url.split("/")[-1] + root_genome_dirname = src_filename.split(".")[0] + print "The mutation index file to be downloaded and extracted is {:s}".format(src_filename) + + # Get the list of files in the directory, + # We use it to check for a previous download or extraction among other things. + orig_files_in_destdir = set(os.listdir(genome_build_directory)) + # See whether the index file has been downloaded already. + download_success_file = "{:s}.{:s}".format(root_genome_dirname, _MutationDownloadSuccessFile) + download_success_file_path = "{:s}/{:s}".format(genome_build_directory, download_success_file) + if ((download_success_file not in orig_files_in_destdir) or force_download): + # Check whether there is enough space on the device for the library. + statvfs = os.statvfs(genome_build_directory) + # fs_size = statvfs.f_frsize * statvfs.f_blocks # Size of filesystem in bytes + # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree # Actual number of free bytes + num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail # Number of free bytes that ordinary users + # are allowed to use (excl. reserved space) + if (num_avail_bytes < _NumBytesNeededForIndexes): + raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \ + " for the indexes on the device of the destination directory: " + \ + "{:s}".format(genome_build_directory)) + if (download_success_file in orig_files_in_destdir): + # Since we are redoing the download, + # the success file needs to be removed + # until the download has succeeded. + os.remove(download_success_file_path) + # We want to transfer and untar the file without storing the tar file, because that + # adds all that much more space to the needed amount of free space on the disk. + # Use subprocess to pipe the output of curl into tar. 
+ command = "curl --silent {:s} | tar -xzf - -C {:s}".format(source_url, genome_build_directory) + try: # to send the command that downloads and extracts the file. + command_output = subprocess.check_output(command, shell=True) + # FIX - not sure check_output is what we want to use. If we want to have an error raised on + # any problem, maybe we should not be checking output. + except subprocess.CalledProcessError: + print "ERROR: Trying to run the following command:\n\t{:s}".format(command) + raise + else: + index_was_downloaded = True + # Some code to help us if errors occur. + print "\n*********************************************************" + print "* Finished download and extraction of Mutation Indexes. *" + print_directory_contents(genome_build_directory, 2) + print "*********************************************************\n" + try: + # Create a file to indicate that the download succeeded. + subprocess.check_call("touch {:s}".format(download_success_file_path), shell=True) + except IOError: + print "The download_success file could not be created: " + \ + "{:s}".format(download_success_file_path) + raise + return index_was_downloaded + def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build): """ genome_source_directory is the location of the source_data needed to build the library. Normally it is fully specified, but could be relative. @@ -350,14 +464,27 @@ gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa" """ + # Get the root filename of the Genome Directory. + src_filename = genome_source_directory.split("/")[-1] + root_genome_dirname = src_filename.split(".")[0] print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(genome_source_directory) - if (genome_source_directory != "" ) and build: + # See whether the library has been built already. The success file is written into the source directory. 
+ files_in_sourcedir = set(os.listdir(genome_source_directory)) + build_success_file = "{:s}.{:s}".format(root_genome_dirname, _LibBuiltSuccessFile) + build_success_file_path = "{:s}/{:s}".format(genome_source_directory, build_success_file) + if (genome_source_directory != "" ) and \ + ((build_success_file not in files_in_sourcedir) or build): if os.path.exists(genome_source_directory): os.chdir(genome_source_directory) + if (build_success_file in files_in_sourcedir): + # Since we are redoing the build, + # the success file needs to be removed + # until the build has succeeded. + os.remove(build_success_file_path) # Create the command that builds the Genome Resource Library form the source data. command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \ "--pfam_db PFAM.domtblout.dat.gz " + \ - "--output_dir {:s}".format(genome_build_directory) + "--output_dir {:s} ".format(genome_build_directory) found_HumanFusionLib = False HumanFusionLib_filename = "NoFileFound" for filename in os.listdir(genome_source_directory): @@ -398,6 +525,13 @@ "The source directory does not exist:\n\t{:s}".format(genome_source_directory)) elif gmap_build: gmap_the_library(genome_build_directory) + try: + # Create a file to indicate that the build succeeded. 
+ subprocess.check_call("touch {:s}".format(build_success_file_path), shell=True) + except IOError: + print "The build_success file could not be created: " + \ + "{:s}".format(build_success_file_path) + raise def search_for_genome_build_dir(top_dir_path): # If we do not download the directory, the topdir_path could be the @@ -563,14 +697,20 @@ help='Is used as the display name for the entry of this Genome Resource Library in the data table.') parser.add_argument('-o', '--output_filename', \ help='Name of the output file, where the json dictionary will be written.') - parser.add_argument('-f', '--force_download', + parser.add_argument('-d', '--force_download', \ help='Forces download of the Genome Resource Library, even if previously downloaded.', action='store_true') - parser.add_argument('-b', '--build', + parser.add_argument('-b', '--build', \ help='Forces build/rebuild the Genome Resource Library, even if previously built. ' + \ 'Must have downloaded source_data for this to work.', action='store_true') - parser.add_argument('-m', '--gmap_build', + parser.add_argument('-g', '--gmap_build', \ help='Must be selected if you want the library to be gmapped. ' + \ 'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action='store_true') + parser.add_argument('-m', '--download_mutation_indexes', default='', \ + help='Set to the url of the mutation indexes for the Library. 
' + \ + 'Will download mutation indexes into the Genome Resource Library.') + parser.add_argument('-f', '--force_mutation_indexes_download', \ + help='Forces the mutation indexes to download, ' + \ + 'even if previously downloaded to this Library.', action='store_true') requiredNamed = parser.add_argument_group('required named arguments') requiredNamed.add_argument('-p', '--destination_path', required=True, \ help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.') @@ -587,7 +727,13 @@ print "The value of source_url argument is:\n\t{:s}".format(str(args.source_url)) # FIX - not sure lib_was_downloaded actually serves a purpose... + # The original intent was to check whether an attempted download actually succeeded before proceeding, + # but I believe that in those situations, currently, exceptions are raised. + # FIX - Need to double check that. Sometimes, although we are told to download, the function + # could find that the files are already there, successfully downloaded from a prior attempt, + # and does not re-download them. lib_was_downloaded = False + lib_was_built = False download_has_source_data = False downloaded_directory = None genome_build_directory = None @@ -605,18 +751,31 @@ print "\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory) # FIX - We should leave a file indicating build success the same way we do for download success. - # To take out builds for testing, coment out the next four lines. - if (download_has_source_data or args.build or args.gmap_build): + # To take out builds for testing, comment out the lines that do the building. + # The command that builds the ctat genome library also has an option for building the gmap indexes. + # That is why the gmap_build value is sent to build_the_library(), but if we are not building the + # library, the user might still be asking for a gmap_build. 
That is done after rechecking for the + # genome_build_directory. + if (download_has_source_data or args.build): build_the_library(downloaded_directory, genome_build_directory, True, args.gmap_build) - elif (args.gmap_build): - gmap_the_library(genome_build_directory) - + lib_was_built = True # The following looks to see if the library actually exists after the build, # and raises an error if it cannot find the library files. # The reassignment of genome_build_directory should be superfluous, + # since genome_build_directory should already point to the correct directory, # unless I made a mistake in the build code. genome_build_directory = search_for_genome_build_dir(genome_build_directory) + if (args.gmap_build and not lib_was_built): + # If we did not build the genome resource library + # the user might still be asking for a gmap_build. + gmap_the_library(genome_build_directory) + + if (args.download_mutation_indexes != ""): + download_mutation_indexes(source_url=args.download_mutation_indexes, \ + genome_build_directory=genome_build_directory, \ + force_download=args.force_mutation_indexes_download) + # Need to get the genome name. genome_name = find_genome_name_in_path(args.source_url) if genome_name is None: @@ -641,7 +800,7 @@ # Create a unique_id for the library. datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f") - unique_id = genome_name + datetime_stamp + unique_id = genome_name + "." + datetime_stamp print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name) print "Its unique_id will be set to: {:s}\n".format(unique_id)