Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3

--- a/data_manager/add_ctat_resource_lib.py	Fri May 04 13:19:47 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.py	Fri May 11 16:06:47 2018 -0400
@@ -40,6 +40,8 @@
 _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
 _CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
 _CTAT_RefGenome_Filename = 'ref_genome.fa'
+_CTAT_MouseGenome_Prefix = 'Mouse'
+_CTAT_HumanGenome_Prefix = 'GRCh'
 _NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct.
 _Download_TestFile = "write_testfile.txt"
 _DownloadSuccessFile = 'download_succeeded.txt'
@@ -90,7 +92,7 @@
             full_url_path = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, url)
         filename = url.split("/")[-1]

-        if filename.split("_")[0] != "Mouse":
+        if filename.split("_")[0] != _CTAT_MouseGenome_Prefix:
             # Take out the mouse genome options for now.
             # The mouse genome option is not handled correctly yet
             options.append((filename, full_url_path, i == 0))
@@ -151,6 +153,10 @@
     # lib_was_downloaded
     #     Since it doesn't always do the download, the function returns whether download occurred.
     lib_was_downloaded = False
+    if len(source.split(":")) == 1:
+        # Might want to check that it is one of "http", "ftp", "file" or other accepted url starts.
+        # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
+        source = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, source)

     print "In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source))

@@ -160,6 +166,8 @@
     # If the src_filename indicates it is a source file, as opposed to plug-n-play,
     # then we may need to do some post processing on it.
     type_of_download = src_filename.split(".")[1]
+    print "The file to be extracted is {:s}".format(src_filename)
+    print "The type of download is {:s}".format(type_of_download)
     download_has_source_data = (type_of_download == "source_data")

     # We want to make sure that destination is absolute fully specified path.
@@ -184,7 +192,7 @@
         # in the code, something is wrong. Raise an error.
         raise OSError("The destination directory could not be created: " + \
                       "{:s}".format(cannonical_destination))
-    test_writing_file = "{:s}/{:s}".format(cannonical_destination, _Download_TestFile)
+    test_writing_file = "{:s}/{:s}.{:s}".format(cannonical_destination, root_genome_dirname, _Download_TestFile)
     try:
         filehandle = open(test_writing_file, "w")
         filehandle.write("Testing writing to this file.")
@@ -199,8 +207,9 @@
     # We use it to check for a previous download or extraction among other things.
     orig_files_in_destdir = set(os.listdir(cannonical_destination))
     # See whether the file has been downloaded already.
-    download_success_file_path = "{:s}/{:s}".format(cannonical_destination, _DownloadSuccessFile)
-    if ((_DownloadSuccessFile not in orig_files_in_destdir) \
+    download_success_file = "{:s}.{:s}".format(root_genome_dirname, _DownloadSuccessFile)
+    download_success_file_path = "{:s}/{:s}".format(cannonical_destination, download_success_file)
+    if ((download_success_file not in orig_files_in_destdir) \
         or (root_genome_dirname not in orig_files_in_destdir) \
         or force_download):
         # Check whether there is enough space on the device for the library.
@@ -231,7 +240,7 @@
         #try:
         #    tarfile.open(full_filepath, mode='r:*').extractall()

-        if (_DownloadSuccessFile in orig_files_in_destdir):
+        if (download_success_file in orig_files_in_destdir):
             # Since we are redoing the download,
             # the success file needs to be removed
             # until the download has succeeded.
@@ -340,6 +349,8 @@
             --genome_lib_dir ctat_genome_lib_build_dir
         gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa"
     """
+
+    print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(genome_source_directory)
     if (genome_source_directory != "" ) and build:
         if os.path.exists(genome_source_directory):
             os.chdir(genome_source_directory)
@@ -398,27 +409,31 @@
     # 3) or is it inside a subdirectory of the given directory.
     # The source_data downloads are built to a directory named _CTAT_Build_dirname,
     # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
+    # We also look for the genome name and return that, if we find it in the
+    # directory name of the directory holding the build directory.
+    top_dir_full_path = os.path.realpath(top_dir_path)
     genome_build_directory = None
+    genome_name_from_dirname = None
     print_warning = False

-    if not os.path.exists(top_dir_path):
+    if not os.path.exists(top_dir_full_path):
         raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
-            "The given directory does not exist:\n\t{:s}".format(top_dir_path))
-    elif not os.path.isdir(top_dir_path):
+            "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
+    elif not os.path.isdir(top_dir_full_path):
         raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
-            "The given directory is not a directory:\n\t{:s}".format(top_dir_path))
-    if top_dir_path.split("/")[-1] == _CTAT_Build_dirname:
-        print "Build directory is: {:s}".format(top_dir_path)
+            "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
+    if top_dir_full_path.split("/")[-1] == _CTAT_Build_dirname:
+        print "Build directory is: {:s}".format(top_dir_full_path)
         # The top_dir_path is the path to the genome_build_directory.
-        genome_build_directory = top_dir_path
+        genome_build_directory = top_dir_full_path
     else:
         # Look for it inside of the top_dir_path directory.
-        print "Looking inside of: {:s}".format(top_dir_path)
-        top_dir_contents = os.listdir(top_dir_path)
+        print "Looking inside of: {:s}".format(top_dir_full_path)
+        top_dir_contents = os.listdir(top_dir_full_path)
         if (_CTAT_Build_dirname in top_dir_contents):
             # The genome_build_directory is inside of the top_dir_path directory.
             print "1. Found it."
-            genome_build_directory = "{:s}/{:s}".format(top_dir_path,_CTAT_Build_dirname)
+            genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
         else:
             # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
             # Look down the directory tree two levels.
@@ -426,9 +441,9 @@
             subdirs_with_genome_files = list()
             build_dirs_in_sub_subdirs = list()
             sub_subdirs_with_genome_files = list()
-            subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_path,entry)))]
+            subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
             for subdir in subdirs:
-                subdir_path = "{:s}/{:s}".format(top_dir_path, subdir)
+                subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
                 subdir_path_contents = os.listdir(subdir_path)
                 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
                 if (_CTAT_Build_dirname in subdir_path_contents):
@@ -456,11 +471,11 @@
             if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
                 print "\n***************************************"
                 print "Found multiple CTAT Genome Resource Libraries " + \
-                    "in the given directory:\n\t{:s}".format(top_dir_path)
-                print_directory_contents(top_dir_path, 2)
+                    "in the given directory:\n\t{:s}".format(top_dir_full_path)
+                print_directory_contents(top_dir_full_path, 2)
                 print "***************************************\n"
                 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
-                    "in the given directory:\n\t{:s}".format(top_dir_path))
+                    "in the given directory:\n\t{:s}".format(top_dir_full_path))
             elif len(build_dirs_in_subdirs) == 1:
                 # The genome_build_directory is inside of the subdir_path directory.
                 print "2b, Found it."
@@ -472,12 +487,12 @@
             elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
                 print "\n***************************************"
                 print "Unable to find CTAT Genome Resource Library " + \
-                      "in the given directory:\n\t{:s}".format(top_dir_path)
+                      "in the given directory:\n\t{:s}".format(top_dir_full_path)
                 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
-                print_directory_contents(top_dir_path, 2)
+                print_directory_contents(top_dir_full_path, 2)
                 print "***************************************\n"
                 raise ValueError("Unable to find CTAT Genome Resource Library " + \
-                    "in the given directory:\n\t{:s}".format(top_dir_path))
+                    "in the given directory:\n\t{:s}".format(top_dir_full_path))
             elif (len(sub_subdirs_with_genome_files) == 1):
                 print "3c, Maybe found it."
                 genome_build_directory = sub_subdirs_with_genome_files[0]
@@ -488,41 +503,56 @@
                 print_warning = True
             elif (_CTAT_RefGenome_Filename in top_dir_contents):
                 print "1c. Maybe found it."
-                genome_build_directory = top_dir_path
+                genome_build_directory = top_dir_full_path
                 print_warning = True
             else:
                 print "\n***************************************"
                 print "Unable to find CTAT Genome Resource Library " + \
-                      "in the given directory:\n\t{:s}".format(top_dir_path)
-                print_directory_contents(top_dir_path, 2)
+                      "in the given directory:\n\t{:s}".format(top_dir_full_path)
+                print_directory_contents(top_dir_full_path, 2)
                 print "***************************************\n"
                 raise ValueError("Unable to find CTAT Genome Resource Library " + \
-                    "in the given directory:\n\t{:s}".format(top_dir_path))
+                    "in the given directory:\n\t{:s}".format(top_dir_full_path))
         # end else
     # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
     if (genome_build_directory is None):
         print "\n***************************************"
         print "Cannot find the CTAT Genome Resource Library " + \
-            "in the given directory:\n\t{:s}".format(top_dir_path)
-        print_directory_contents(top_dir_path, 2)
+            "in the given directory:\n\t{:s}".format(top_dir_full_path)
+        print_directory_contents(top_dir_full_path, 2)
         print "***************************************\n"
         raise ValueError("Cannot find the CTAT Genome Resource Library " + \
-            "in the given directory:\n\t{:s}".format(top_dir_path))
-    elif (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
-        print "\n***************************************"
-        print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \
-            "in the genome build directory:\n\t{:s}".format(genome_build_directory)
-        print_directory_contents(genome_build_directory, 2)
-        print "***************************************\n"
-    if print_warning and genome_build_directory:
-        print "\n***************************************"
-        print "\nWARNING: Cannot find the CTAT Genome Resource Library," + \
-            "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
-        print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
-        print_directory_contents(genome_build_directory, 2)
-        print "***************************************\n"
+            "in the given directory:\n\t{:s}".format(top_dir_full_path))
+    else:
+        if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
+            print "\n***************************************"
+            print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \
+                "in the genome build directory:\n\t{:s}".format(genome_build_directory)
+            print_directory_contents(genome_build_directory, 2)
+            print "***************************************\n"
+        if print_warning and genome_build_directory:
+            print "\n***************************************"
+            print "\nWARNING: Cannot find the CTAT Genome Resource Library," + \
+                "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
+            print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
+            print_directory_contents(genome_build_directory, 2)
+            print "***************************************\n"
     return genome_build_directory

+def find_genome_name_in_path(path):
+    """Return the genome name found in path, or None when there is none.
+    Genome-named path elements look like GRCh37_v19_CTAT_lib_Feb092018
+    or Mouse_M16_CTAT_lib_Feb202018; text from the first "." on is dropped."""
+    if (path is None) or (path == ""):
+        return None
+    genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
+    # Walk the elements right to left so the last match in the path wins.
+    for element in reversed(path.split("/")):
+        if element.startswith(genome_prefixes):
+            # Remove any extension that might be in the filename.
+            return element.split(".")[0]
+    return None
+
 def main():
     #Parse Command Line
     parser = argparse.ArgumentParser()
@@ -576,8 +606,8 @@

     # FIX - We should leave a file indicating build success the same way we do for download success.
     # To take out builds for testing, coment out the next four lines.
-    if (download_has_source_data or args.build or args.gmap_build) :
-        build_the_library(downloaded_directory, genome_build_directory, args.build, args.gmap_build)
+    if (download_has_source_data or args.build or args.gmap_build):
+        build_the_library(downloaded_directory, genome_build_directory, True, args.gmap_build)
     elif (args.gmap_build):
         gmap_the_library(genome_build_directory)

@@ -585,35 +615,33 @@
     # and raises an error if it cannot find the library files.
     # The reassignment of genome_build_directory should be superfluous,
     # unless I made a mistake in the build code.
-    # FIX - need to get the genome name from the directory name, if there was no download.
-    #genome_build_directory, genome_name_from_dirname = search_for_genome_build_dir(genome_build_directory)
     genome_build_directory = search_for_genome_build_dir(genome_build_directory)

-    source_filename_root = None
-    if (args.source_url != None) and (args.source_url != ""):
-        # Get the name out of the source's filename.
-        source_filename_root = args.source_url.split("/")[-1].split(".")[0]
+    # Need to get the genome name.
+    genome_name = find_genome_name_in_path(args.source_url)
+    if genome_name is None:
+        genome_name = find_genome_name_in_path(genome_build_directory)
+    if genome_name is None:
+        genome_name = find_genome_name_in_path(downloaded_directory)
+    if genome_name is None:
+        genome_name = find_genome_name_in_path(args.destination_path)
+    if genome_name is None:
+        genome_name = find_genome_name_in_path(args.display_name)
+    if genome_name is None:
+        genome_name = _CTAT_ResourceLib_DefaultGenome
+        print "WARNING: We could not find a genome name in any of the directory paths."

     # Determine the display_name for the library.
     if (args.display_name is None) or (args.display_name == ""):
-        if (source_filename_root != None) and (source_filename_root != ""):
-            # Create the display_name from the source_filename_root.
-            display_name = _CTAT_ResourceLib_DisplayNamePrefix + source_filename_root
-        else:
-            display_name = _CTAT_ResourceLib_DisplayNamePrefix + _CTAT_ResourceLib_DefaultGenome
-            print "WARNING: We do not have a genome name."
+        # Create the display_name from the genome_name.
+        display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
     else:
         display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
     display_name = display_name.replace(" ","_")

     # Create a unique_id for the library.
     datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
-    if (source_filename_root != None) and (source_filename_root != ""):
-        unique_id = source_filename_root + datetime_stamp
-    elif (downloaded_directory != None) and (downloaded_directory != ""):
-        unique_id = os.path.basename(downloaded_directory).split(".")[0]
-    else:
-        unique_id = _CTAT_ResourceLib_DefaultGenome + datetime_stamp
+    unique_id = genome_name + datetime_stamp

     print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
     print "Its unique_id will be set to: {:s}\n".format(unique_id)
Binary file data_manager/ctat_genome_resource_libs_data_manager.tar.gz has changed