Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3

--- a/data_manager/add_ctat_resource_lib.py	Tue May 01 15:40:08 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.py	Fri May 04 13:19:47 2018 -0400
@@ -35,9 +35,11 @@
 from HTMLParser import HTMLParser

 _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
-_CTAT_BuildDir_Name = 'ctat_genome_lib_build_dir'
+_CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
 _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
 _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
+_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
+_CTAT_RefGenome_Filename = 'ref_genome.fa'
 _NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct.
 _Download_TestFile = "write_testfile.txt"
 _DownloadSuccessFile = 'download_succeeded.txt'
@@ -110,6 +112,19 @@
 #    trained_url = params['param_dict']['trained_url']
 #    return trained_url

+def print_directory_contents(dir_path, num_levels):
+    """Print an `ls -la` listing of dir_path and, recursively, of its subdirectories.
+
+    num_levels is how many directory levels to list: 1 lists only dir_path,
+    2 also lists its immediate subdirectories, and so on. Nothing is printed
+    when num_levels is less than 1. This is diagnostic output only, so a
+    missing path is reported with a message rather than raised as an error.
+    """
+    if num_levels < 1:
+        return
+    if not os.path.isdir(dir_path):
+        # Stop here: os.listdir() on a missing or non-directory path raises OSError,
+        # which would mask the original exception when called from a finally block.
+        print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
+        return
+    print "\nDirectory {:s}:".format(dir_path)
+    subprocess.call("ls -la {:s} 2>&1".format(dir_path), shell=True)
+    if num_levels > 1:
+        for filename in os.listdir(dir_path):
+            filename_path = "{:s}/{:s}".format(dir_path, filename)
+            if os.path.isdir(filename_path):
+                print_directory_contents(filename_path, num_levels-1)
+
 def download_from_BroadInst(source, destination, force_download):
     # Input Parameters
     # source is the full URL of the file we want to download.
@@ -224,7 +239,7 @@
         # We want to transfer and untar the file without storing the tar file, because that
         # adds all that much more space to the needed amount of free space on the disk.
         # Use subprocess to pipe the output of curl into tar.
-        command = "curl {:s} | tar -xzvf - -C {:s}".format(source, cannonical_destination)
+        command = "curl --silent {:s} | tar -xzf - -C {:s}".format(source, cannonical_destination)
         try: # to send the command that downloads and extracts the file.
             command_output = subprocess.check_output(command, shell=True)
             # FIX - not sure check_output is what we want to use. If we want to have an error raised on
@@ -237,8 +252,8 @@

     # Some code to help us if errors occur.
     print "\n*******************************\nFinished download and extraction."
-    subprocess.check_call("ls -lad {:s}/*".format(cannonical_destination), shell=True)
-    subprocess.check_call("ls -lad {:s}/*/*".format(cannonical_destination), shell=True)
+    print_directory_contents(cannonical_destination, 2)
+    print "*******************************\n"

     newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
     if (root_genome_dirname not in newfiles_in_destdir):
@@ -272,10 +287,14 @@
         # Look for the build directory, or specify the path where it should be placed.
         if len(os.listdir(downloaded_directory)) == 1:
             # Then that one file is a subdirectory that should be the downloaded_directory.
+            # That is how the plug-n-play directories are structured.
             subdir_filename = os.listdir(downloaded_directory)[0]
             genome_build_directory = "{:s}/{:s}".format(downloaded_directory, subdir_filename)
         else:
-            genome_build_directory = "{:s}/{:s}".format(downloaded_directory, _CTAT_BuildDir_Name)
+            # In this case, we have source_data in the directory. The default will be to create
+            # the build directory in the downloaded_directory with the default _CTAT_Build_dirname.
+            # In this case, this directory will not exist yet until the library is built.
+            genome_build_directory = "{:s}/{:s}".format(downloaded_directory, _CTAT_Build_dirname)
     else:
         raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
                              "\n\t{:s}".format(cannonical_destination))
@@ -285,8 +304,10 @@
 def gmap_the_library(genome_build_directory):
         # This is the processing that needs to happen for gmap-fusion to work.
         # genome_build_directory should normally be a fully specified path,
-        # though it should work if it is relative.
-        command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \
+        # though this function should work even if it is relative.
+        # The command prints messages out to stderr, even when there is not an error,
+        # so route stderr to stdout. Otherwise, galaxy thinks an error occurred.
+        command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format( \
                   genome_build_directory, genome_build_directory)
         try: # to send the gmap_build command.
             command_output = subprocess.check_output(command, shell=True)
@@ -296,23 +317,8 @@
         finally:
             # Some code to help us if errors occur.
             print "\n*******************************\nAfter running gmap_build."
-            if os.path.exists(genome_build_directory):
-                print "\nBuild Directory {:s}:".format(genome_build_directory)
-                subprocess.check_call("ls -la {:s}".format(genome_build_directory), shell=True)
-                dir_entries = os.listdir(genome_build_directory)
-                for entry in dir_entries:
-                    entry_path = "{:s}/{:s}".format(genome_build_directory, entry)
-                    print "\nDirectory {:s}:".format(entry_path)
-                    subprocess.check_call("ls -la {:s}".format(entry_path), shell=True)
-                    if os.path.isdir(entry_path):
-                        subdir_entries = os.listdir(entry_path)
-                        for subdir_entry in subdir_entries:
-                            subdir_entry_path = "{:s}/{:s}".format(entry_path, subdir_entry)
-                            print "\nDirectory {:s}:".format(subdir_entry_path)
-                            subprocess.check_call("ls -la {:s}".format(subdir_entry_path), shell=True)
-            else:
-                print "Genome Build Directory does not exist:\n\t{:s}".format(genome_build_directory)
-            print "*******************************"
+            print_directory_contents(genome_build_directory, 2)
+            print "*******************************\n"

 def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build):
     """ genome_source_directory is the location of the source_data needed to build the library.
@@ -337,14 +343,31 @@
     if (genome_source_directory != "" ) and build:
         if os.path.exists(genome_source_directory):
             os.chdir(genome_source_directory)
-            # FIX - look for a fusion_annot_lib and include it, else omit it.
+            # Create the command that builds the Genome Resource Library from the source data.
             command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
-                      "--fusion_annot_lib CTAT_HumanFusionLib.v0.1.0.dat.gz " + \
-                      "--annot_filter_rule AnnotFilterRule.pm " + \
                       "--pfam_db PFAM.domtblout.dat.gz " + \
-                      "--output_dir {:s} ".format(genome_build_directory)
+                      "--output_dir {:s}".format(genome_build_directory)
+            found_HumanFusionLib = False
+            HumanFusionLib_filename = "NoFileFound"
+            for filename in os.listdir(genome_source_directory):
+                # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
+                # We only check the prefix, in case other versions are used later.
+                # I assume there is only one in the directory, but if there is more than one,
+                # the last one returned by os.listdir() (whose order is arbitrary) will be used.
+                if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
+                    found_HumanFusionLib = True
+                    filename_of_HumanFusionLib = filename
+            if found_HumanFusionLib:
+                # The mouse genomes do not have a fusion_annot_lib
+                # so only add the following for Human genomes.
+                command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
+                           "--annot_filter_rule AnnotFilterRule.pm "
             if gmap_build:
                 command += "--gmap_build "
+            # Send stderr of the command to stdout, because some functions may write to stderr,
+            # even though no error has occurred. We will depend on error code return in order
+            # to know if an error occurred.
+            command += " 2>&1"
             try: # to send the prep_genome_lib command.
                 command_output = subprocess.check_call(command, shell=True)
             except subprocess.CalledProcessError:
@@ -353,59 +376,174 @@
                 raise
             finally:
                 # Some code to help us if errors occur.
-                print "*******************************"
-                if os.path.exists(genome_build_directory):
-                    print "\nSource Directory {:s}:".format(genome_source_directory)
-                    subprocess.check_call("ls -la {:s}".format(genome_source_directory), shell=True)
-                    dir_entries = os.listdir(genome_source_directory)
-                    for entry in dir_entries:
-                        entry_path = "{:s}/{:s}".format(genome_source_directory, entry)
-                        print "\nDirectory {:s}:".format(entry_path)
-                        subprocess.check_call("ls -la {:s}".format(entry_path), shell=True)
-                else:
-                    print "Genome Source Directory does not exist:\n\t{:s}".format(genome_source_directory)
-                if os.path.exists(genome_build_directory):
-                    print "\nBuild Directory {:s}:".format(genome_build_directory)
-                    subprocess.check_call("ls -la {:s}".format(genome_build_directory), shell=True)
-                    dir_entries = os.listdir(genome_build_directory)
-                    for entry in dir_entries:
-                        entry_path = "{:s}/{:s}".format(genome_build_directory, entry)
-                        print "\nDirectory {:s}:".format(entry_path)
-                        subprocess.check_call("ls -la {:s}".format(entry_path), shell=True)
-                        if os.path.isdir(entry_path):
-                            subdir_entries = os.listdir(entry_path)
-                            for subdir_entry in subdir_entries:
-                                subdir_entry_path = "{:s}/{:s}".format(entry_path, subdir_entry)
-                                print "\nDirectory {:s}:".format(subdir_entry_path)
-                                subprocess.check_call("ls -la {:s}".format(subdir_entry_path), shell=True)
-                else:
-                    print "Genome Build Directory does not exist:\n\t{:s}".format(genome_build_directory)
-                print "*******************************"
+                print "\n*******************************"
+                print "Contents of Genome Source Directory {:s}:".format(genome_source_directory)
+                print_directory_contents(genome_source_directory, 2)
+                print "\nContents of Genome Build Directory {:s}:".format(genome_build_directory)
+                print_directory_contents(genome_build_directory, 2)
+                print "*******************************\n"
         else:
             raise ValueError("Cannot build the CTAT Genome Resource Library. " + \
                 "The source directory does not exist:\n\t{:s}".format(genome_source_directory))
     elif gmap_build:
         gmap_the_library(genome_build_directory)

+def search_for_genome_build_dir(top_dir_path):
+    """Find the CTAT Genome Resource Library build directory at or below top_dir_path.
+
+    If we do not download the directory, top_dir_path could be the location of
+    the genome resource library, but we also want to allow the user to give the
+    same value for top_dir_path that they do when a build happens, so three
+    cases are handled:
+    1) top_dir_path is the build directory,
+    2) or the build directory is inside of the given directory,
+    3) or it is inside a subdirectory of the given directory
+       (subdirectories are searched two levels down).
+    The source_data downloads are built to a directory named _CTAT_Build_dirname,
+    and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
+    When no directory with that name is found, a single directory containing
+    _CTAT_RefGenome_Filename is accepted instead, and a warning is printed.
+
+    Returns the path of the genome build directory.
+    Raises ValueError if top_dir_path does not exist or is not a directory,
+    if more than one candidate directory is found, or if none is found.
+    """
+    genome_build_directory = None
+    print_warning = False
+
+    if not os.path.exists(top_dir_path):
+        raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
+            "The given directory does not exist:\n\t{:s}".format(top_dir_path))
+    elif not os.path.isdir(top_dir_path):
+        raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
+            "The given directory is not a directory:\n\t{:s}".format(top_dir_path))
+    # Normalize before taking the last component, so that a trailing "/" on
+    # top_dir_path does not hide the fact that it is the build directory itself.
+    if os.path.basename(os.path.normpath(top_dir_path)) == _CTAT_Build_dirname:
+        print "Build directory is: {:s}".format(top_dir_path)
+        # The top_dir_path is the path to the genome_build_directory.
+        genome_build_directory = top_dir_path
+    else:
+        # Look for it inside of the top_dir_path directory.
+        print "Looking inside of: {:s}".format(top_dir_path)
+        top_dir_contents = os.listdir(top_dir_path)
+        if (_CTAT_Build_dirname in top_dir_contents):
+            # The genome_build_directory is inside of the top_dir_path directory.
+            print "1. Found it."
+            genome_build_directory = "{:s}/{:s}".format(top_dir_path,_CTAT_Build_dirname)
+        else:
+            # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
+            # Look down the directory tree two levels.
+            build_dirs_in_subdirs = list()
+            subdirs_with_genome_files = list()
+            build_dirs_in_sub_subdirs = list()
+            sub_subdirs_with_genome_files = list()
+            subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_path,entry)))]
+            for subdir in subdirs:
+                subdir_path = "{:s}/{:s}".format(top_dir_path, subdir)
+                subdir_path_contents = os.listdir(subdir_path)
+                # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
+                if (_CTAT_Build_dirname in subdir_path_contents):
+                    # The genome_build_directory is inside of the subdir_path directory.
+                    print "2a, Found one."
+                    build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
+                if (_CTAT_RefGenome_Filename in subdir_path_contents):
+                    subdirs_with_genome_files.append(subdir_path)
+                # Since we are already looping, loop through all dirs one level deeper as well.
+                sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
+                for sub_subdir in sub_subdirs:
+                    sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
+                    sub_subdir_path_contents = os.listdir(sub_subdir_path)
+                    # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
+                    if (_CTAT_Build_dirname in sub_subdir_path_contents):
+                        # The genome_build_directory is inside of the sub_subdir_path directory.
+                        print "3a. Found one."
+                        build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
+                    if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
+                        sub_subdirs_with_genome_files.append(sub_subdir_path)
+            # Hopefully there is one and only one found build directory.
+            # If none are found we check for a directory containing the genome reference file,
+            # but the build process sometimes causes more than one directory to have a copy,
+            # so finding that file is not a sure thing.
+            if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
+                print "\n***************************************"
+                print "Found multiple CTAT Genome Resource Libraries " + \
+                    "in the given directory:\n\t{:s}".format(top_dir_path)
+                print_directory_contents(top_dir_path, 2)
+                print "***************************************\n"
+                raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
+                    "in the given directory:\n\t{:s}".format(top_dir_path))
+            elif len(build_dirs_in_subdirs) == 1:
+                # The genome_build_directory is inside of a subdirectory.
+                print "2b, Found it."
+                genome_build_directory = build_dirs_in_subdirs[0]
+            elif len(build_dirs_in_sub_subdirs) == 1:
+                # The genome_build_directory is inside of a sub-subdirectory.
+                print "3b, Found it."
+                genome_build_directory = build_dirs_in_sub_subdirs[0]
+            elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
+                print "\n***************************************"
+                print "Unable to find CTAT Genome Resource Library " + \
+                      "in the given directory:\n\t{:s}".format(top_dir_path)
+                print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
+                print_directory_contents(top_dir_path, 2)
+                print "***************************************\n"
+                raise ValueError("Unable to find CTAT Genome Resource Library " + \
+                    "in the given directory:\n\t{:s}".format(top_dir_path))
+            elif (len(sub_subdirs_with_genome_files) == 1):
+                print "3c, Maybe found it."
+                genome_build_directory = sub_subdirs_with_genome_files[0]
+                print_warning = True
+            elif (len(subdirs_with_genome_files) == 1):
+                print "2c, Maybe found it."
+                genome_build_directory = subdirs_with_genome_files[0]
+                print_warning = True
+            elif (_CTAT_RefGenome_Filename in top_dir_contents):
+                print "1c. Maybe found it."
+                genome_build_directory = top_dir_path
+                print_warning = True
+            else:
+                print "\n***************************************"
+                print "Unable to find CTAT Genome Resource Library " + \
+                      "in the given directory:\n\t{:s}".format(top_dir_path)
+                print_directory_contents(top_dir_path, 2)
+                print "***************************************\n"
+                raise ValueError("Unable to find CTAT Genome Resource Library " + \
+                    "in the given directory:\n\t{:s}".format(top_dir_path))
+        # end else
+    # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
+    if (genome_build_directory is None):
+        # Defensive check: every branch above either sets the directory or raises.
+        print "\n***************************************"
+        print "Cannot find the CTAT Genome Resource Library " + \
+            "in the given directory:\n\t{:s}".format(top_dir_path)
+        print_directory_contents(top_dir_path, 2)
+        print "***************************************\n"
+        raise ValueError("Cannot find the CTAT Genome Resource Library " + \
+            "in the given directory:\n\t{:s}".format(top_dir_path))
+    elif (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
+        # Only a warning: the directory is still returned to the caller.
+        print "\n***************************************"
+        print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \
+            " in the genome build directory:\n\t{:s}".format(genome_build_directory)
+        print_directory_contents(genome_build_directory, 2)
+        print "***************************************\n"
+    if print_warning and genome_build_directory:
+        print "\n***************************************"
+        print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
+            "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
+        print "This may not be the correct directory:\n\t{:s}".format(genome_build_directory)
+        print_directory_contents(genome_build_directory, 2)
+        print "***************************************\n"
+    return genome_build_directory
+
 def main():
     #Parse Command Line
     parser = argparse.ArgumentParser()
-    parser.add_argument('-s', '--source_url', default="", \
-        help='This is the url of a file with the data. They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.')
-    parser.add_argument('-n', '--display_name', default="", \
+    parser.add_argument('-s', '--source_url', default='', \
+        help='This is the url of a file with the data. ' + \
+            'They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.')
+    parser.add_argument('-n', '--display_name', default='', \
         help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
-    parser.add_argument('-p', '--destination_path', \
-        help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.')
     parser.add_argument('-o', '--output_filename', \
         help='Name of the output file, where the json dictionary will be written.')
     parser.add_argument('-f', '--force_download',
-        help='Forces download of the Genome Resource Library, even if previously downloaded.', action="store_true")
+        help='Forces download of the Genome Resource Library, even if previously downloaded.', action='store_true')
     parser.add_argument('-b', '--build',
         help='Forces build/rebuild the Genome Resource Library, even if previously built. ' + \
-             'Must have downloaded source_data for this to work.', action="store_true")
+             'Must have downloaded source_data for this to work.', action='store_true')
     parser.add_argument('-m', '--gmap_build',
         help='Must be selected if you want the library to be gmapped. ' + \
-             'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action="store_true")
+             'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action='store_true')
+    requiredNamed = parser.add_argument_group('required named arguments')
+    requiredNamed.add_argument('-p', '--destination_path', required=True, \
+        help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.')
     args = parser.parse_args()

     # All of the input parameters are written by default to the output file prior to
@@ -418,39 +556,40 @@

     print "The value of source_url argument is:\n\t{:s}".format(str(args.source_url))

-    # FIX - not sure the lib_was_downloaded actually serves a purpose...
+    # FIX - not sure lib_was_downloaded actually serves a purpose...
     lib_was_downloaded = False
     download_has_source_data = False
-    # If we do not download the directory, the destination_path should be the
-    # location of the genome resource library.
     downloaded_directory = None
-    # FIX - look inside of the args.destination_path to see if the build directory is inside it or is it.
     genome_build_directory = None
     # FIX - need to make sure we are handling all "possible" combinations of arguments.
     # Probably would be good if we could simplify/remove some of them.
+    # But I think the current interface is using them all.
     if (args.source_url != ""):
         downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded = \
             download_from_BroadInst(source=args.source_url, \
                                     destination=args.destination_path, \
                                     force_download=args.force_download)
     else:
-        genome_build_directory = args.destination_path
-        if not os.path.exists(genome_build_directory):
-            raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
-                "The directory does not exist:\n\t{:s}".format(genome_build_directory))
-        # else:
-        # FIX - Check if there is an actual CTAT Genome Resource Lib there.
-        #    _CTAT_BuildDir_Name
+        genome_build_directory = search_for_genome_build_dir(args.destination_path)

     print "\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory)

-    # Take out builds for testing.
     # FIX - We should leave a file indicating build success the same way we do for download success.
+    # To take out builds for testing, comment out the next four lines.
     if (download_has_source_data or args.build or args.gmap_build) :
         build_the_library(downloaded_directory, genome_build_directory, args.build, args.gmap_build)
     elif (args.gmap_build):
         gmap_the_library(genome_build_directory)

+    # The following looks to see if the library actually exists after the build,
+    # and raises an error if it cannot find the library files.
+    # The reassignment of genome_build_directory should be superfluous,
+    # unless I made a mistake in the build code.
+    # FIX - need to get the genome name from the directory name, if there was no download.
+    #genome_build_directory, genome_name_from_dirname = search_for_genome_build_dir(genome_build_directory)
+    genome_build_directory = search_for_genome_build_dir(genome_build_directory)
+
+    source_filename_root = None
     if (args.source_url != None) and (args.source_url != ""):
         # Get the name out of the source's filename.
         source_filename_root = args.source_url.split("/")[-1].split(".")[0]
@@ -458,15 +597,14 @@
     # Determine the display_name for the library.
     if (args.display_name is None) or (args.display_name == ""):
         if (source_filename_root != None) and (source_filename_root != ""):
-            # Get the name out of the source filename.
+            # Create the display_name from the source_filename_root.
             display_name = _CTAT_ResourceLib_DisplayNamePrefix + source_filename_root
         else:
             display_name = _CTAT_ResourceLib_DisplayNamePrefix + _CTAT_ResourceLib_DefaultGenome
-            print "WARNING: We do not have a genome name. Using a default name, that might not be correct."
+            print "WARNING: We do not have a genome name."
     else:
         display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
     display_name = display_name.replace(" ","_")
-    print "The Genome Name will be set to: {:s}\n".format(display_name)

     # Create a unique_id for the library.
     datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
@@ -477,7 +615,7 @@
     else:
         unique_id = _CTAT_ResourceLib_DefaultGenome + datetime_stamp

-    print "The Resource Lib's display_name will be set to: {:s}\n".format(display_name)
+    print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
     print "Its unique_id will be set to: {:s}\n".format(unique_id)
     print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
Binary file data_manager/ctat_genome_resource_libs_data_manager.tar.gz has changed