Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3

--- a/data_manager/add_ctat_resource_lib.py	Thu Oct 25 11:03:53 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.py	Thu Oct 25 15:35:48 2018 -0400
@@ -10,6 +10,7 @@
 # Users can create or download other libraries and use this Data Manager to add them
 # if they don't want to add them by hand.

+import sys
 import argparse
 import os
 import shutil
@@ -166,6 +167,7 @@
     # return a tuple of the urls
     print "The list being returned as options is:"
     print "{:s}\n".format(str(options))
+    sys.stdout.flush()
     return options

 def get_mutation_resource_urls():
@@ -226,6 +228,7 @@
     # return a tuple of the urls
     print "The list being returned as options is:"
     print "{:s}\n".format(str(options))
+    sys.stdout.flush()
     return options

 # The following was used by the example program to get input parameters through the json.
@@ -245,9 +248,11 @@
     if num_levels > 0:
         if os.path.exists(dir_path) and os.path.isdir(dir_path):
             print "\nDirectory {:s}:".format(dir_path)
+            sys.stdout.flush()
             subprocess.call("ls -la {:s} 2>&1".format(dir_path), shell=True)
         else:
             print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
+            sys.stdout.flush()
     if num_levels > 1:
         if os.path.exists(dir_path) and os.path.isdir(dir_path):
             for filename in os.listdir(dir_path):
@@ -256,6 +261,7 @@
                     print_directory_contents(filename_path, num_levels-1)
         else:
             print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
+            sys.stdout.flush()

 def which(file):
     # This procedure is similar to the linux "which" command.
@@ -346,6 +352,7 @@
     except IOError:
         print "The success indication file could not be created: " + \
                     "{:s}".format(full_file_path)
+        sys.stdout.flush()
         raise

 def download_file_from_url(file_url, dest_dir, resume_download=True):
@@ -369,7 +376,8 @@
     source_filesize = size_of_file_at(file_url)
     print "Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize)
     print "Destination file for the download is {:s}".format(dest_fullpath)
-
+    sys.stdout.flush()
+
     # If the file exists and resume_download is requested, then only download the remainder
     if resume_download and os.path.exists(dest_fullpath):
         existing_size = os.path.getsize(dest_fullpath)
@@ -386,11 +394,15 @@
         output_file = open(dest_fullpath,"ab")
     else:
         if os.path.exists(dest_fullpath):
-            print "resume_download is set to False. Download will overwrite an existing file."
+            print "The destination file exists:\n\t{:s}".format(dest_fullpath)
+            print "However a new download has been requested."
+            print "The download will overwrite the existing file."
         else:
             print "The destination file does not exist yet."
         existing_size = 0
         output_file = open(dest_fullpath,"wb")
+    sys.stdout.flush()
+
     try:
         # Check whether there is enough space on the device for the rest of the file to download.
         statvfs = os.statvfs(dest_dir)
@@ -415,6 +427,7 @@
         source_file.close()
     except IOError:
         print "Error while attempting to download {:s}".format(file_url)
+        sys.stdout.flush()
         raise
     finally:
         output_file.close()
@@ -424,6 +437,7 @@
     print "Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url))
     dest_filesize = os.path.getsize(dest_fullpath)
     print "{:s} {:s}".format(str(dest_filesize), str(dest_fullpath))
+    sys.stdout.flush()
     if source_filesize != dest_filesize:
         raise IOError("Download error:\n\t" + \
             "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) + \
@@ -452,6 +466,7 @@
         except os.error:
             print "ERROR: Trying to create the following directory path:"
             print "\t{:s}".format(cannonical_destination)
+            sys.stdout.flush()
             raise
     # Make sure the directory now exists and we can write to it.
     if not os.path.exists(cannonical_destination):
@@ -469,6 +484,7 @@
     except IOError:
         print "The destination directory could not be written into:\n\t" + \
               "{:s}".format(cannonical_destination)
+        sys.stdout.flush()
         raise
     # Check whether there are numbytes available on cannonical_destination's device.
     statvfs = os.statvfs(cannonical_destination)
@@ -516,6 +532,7 @@

     print "Downloading:\n\t{:s}".format(str(source_url))
     print "to:\n\t{:s}".format(destination)
+    sys.stdout.flush()
     # The next call is made so that an error is raised if the source_url does not have a genome name in it.
     find_genome_name_in_path(source_url, raise_error=True)
     cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url))
@@ -547,6 +564,7 @@
             md5sum_from_file = md5sum_for(dest_fullpath)
         except IOError:
             print "Error while attempting to check the md5sum for {:s}".format(dest_fullpath)
+            sys.stdout.flush()
             raise
         if md5sum_from_web != md5sum_from_file:
             raise IOError("Download error:\n\t" + \
@@ -565,12 +583,15 @@
         dest_fullpath = os.path.join(cannonical_destination, dest_filename)
     else:
         print "download_genome_archive(): This code should never be printed. Something is wrong."
-
+    sys.stdout.flush()
+
     # Some code to help us if errors occur.
     print "\n*******************************"
     print "*      Finished download.     *"
+    sys.stdout.flush()
     print_directory_contents(cannonical_destination, 1)
     print "*******************************\n"
+    sys.stdout.flush()

     return dest_fullpath

@@ -606,12 +627,15 @@
         print "Remove the success file or set <force new extraction> if you want a new extraction to occur."
     else:
         print "extract_archive(): This code should never be printed. Something is wrong."
+    sys.stdout.flush()

     # Some code to help us if errors occur.
     print "\n*******************************************************"
     print "* Finished extraction. Destination directory listing. *"
+    sys.stdout.flush()
     print_directory_contents(cannonical_destination, 1)
     print "*******************************************************\n"
+    sys.stdout.flush()
     return

 def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
@@ -626,6 +650,7 @@

     print "Extracting:\n\t {:s}".format(str(archive_filepath))
     print "to:\n\t{:s}".format(destination)
+    sys.stdout.flush()
     cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
     # Get the root filename of the Genome Directory from the source file's name.
     # That should also be the name of the extracted directory.
@@ -670,6 +695,7 @@
         # We are done extracting, so remove the archive file.
         if os.path.exists(archive_filepath):
             print "Removing the archive file:\n\t{:s}".format(archive_filepath)
+            sys.stdout.flush()
             os.remove(archive_filepath)
         # else: # It was removed previously, so we don't need to remove it again.
     return extracted_directory
@@ -700,12 +726,15 @@
             subprocess.check_call(command, shell=True)
         except subprocess.CalledProcessError:
             print "ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command)
+            sys.stdout.flush()
             raise
         finally:
             # Some code to help us if errors occur.
             print "\n*******************************\nAfter running gmap_build."
+            sys.stdout.flush()
             print_directory_contents(genome_build_directory, 2)
             print "*******************************\n"
+            sys.stdout.flush()
         create_success_file(gmap_success_full_file_path, \
                     "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
     elif gmap_success_filename in orig_files_in_build_dir:
@@ -714,6 +743,7 @@
         print "Remove the file or set <force new gmap> if you want a new gmap to occur."
     else:
         print "gmap_the_library(): This code should never be printed. Something is wrong."
+    sys.stdout.flush()
     return


@@ -745,7 +775,8 @@
                                                              bytes_needed_to_build(genome_source_directory))
     print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory))
     print "The Destination directory is at:\n\t{:s}".format(str(cannonical_destination))
-
+    sys.stdout.flush()
+
     # Get the root filename of the Genome Directory.
     src_filename = os.path.basename(genome_source_directory)
     # See whether the library has been built already. The success file is written into the source directory.
@@ -785,6 +816,7 @@
         # to know if an error occurred.
         command += " 2>&1"
         print "About to run the following command:\n\t{:s}".format(command)
+        sys.stdout.flush()
         try: # to send the prep_genome_lib command.
             subprocess.check_call(command, shell=True)
         except subprocess.CalledProcessError:
@@ -795,10 +827,13 @@
             # Some code to help us if errors occur.
             print "\n*******************************"
             print "Contents of Genome Source Directory {:s}:".format(genome_source_directory)
+            sys.stdout.flush()
             print_directory_contents(genome_source_directory, 2)
             print "\nContents of Genome Build Directory {:s}:".format(cannonical_destination)
+            sys.stdout.flush()
             print_directory_contents(cannonical_destination, 2)
             print "*******************************\n"
+            sys.stdout.flush()
         create_success_file(build_success_file_path, \
                             "Build of:\n\t{:s}\n".format(genome_source_directory) + \
                             "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
@@ -819,7 +854,7 @@
             # gmap_the_library creates a gmap success file if it succeeds.
     else:
         print "build_the_library(): This code should never be printed. Something is wrong."
-
+    sys.stdout.flush()
     return
 	# End of build_the_library()

@@ -914,6 +949,7 @@
     print "\n***********************************"
     print "* Integrating Mutation Resources. *"
     print "***********************************\n"
+    sys.stdout.flush()
     # It is assumed that this procedure is only called with a valid genome_build_directory.
     url_parts = urlparse.urlparse(source_url)
     source_filename = os.path.basename(url_parts.path)
@@ -926,6 +962,7 @@
     print "Download and Integrate a Mutation Resource Archive."
     print "The source URL is:\n\t{:s}".format(str(source_url))
     print "The destination is:\n\t{:s}".format(str(cannonical_destination))
+    sys.stdout.flush()
     # Get the list of files in the directory,
     # We use it to check for a previous download or extraction among other things.
     orig_files_in_destdir = set(os.listdir(cannonical_destination))
@@ -952,6 +989,7 @@
         print "Remove the file or set <new_mutation_download> if you want a new download to occur."
     else:
         print "download_and_integrate_mutation_resources() - Download: This code should never be printed. Something is wrong."
+    sys.stdout.flush()

     # INTEGRATION SECTION
     integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile)
@@ -994,13 +1032,16 @@
             subprocess.check_call(command, shell=True)
         except subprocess.CalledProcessError:
             print "ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command)
+            sys.stdout.flush()
             raise
         finally:
             # Some code to help us if errors occur.
             print "/n*********************************************************"
             print "* After download and integration of Mutation Resources. *"
+            sys.stdout.flush()
             print_directory_contents(cannonical_destination, 2)
             print "*********************************************************\n"
+            sys.stdout.flush()
         create_success_file(integration_success_file_path, \
                         "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) + \
                         "to:\n\t{:s}\nsucceeded.".format(genome_build_directory))
@@ -1010,6 +1051,7 @@
         print "Remove the file or set <new_mutation_integration> if you want a new integration to occur."
     else:
         print "download_and_integrate_mutation_resources() - Integration: This code should never be printed. Something is wrong."
+    sys.stdout.flush()
     return

 def search_for_genome_build_dir(top_dir_path):
@@ -1037,15 +1079,18 @@
             "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
     if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
         print "Build directory is: {:s}".format(top_dir_full_path)
+        sys.stdout.flush()
         # The top_dir_path is the path to the genome_build_directory.
         genome_build_directory = top_dir_full_path
     else:
         # Look for it inside of the top_dir_path directory.
         print "Looking inside of: {:s}".format(top_dir_full_path)
+        sys.stdout.flush()
         top_dir_contents = os.listdir(top_dir_full_path)
         if (_CTAT_Build_dirname in top_dir_contents):
             # The genome_build_directory is inside of the top_dir_path directory.
             print "1. Found it."
+            sys.stdout.flush()
             genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
         else:
             # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
@@ -1085,45 +1130,56 @@
                 print "\n***************************************"
                 print "Found multiple CTAT Genome Resource Libraries " + \
                     "in the given directory:\n\t{:s}".format(top_dir_full_path)
+                sys.stdout.flush()
                 print_directory_contents(top_dir_full_path, 2)
                 print "***************************************\n"
+                sys.stdout.flush()
                 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
                     "in the given directory:\n\t{:s}".format(top_dir_full_path))
             elif len(build_dirs_in_subdirs) == 1:
                 # The genome_build_directory is inside of the subdir_path directory.
                 print "2b, Found it."
+                sys.stdout.flush()
                 genome_build_directory = build_dirs_in_subdirs[0]
             elif len(build_dirs_in_sub_subdirs) == 1:
                 # The genome_build_directory is inside of the subdir_path directory.
                 print "3b, Found it."
+                sys.stdout.flush()
                 genome_build_directory = build_dirs_in_sub_subdirs[0]
             elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
                 print "\n***************************************"
                 print "Unable to find CTAT Genome Resource Library " + \
                       "in the given directory:\n\t{:s}".format(top_dir_full_path)
                 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
+                sys.stdout.flush()
                 print_directory_contents(top_dir_full_path, 2)
                 print "***************************************\n"
+                sys.stdout.flush()
                 raise ValueError("Unable to find CTAT Genome Resource Library " + \
                     "in the given directory:\n\t{:s}".format(top_dir_full_path))
             elif (len(sub_subdirs_with_genome_files) == 1):
                 print "3c, Maybe found it."
+                sys.stdout.flush()
                 genome_build_directory = sub_subdirs_with_genome_files[0]
                 print_warning = True
             elif (len(subdirs_with_genome_files) == 1):
                 print "2c, Maybe found it."
+                sys.stdout.flush()
                 genome_build_directory = subdirs_with_genome_files[0]
                 print_warning = True
             elif (_CTAT_RefGenome_Filename in top_dir_contents):
                 print "1c. Maybe found it."
+                sys.stdout.flush()
                 genome_build_directory = top_dir_full_path
                 print_warning = True
             else:
                 print "\n***************************************"
                 print "Unable to find CTAT Genome Resource Library " + \
                       "in the given directory:\n\t{:s}".format(top_dir_full_path)
+                sys.stdout.flush()
                 print_directory_contents(top_dir_full_path, 2)
                 print "***************************************\n"
+                sys.stdout.flush()
                 raise ValueError("Unable to find CTAT Genome Resource Library " + \
                     "in the given directory:\n\t{:s}".format(top_dir_full_path))
         # end else
@@ -1132,8 +1188,10 @@
         print "\n***************************************"
         print "Cannot find the CTAT Genome Resource Library " + \
             "in the given directory:\n\t{:s}".format(top_dir_full_path)
+        sys.stdout.flush()
         print_directory_contents(top_dir_full_path, 2)
         print "***************************************\n"
+        sys.stdout.flush()
         raise ValueError("Cannot find the CTAT Genome Resource Library " + \
             "in the given directory:\n\t{:s}".format(top_dir_full_path))
     else:
@@ -1141,15 +1199,19 @@
             print "\n***************************************"
             print "\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) + \
                 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
+            sys.stdout.flush()
             print_directory_contents(genome_build_directory, 2)
             print "***************************************\n"
+            sys.stdout.flush()
         if print_warning and genome_build_directory:
             print "\n***************************************"
             print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
                 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
             print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
+            sys.stdout.flush()
             print_directory_contents(genome_build_directory, 2)
             print "***************************************\n"
+            sys.stdout.flush()
     return genome_build_directory

 def build_directory_from_build_location(src_filename, build_location):
@@ -1266,7 +1328,8 @@
     # os.mkdir(target_directory)

     print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url))
-
+    sys.stdout.flush()
+
     lib_was_built = False
     extracted_directory = None
     source_data_directory = None
@@ -1289,10 +1352,11 @@
                              destination=args.download_location, \
                              force_new_download=args.new_archive_download)
         print "\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path))
-
+        sys.stdout.flush()

         if ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_SOURCE_DATA:
             print "It is source data."
+            sys.stdout.flush()
             # If it is source_data, extract to download_location (the directory where the download was placed).
             extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
                                                       destination=args.download_location, \
@@ -1309,6 +1373,7 @@

         elif ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_PLUG_N_PLAY:
             print "It is plug-n-play data."
+            sys.stdout.flush()
             if build_location_is_set:
                 # Extract to the build location. The library is already built.
                 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
@@ -1338,7 +1403,8 @@
     elif source_location_is_set:
             # Then the user wants to build the directory from the source data.
             source_data_directory = os.path.realpath(args.source_location)
-            print "\nThe user is saying the source data is in:\n\t{:s}.\n".format(str(source_data_directory))
+            print "\nThe program is being told that the source data is in:\n\t{:s}.\n".format(str(source_data_directory))
+            sys.stdout.flush()
             if build_location_is_set:
                 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
             else:
@@ -1354,6 +1420,7 @@

     print "\nThe location where the CTAT Genome Resource Library exists " + \
         "or will be built is {:s}.\n".format(str(genome_build_directory))
+    sys.stdout.flush()

     # To take out builds for testing, comment out the lines that do the building.
     # The command that builds the ctat genome library also has an option for building the gmap indexes.
@@ -1402,6 +1469,7 @@
     if genome_name is None:
         genome_name = _CTAT_ResourceLib_DefaultGenome
         print "WARNING: We could not find a genome name in any of the directory paths."
+        sys.stdout.flush()

     # Determine the display_name for the library.
     if (args.display_name is None) or (args.display_name == ""):
@@ -1418,6 +1486,7 @@
     print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
     print "Its unique_id will be set to: {:s}\n".format(unique_id)
     print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
+    sys.stdout.flush()

     data_manager_dict = {}
     data_manager_dict['data_tables'] = {}
@@ -1427,6 +1496,7 @@

     # Temporarily the output file's dictionary is written for debugging:
     print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
+    sys.stdout.flush()
     # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
     # which then puts it into the correct .loc file (I think).
     # Comment out the following line when testing without galaxy package.