changeset 8:b2e6ed40840a draft

Uploaded
author trinity_ctat
date Sat, 23 Jun 2018 15:40:54 -0400
parents f22a13378750
children 1717c42112ed
files data_manager/add_ctat_resource_lib.py data_manager/add_ctat_resource_lib.xml
diffstat 2 files changed, 215 insertions(+), 35 deletions(-)
--- a/data_manager/add_ctat_resource_lib.py	Fri May 11 16:06:47 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.py	Sat Jun 23 15:40:54 2018 -0400
@@ -35,6 +35,7 @@
 from HTMLParser import HTMLParser
 
 _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
+_CTAT_MutationIndex_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'
 _CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
 _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
 _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
@@ -42,9 +43,12 @@
 _CTAT_RefGenome_Filename = 'ref_genome.fa'
 _CTAT_MouseGenome_Prefix = 'Mouse'
 _CTAT_HumanGenome_Prefix = 'GRCh'
-_NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct.
+_NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct.
+_NumBytesNeededForIndexes = 21474836480 # 20 Gigabytes. FIX - This might not be correct.
 _Download_TestFile = "write_testfile.txt"
 _DownloadSuccessFile = 'download_succeeded.txt'
+_LibBuiltSuccessFile = 'build_succeeded.txt'
+_MutationDownloadSuccessFile = 'mutation_index_download_succeeded.txt'
 
 class FileListParser(HTMLParser):
     def __init__(self):
@@ -81,7 +85,7 @@
         # The urls should look like: 
         # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
         # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
-        # But is actuality, they are coming in looking like:
+        # But in actuality, they are coming in looking like:
         # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
         # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
         # Write code to handle both situations, or an ftp: url.
@@ -91,11 +95,44 @@
             # Assume the path is relative to the page location.
             full_url_path = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, url)
         filename = url.split("/")[-1]
+        # if filename.split("_")[0] != _CTAT_MouseGenome_Prefix:
+        #     # Don't put in the mouse genome options for now.
+        #     # The mouse genome option is not handled correctly yet
+        #     options.append((filename, full_url_path, i == 0))
+        # Mouse genomes should work now (we hope) - FIX - still not tested.
+        options.append((filename, full_url_path, i == 0))
+    options.sort() # So the list will be in alphabetical order.
+    # Return the list of option tuples (display_name, value, selected).
+    print "The list being returned as options is:"
+    print "{:s}\n".format(str(options))
+    return options
 
-        if filename.split("_")[0] != _CTAT_MouseGenome_Prefix:
-            # Take out the mouse genome options for now.
-            # The mouse genome option is not handled correctly yet
-            options.append((filename, full_url_path, i == 0))
+def get_mutation_index_urls():
+    # open the url and retrieve the urls of the files in the directory.
+    resource = urllib2.urlopen(_CTAT_MutationIndex_URL)
+    theHTML = resource.read()
+    filelist_parser = FileListParser()
+    filelist_parser.feed(theHTML)
+    # For dynamic options, we need to return an iterable whose contents are tuples with 3 items.
+    # Item one is a string that is the display name put into the option list.
+    # Item two is the value that is put into the parameter associated with the option list.
+    # Item three is a True or False value, indicating whether the item is selected.
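+    # For example, a single option tuple might look like this (a hypothetical entry;
+    # the actual filenames come from the server's directory listing):
+    #     ("hg19.tar.gz",
+    #      "https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz",
+    #      True)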
+    options = []
+    for i, url in enumerate(filelist_parser.urls):
+        # The urls should look like: 
+        # https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz
+        # https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz
+        # But in actuality, they are coming in looking like:
+        # hg19.tar.gz
+        # mc7.tar.gz
+        # Write code to handle both situations, or an ftp: url.
+        if url.split(":")[0] in ("http", "https", "ftp"):
+            full_url_path = url
+        else:
+            # Assume the path is relative to the page location.
+            full_url_path = "{:s}/{:s}".format(_CTAT_MutationIndex_URL, url)
+        filename = url.split("/")[-1]
+        options.append((filename, full_url_path, i == 0))
     options.sort() # So the list will be in alphabetical order.
     # Return the list of option tuples (display_name, value, selected).
     print "The list being returned as options is:"
@@ -114,6 +151,7 @@
 #    trained_url = params['param_dict']['trained_url']
 #    return trained_url
 
+# The following procedure is used to help with debugging and for user information.
 def print_directory_contents(dir_path, num_levels):
     if num_levels > 0:
         if os.path.exists(dir_path) and os.path.isdir(dir_path):
@@ -122,10 +160,13 @@
         else:
             print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
     if num_levels > 1:
-        for filename in os.listdir(dir_path):
-            filename_path = "{:s}/{:s}".format(dir_path, filename)
-            if os.path.exists(filename_path) and os.path.isdir(filename_path):
-                print_directory_contents(filename_path, num_levels-1)
+        if os.path.exists(dir_path) and os.path.isdir(dir_path):
+            for filename in os.listdir(dir_path):
+                filename_path = "{:s}/{:s}".format(dir_path, filename)
+                if os.path.exists(filename_path) and os.path.isdir(filename_path):
+                    print_directory_contents(filename_path, num_levels-1)
+        else:
+            print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path)
 
 def download_from_BroadInst(source, destination, force_download):
     # Input Parameters
@@ -154,9 +195,10 @@
     #     Since it doesn't always do the download, the function returns whether download occurred.
     lib_was_downloaded = False
     if len(source.split(":")) == 1:
-        # Might want to check that it is one of "http", "ftp", "file" or other accepted url starts.
+        # Then we were given a source_url without a leading https: or similar.
         # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
         source = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, source)
+    # else we might want to check that it is one of "http", "ftp", "file" or other accepted url starts.
     
     print "In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source))
 
@@ -207,6 +249,10 @@
     # We use it to check for a previous download or extraction among other things.
     orig_files_in_destdir = set(os.listdir(cannonical_destination))
     # See whether the file has been downloaded already.
+    # FIX - Try looking one or two directories above, as well as current directory,
+    #     and maybe one directory below,
+    #     for the download success file? 
+    #     Not sure about this though...
     download_success_file = "{:s}.{:s}".format(root_genome_dirname, _DownloadSuccessFile)
     download_success_file_path = "{:s}/{:s}".format(cannonical_destination, download_success_file)
     if ((download_success_file not in orig_files_in_destdir) \
@@ -329,6 +375,74 @@
             print_directory_contents(genome_build_directory, 2)
             print "*******************************\n"
 
+def download_mutation_indexes(source_url, genome_build_directory, force_download):
+    print "\n*****************************************************************"
+    print "* The real mutation indexes have not yet been created. Just testing. *"
+    print "*****************************************************************\n"
+    # It is assumed that this procedure is only called with a valid genome_build_directory.
+    # No checks are made to see whether it exists, whether we can write to it, etc.
+    index_was_downloaded = False
+    if len(source_url.split(":")) == 1:
+        # Then we were given a source_url without a leading https: or similar.
+        # Assume we only were given the filename and that it exists at _CTAT_MutationIndex_URL.
+        source_url = "{:s}/{:s}".format(_CTAT_MutationIndex_URL, source_url)
+    
+    print "In download_mutation_indexes(). The source_url is:\n\t{:s}".format(str(source_url))
+
+    # Get the root filename of the mutation index archive.
+    src_filename = source_url.split("/")[-1]
+    root_genome_dirname = src_filename.split(".")[0]
+    print "The mutation index file to be downloaded and extracted is {:s}".format(src_filename)
+
+    # Get the list of files in the directory,
+    # We use it to check for a previous download or extraction among other things.
+    orig_files_in_destdir = set(os.listdir(genome_build_directory))
+    # See whether the index file has been downloaded already.
+    download_success_file = "{:s}.{:s}".format(root_genome_dirname, _MutationDownloadSuccessFile)
+    download_success_file_path = "{:s}/{:s}".format(genome_build_directory, download_success_file)
+    if ((download_success_file not in orig_files_in_destdir) or force_download):
+        # Check whether there is enough space on the device for the library.
+        statvfs = os.statvfs(genome_build_directory)
+        # fs_size = statvfs.f_frsize * statvfs.f_blocks          # Size of filesystem in bytes
+        # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree    # Actual number of free bytes
+        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail    # Number of free bytes that ordinary users
+                                                                 # are allowed to use (excl. reserved space)
+        if (num_avail_bytes < _NumBytesNeededForIndexes):
+            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
+                          " for the indexes on the device of the destination directory: " + \
+                          "{:s}".format(genome_build_directory))
+        if (download_success_file in orig_files_in_destdir):
+            # Since we are redoing the download, 
+            # the success file needs to be removed
+            # until the download has succeeded.
+            os.remove(download_success_file_path)
+        # We want to transfer and untar the file without storing the tar file, because storing it
+        # would require that much more free space on the disk.
+        # Use subprocess to pipe the output of curl into tar.
+        command = "curl --silent {:s} | tar -xzf - -C {:s}".format(source_url, genome_build_directory)
+        try: # to send the command that downloads and extracts the file.
+            command_output = subprocess.check_output(command, shell=True)
+            # FIX - not sure check_output is what we want to use. If we want to have an error raised on
+            # any problem, maybe we should not be checking output.
+        except subprocess.CalledProcessError:
+            print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
+            raise
+        else:
+            index_was_downloaded = True
+    # Some code to help us if errors occur.
+    print "/n*********************************************************"
+    print "* Finished download and extraction of Mutation Indexes. *"
+    print_directory_contents(genome_build_directory, 2)
+    print "*********************************************************\n"
+    try:
+        # Create a file to indicate that the download succeeded.
+        subprocess.check_call("touch {:s}".format(download_success_file_path), shell=True)
+    except (IOError, subprocess.CalledProcessError):
+        print "The download_success file could not be created: " + \
+                    "{:s}".format(download_success_file_path)
+        raise
+    return index_was_downloaded
+
 def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build):
     """ genome_source_directory is the location of the source_data needed to build the library.
             Normally it is fully specified, but could be relative.
@@ -350,14 +464,27 @@
         gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa"
     """
 
+    # Get the root filename of the Genome Directory.
+    src_filename = genome_source_directory.split("/")[-1]
+    root_genome_dirname = src_filename.split(".")[0]
     print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(genome_source_directory)
-    if (genome_source_directory != "" ) and build:
+    # See whether the library has been built already. The success file is written into the source directory.
+    files_in_sourcedir = set(os.listdir(genome_source_directory))
+    build_success_file = "{:s}.{:s}".format(root_genome_dirname, _LibBuiltSuccessFile)
+    build_success_file_path = "{:s}/{:s}".format(genome_source_directory, build_success_file)
+    if (genome_source_directory != "" ) and \
+        ((build_success_file not in files_in_sourcedir) or build):
         if os.path.exists(genome_source_directory):
             os.chdir(genome_source_directory)
+            if (build_success_file in files_in_sourcedir):
+                # Since we are redoing the build, 
+                # the success file needs to be removed
+                # until the build has succeeded.
+                os.remove(build_success_file_path)
             # Create the command that builds the Genome Resource Library from the source data.
             command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
                       "--pfam_db PFAM.domtblout.dat.gz " + \
-                      "--output_dir {:s}".format(genome_build_directory)
+                      "--output_dir {:s} ".format(genome_build_directory)
             found_HumanFusionLib = False
             HumanFusionLib_filename = "NoFileFound"
             for filename in os.listdir(genome_source_directory):
@@ -398,6 +525,13 @@
                 "The source directory does not exist:\n\t{:s}".format(genome_source_directory))
     elif gmap_build:
         gmap_the_library(genome_build_directory)
+    try:
+        # Create a file to indicate that the build succeeded.
+        subprocess.check_call("touch {:s}".format(build_success_file_path), shell=True)
+    except (IOError, subprocess.CalledProcessError):
+        print "The build_success file could not be created: " + \
+                    "{:s}".format(build_success_file_path)
+        raise
 
 def search_for_genome_build_dir(top_dir_path):
     # If we do not download the directory, the topdir_path could be the
@@ -563,14 +697,20 @@
         help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
     parser.add_argument('-o', '--output_filename', \
         help='Name of the output file, where the json dictionary will be written.')
-    parser.add_argument('-f', '--force_download', 
+    parser.add_argument('-d', '--force_download', \
         help='Forces download of the Genome Resource Library, even if previously downloaded.', action='store_true')
-    parser.add_argument('-b', '--build', 
+    parser.add_argument('-b', '--build', \
         help='Forces build/rebuild the Genome Resource Library, even if previously built. ' + \
              'Must have downloaded source_data for this to work.', action='store_true')
-    parser.add_argument('-m', '--gmap_build', 
+    parser.add_argument('-g', '--gmap_build', \
         help='Must be selected if you want the library to be gmapped. ' + \
              'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action='store_true')
+    parser.add_argument('-m', '--download_mutation_indexes', default='', \
+        help='Set to the url of the mutation indexes for the Library. ' + \
+             'Will download mutation indexes into the Genome Resource Library.')
+    parser.add_argument('-f', '--force_mutation_indexes_download', \
+        help='Forces the mutation indexes to download, ' + \
+             'even if previously downloaded to this Library.', action='store_true')
     requiredNamed = parser.add_argument_group('required named arguments')
     requiredNamed.add_argument('-p', '--destination_path', required=True, \
         help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.')
@@ -587,7 +727,13 @@
     print "The value of source_url argument is:\n\t{:s}".format(str(args.source_url))
 
     # FIX - not sure lib_was_downloaded actually serves a purpose...
+    # The original intent was to check whether an attempted download actually succeeded before proceeding,
+    # but I believe that in those situations, currently, exceptions are raised.
+    # FIX - Need to double check that. Sometimes, although we are told to download, the function
+    # could find that the files are already there, successfully downloaded from a prior attempt,
+    # and does not re-download them.
     lib_was_downloaded = False
+    lib_was_built = False
     download_has_source_data = False
     downloaded_directory = None
     genome_build_directory = None
@@ -605,18 +751,31 @@
     print "\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory)
 
     # FIX - We should leave a file indicating build success the same way we do for download success.
-    # To take out builds for testing, coment out the next four lines.
-    if (download_has_source_data or args.build or args.gmap_build):
+    # To take out builds for testing, comment out the lines that do the building.
+    # The command that builds the ctat genome library also has an option for building the gmap indexes.
+    # That is why the gmap_build value is sent to build_the_library(), but if we are not building the
+    # library, the user might still be asking for a gmap_build. That is done after rechecking for the
+    # genome_build_directory.
+    if (download_has_source_data or args.build):
         build_the_library(downloaded_directory, genome_build_directory, True, args.gmap_build)
-    elif (args.gmap_build):
-        gmap_the_library(genome_build_directory)
-
+        lib_was_built = True
     # The following looks to see if the library actually exists after the build,
     # and raises an error if it cannot find the library files.
     # The reassignment of genome_build_directory should be superfluous, 
+    # since genome_build_directory should already point to the correct directory,
     # unless I made a mistake in the build code.
     genome_build_directory = search_for_genome_build_dir(genome_build_directory)
 
+    if (args.gmap_build and not lib_was_built):
+        # If we did not build the genome resource library
+        # the user might still be asking for a gmap_build.
+        gmap_the_library(genome_build_directory)
+
+    if (args.download_mutation_indexes != ""):
+        download_mutation_indexes(source_url=args.download_mutation_indexes, \
+                                  genome_build_directory=genome_build_directory, \
+                                  force_download=args.force_mutation_indexes_download)
+
     # Need to get the genome name.
     genome_name = find_genome_name_in_path(args.source_url)
     if genome_name is None:
@@ -641,7 +800,7 @@
 
     # Create a unique_id for the library.
     datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
-    unique_id = genome_name + datetime_stamp
+    unique_id = genome_name + "." + datetime_stamp
 
     print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
     print "Its unique_id will be set to: {:s}\n".format(unique_id)
--- a/data_manager/add_ctat_resource_lib.xml	Fri May 11 16:06:47 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.xml	Sat Jun 23 15:40:54 2018 -0400
@@ -6,12 +6,6 @@
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
         <requirement type="package" version="0.5.0">fusion-filter</requirement>
-        <!-- gmap-fusion used to be required in order to process downloaded libraries 
-             to create all of the required files and indexes. It includes gmap
-             and FusionFilter, programs from both of which are needed.
-             Now there is a bioconda FusionFilter recipe. Lets try using that instead.
-        <requirement type="package" version="0.3.0">gmap-fusion</requirement>
-        -->
     </requirements>
     <command detect_errors="default">
         <![CDATA[
@@ -19,9 +13,9 @@
             --display_name "${display_name}" 
             --destination_path "${destination}" 
             --output_filename "${out_file}" 
-            #if str( $download_question.download ) == "true":
-                --source_url "\"${download_question.source_url}\"" 
-                #if str( $download_question.force_download ) == "true":
+            #if str( $genome_resource_library.download ) == "true":
+                --source_url "${genome_resource_library.source_url}" 
+                #if str( $genome_resource_library.force_download ) == "true":
                     --force_download
                 #end if
             #end if
@@ -31,6 +25,12 @@
             #if str( $gmap_build ) == "true":
                 --gmap_build 
             #end if
+            #if str( $mutation_indexes.download ) == "true":
+                --download_mutation_indexes "${mutation_indexes.source_url}" 
+                #if str( $mutation_indexes.force_download ) == "true":
+                    --force_mutation_indexes_download
+                #end if
+            #end if
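+            ## As a rough sketch (hypothetical values), when the mutation index download is
+            ## selected and forced, the lines above expand to command-line arguments like:
+            ##     --download_mutation_indexes "https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz"
+            ##     --force_mutation_indexes_download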
         ]]>
     </command>
     <inputs>
@@ -44,8 +44,8 @@
                  <param name="fastq_input" format="fastqsanger" type="data_collection" collection_type="paired" label="Select dataset pair" help="Specify paired dataset collection containing paired reads"/>
             </when>
         -->
-        <conditional name="download_question">
-            <param name="download" type="boolean" checked="false" label="Need to Download?">
+        <conditional name="genome_resource_library">
+            <param name="download" type="boolean" checked="false" label="Download CTAT Genome Resource Library?">
             </param>
             <when value="true">
                 <!-- The use of a code block to get dynamic options is now deprecated and discouraged.
@@ -91,6 +91,19 @@
         <param name="destination" type="text" label="Local Destination (full path)" />
         <param name="rebuild" type="boolean" checked="false" label="Force rebuild of Library?" />
         <param name="gmap_build" type="boolean" checked="true" label="Do a gmap_build on the Library?" />
+        <!-- <param name="mutation_indexes" type="boolean" checked="true" label="Download mutation indexes into the Library?" />
+        -->
+        <conditional name="mutation_indexes">
+            <param name="download" type="boolean" checked="true" label="Download mutation indexes into the Library?">
+            </param>
+            <when value="true">
+                <param name="source_url" type="select" label="Select a File"
+                    dynamic_options="get_mutation_index_urls()" 
+                    help="Select CTAT Mutation Indexes File to Download. Make sure it is the right one for your CTAT Genome Resource Library!">
+                </param>
+                <param name="force_download" type="boolean" checked="false" label="Force New Download?" />
+            </when>
+        </conditional>
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json" />
@@ -108,9 +121,17 @@
         it is about 25GB that is transferred, so plan accordingly.
         Neither the "source_data" nor the "plug-n-play" versions have had their gmap index built. If you are not going to be
         using gmap_fusion, then you can uncheck the gmap-build check box and save the space and time building the index consumes.
-        If you already have the library, specify the full path of the location where it exists and leave the download box unchecked.
+        Neither the "source_data" nor the "plug-n-play" versions have mutation indexes included. Those must be downloaded
+        separately. If you are not going to be using the mutation tool, uncheck the Download mutation indexes check box and
+        save the space and time it takes to include the mutation index files. 
+        - FIX - 
+        This version of the tool does not yet implement the download of mutation indexes.
+        - FIX -
+        If you already have a CTAT Genome Resource library installed on your system, 
+        specify the full path of the location where it exists and leave the download box unchecked.
         The Reference Genome name may be left empty if downloading. The filename will then be used as the selector text of the entry in the data table.
-        For more information on CTAT Genome Resource Libraries, see <a http="https://github.com/FusionFilter/FusionFilter/wiki">FusionFilter</a>
+        For more information on CTAT Genome Resource Libraries, 
+        see <a href="https://github.com/FusionFilter/FusionFilter/wiki">FusionFilter</a>
     </help>
     <code file="add_ctat_resource_lib.py" />
 </tool>