Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3

--- a/data_manager/add_ctat_resource_lib.py	Thu Oct 25 21:44:51 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.py	Fri Oct 26 11:17:43 2018 -0400
@@ -1,17 +1,28 @@
 #!/usr/bin/env python
 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/

-# Rewritten by H.E. Cicada Brokaw Dennis from code downloaded from the toolshed and
-# other example code on the web. It has however been extensively modified and augmented.
-# This now allows downloading of a user selected library
-# but only from the CTAT Genome Resource Library website.
-# Ultimately we might want to allow the user to specify any location
-# from which to download.
-# Users can create or download other libraries and use this Data Manger to add them
+# Written by H.E. Cicada Brokaw Dennis of Indiana University for the Broad Institute.
+# Initial starting point was some code downloaded from the toolshed and
+# other example code on the web.
+# That code has however been extensively modified and augmented.
+
+# This is part of Data Manager code to be used within a Galaxy.
+# This Data Manager allows users to add entries to the ctat_genome_resource_libs table.
+
+# This code allows downloading of a user selected Genome Reference Library
+# from the CTAT Genome Resource Library website.
+# It also provides for building libraries from source, doing a gmap_build over,
+# and/or integrating mutation resources with, a Genome Reference Library.
+# For more information on CTAT Genome Resource Libraries,
+# see https://github.com/FusionFilter/FusionFilter/wiki
+# Users can create or download their own libraries and use this Data Manager to add them
 # if they don't want to add them by hand.

 import sys
 # The many calls to sys.stdout.flush() are done in order to get the output to be synchronized.
+# Otherwise output from subprocesses can get streamed to stdout in a disjunct manner from
+# the output of the process running this code.
+# This is particularly evident in the stdout stream when running within a Galaxy instance.
 import argparse
 import os
 import shutil
@@ -22,11 +33,14 @@
 import contextlib
 import subprocess

-# Comment out the following line when testing without galaxy package.
+# One can comment out the following line when testing without galaxy package.
+# In that case, also comment out the last line in main(). That is, the line that uses to_json_string.
 from galaxy.util.json import to_json_string
-# The following is not being used, but leaving as info
-# in case we ever want to get input values using json.
+
+# The following is not being used, but leaving here as info
+# in case one ever wants to get input values using json.
 # from galaxy.util.json import from_json_string
+# However in this datamanager, the command line arguments are used instead.

 # datetime.now() is used to create the unique_id
 from datetime import datetime
@@ -54,12 +68,13 @@
 _NumBytesNeededForSourceDataExtraction = 10737418240 # 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB.
 _NumBytesNeededForPlugNPlayExtraction = 48318382080 # 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB.
 # Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB.
-# Fix - check amount with gmap.
+# Fix - double check what amount is needed when the library is gmap'ed.
 _NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct.
 _NumBytesNeededForMutationResources = 4294967296 # 4 Gigabytes. Actually need about 3.8GB.
 # Once built the downloaded archive could be deleted to reduce the amount used, but with the archive
 # there and the Cosmic files and the built ctat_mutation_library, 3.8GB is needed.
 # If the archive files are deleted after the integration of the library, only 1.8GB would be used at that point.
+# This program does not currently provide a method for deleting the mutation resource archive files.
 _Write_TestFile = 'write_testfile.txt'
 _DownloadSuccessFile = 'download_succeeded.txt'
 _ExtractionSuccessFile = 'extraction_succeeded.txt'
@@ -172,10 +187,10 @@
     return options

 def get_mutation_resource_urls():
-    # FIX - Rather than letting user choose mutation resource url,
-    # download the correct one for the chosen library?
+    # FIX - Rather than letting the user choose a mutation resource url,
+    # should we perhaps download the correct one for the chosen library?
     # Not sure about this.
-    # In that case don't provide a pull down interface for this.
+    # In that case we wouldn't provide a pull down interface that would call this.
     # FIX -
     build_default_list = False
     resource = urllib2.urlopen(_CTAT_Mutation_URL)
@@ -309,7 +324,7 @@
     # GRCh37_v19_CTAT_lib_Feb092018
     # GRCh38_v27_CTAT_lib_Feb092018
     # Mouse_M16_CTAT_lib_Feb202018
-    # Raises a ValueError if there is no genome name in the given path.
+    # When raise_error is True, a ValueError will be raised if there is no genome name in the given path.
     genome_name = None
     if (path is not None) and (path != ""):
         for element in path.split(os.sep):
@@ -318,14 +333,13 @@
                 or (element[0:len(_CTAT_HumanGenome_Prefix)] == _CTAT_HumanGenome_Prefix):
                 # Remove any extension that might be in the filename.
                 genome_name = element.split(".")[0]
-    if (genome_name is None or (genome_name == "")) and raise_error:
+    if ((genome_name is None) or (genome_name == "")) and raise_error:
         raise ValueError("Cannnot find genome name in the given filename path:\n\t".format(path))
     return genome_name

 def bytes_needed_to_extract(archive_filepath):
     # FIX -- The following should be replaced by a series of statements that return the right value for each archive.
     # The numbers used now estimates for the human genome, and so are big enough for the mouse genome, so ok for now.
-    # But now we are also using this for the mutation resource files, so really need to FIX this.
     # FIX --
     bytes_needed = _NumBytesNeededForPlugNPlayExtraction
     if (ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA):
@@ -702,17 +716,23 @@
     return extracted_directory

 def get_gmap_success_filename(genome_build_directory):
+    # This function was created because there are two places where the success_filename was being created.
+    # Using this function makes sure that the names being used are the same.
+    # FIX - We could use a static string like "gmap_build" as the first part of the name,
+    #     rather than the genome name, and maybe that would be more logical.
+    #     The name in that case would not be different in different libraries.
+    #     Leaving for now because I don't want to do another round of testing.
     genome_name = find_genome_name_in_path(genome_build_directory)
     if genome_name is None:
         genome_name = os.path.basename(genome_build_directory)
     return "{:s}.{:s}".format(genome_name, _GmapSuccessFile)

 def gmap_the_library(genome_build_directory, force_new_gmap=False):
-    # This is the processing that needs to happen for gmap-fusion to work.
+    # This is the processing that needs to happen for the ctat_gmap_fusion tool to work.
     # genome_build_directory should normally be a fully specified path,
     # though this function should work even if it is relative.
-    # The command prints messages out to stderr, even when there is not an error,
-    # so route stderr to stdout. Otherwise, galaxy thinks an error occurred.
+    # The gmap_build command prints messages out to stderr, even when there is not an error,
+    # so I route stderr to stdout.

     # Create the name of the file used to indicate prior success of gmap.
     gmap_success_filename = get_gmap_success_filename(genome_build_directory)
@@ -766,6 +786,8 @@
         It can be relative to the current working directory or an absolute path.
         build specifies whether to run prep_genome_lib.pl even if it was run before.
 		gmap_build specifies whether to run gmap_build or not.
+        The prep_genome_lib.pl command can send messages out to stderr, even when there is not an error,
+        so I route stderr to stdout.

 		Following was the old way to do it. Before FusionFilter 0.5.0.
 		prep_genome_lib.pl \
@@ -925,9 +947,14 @@
     # source_url is the url of the mutation resources archive to download.
     # genome_build_dir is the location where the archive will be placed.
     # If cosmic_files_location is set, that is the location where the files are presumed to exist.
-    # If cosmic_files_location is not set, the files will assumed to exist in genome_build_directory.
+    # If cosmic_files_location is not set, the files will be assumed to exist in genome_build_directory.
     # If force_new_download is True, then even if the archive has previously been downloaded,
     # it will be downloaded again.
+    # If force_new_integration is True, the resources will be integrated again, even if there has been
+    # a previous successful integration.
+    # The ctat-mutation-lib-integration command may print messages out to stderr, even when there is not an error.
+    # FIX - However, I forgot to route stderr to stdout as I did with other commands.
+    #     I have left it this way for now because I do not want to do another round of testing.
     """
     From https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep

@@ -935,9 +962,7 @@
     download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018
     or
     download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018
-    or
-    download mc-7.tar.gz into Mouse_M16_CTAT_lib_Feb202018
-    (Need to ask about support for mouse, since there is not info about Cosmic mouse genome files in instracutions.)
+    (mouse genome is not yet supported)

     Step 2: Cosmic files download - User must perform this step prior to running this code. We check if files are present.

@@ -1077,9 +1102,9 @@
     # 2) or is it inside of the given directory,
     # 3) or is it inside a subdirectory of the given directory.
     # The source_data downloads are built to a directory named _CTAT_Build_dirname,
-    # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
-    # We also look for the genome name and return that, if we find it in the
-    # directory name of the directory holding the build directory.
+    # and the plug-n-play downloads contain a directory with a single sub-directory named _CTAT_Build_dirname.
+    # So the conventional structure has all the library files in .../GenomeName/_CTAT_Build_dirname
+
     top_dir_full_path = os.path.realpath(top_dir_path)
     genome_build_directory = None
     genome_name_from_dirname = None
@@ -1229,20 +1254,30 @@
     return genome_build_directory

 def build_directory_from_build_location(src_filename, build_location):
+    # This function is used to make sure our builds follow the convention of placing the build in a directory named
+    # _CTAT_Build_dirname, which is normally inside of a directory named for the genome name.
+    # However, if the user passes a build_location named _CTAT_Build_dirname that directory will be used,
+    # regardless of the name of the enclosing directory.
     build_directory = None
     genome_dir_name = find_genome_name_in_path(src_filename)
+    if (genome_dir_name is None) or (genome_dir_name == ""):
+        # Maybe it is in the path of the build_location.
+        genome_dir_name = find_genome_name_in_path(build_location)
     if os.path.basename(build_location) == genome_dir_name:
         build_directory = os.path.join(build_location, _CTAT_Build_dirname)
     elif os.path.basename(build_location) == _CTAT_Build_dirname:
         build_directory = build_location
+    elif genome_dir_name is None:
+        # This can be the case if the src_filename does not contain a directory named for the genome.
+        build_directory = os.path.join(build_location, _CTAT_Build_dirname)
     else:
         build_directory = os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
     return build_directory

 def main():
-    #Parse Command Line. There are three basic ways to use this tool.
-    # 1) Download and Build the CTAT Genome Resource Library from an archive.
-    # 2) Build the library from source data files that are already downloaded.
+    # Regarding the command line, there are three basic ways to use this tool:
+    # 1) Download and Build the CTAT Genome Resource Library from an archive;
+    # 2) Build the library from source data files that are already downloaded;
     # 3) Specify the location of an already built library.
     # Any of these methods can incorporate or be followed by a gmap build.
     # Any of these methods can be followed by a mutation resources download and/or integration.
@@ -1311,6 +1346,7 @@
         help='The archive will not be deleted after it is extracted.',
         action='store_true')
     # Method 2) arguments - Specify source and build locations.
+    # - One can optionally utilize --build_location argument with this group of arguments.
     specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments')
     specify_source_and_build_args.add_argument('-s', '--source_location',
         default='', \
@@ -1333,7 +1369,7 @@

     args = parser.parse_args()

-    # All of the input parameters are written by default to the output file prior to
+    # Apparently, Galaxy writes all of the input parameters to the output file prior to
     # this program being called.
     # But I do not get input values from the json file, but rather from command line.
     # Just leaving the following code as a comment, in case it might be useful to someone later.
@@ -1349,7 +1385,9 @@
     download_location_is_set = (args.download_location is not None) and (args.download_location != "")
     source_location_is_set = (args.source_location is not None) and (args.source_location != "")
     build_location_is_set = (args.build_location is not None) and (args.build_location != "")
-
+    mutation_url_is_set = (args.download_mutation_resources_url is not None) \
+                               and (args.download_mutation_resources_url != "")
+
     if download_url_is_set:
         print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url))
         sys.stdout.flush()
@@ -1437,7 +1475,7 @@
     # That is why the gmap_build values are sent to build_the_library(), but if we are not building the
     # library, the user might still be asking for a gmap_build. That is done after rechecking for the
     # genome_build_directory.
-    if (source_data_directory is not None):
+    if source_data_directory is not None:
         build_the_library(source_data_directory, \
                           genome_build_directory, \
                           args.new_library_build, \
@@ -1459,7 +1497,7 @@
         gmap_the_library(genome_build_directory, args.force_gmap_build)
         sys.stdout.flush()

-    if (args.download_mutation_resources_url != ""):
+    if mutation_url_is_set:
         download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \
                                   genome_build_directory=genome_build_directory, \
                                   cosmic_resources_location=args.cosmic_resources_location, \
@@ -1511,7 +1549,7 @@
     sys.stdout.flush()
     # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
     # which then puts it into the correct .loc file (I think).
-    # Comment out the following line when testing without galaxy package.
+    # One can comment out the following line when testing without galaxy package.
     open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))

 if __name__ == "__main__":
--- a/data_manager/add_ctat_resource_lib.xml	Thu Oct 25 21:44:51 2018 -0400
+++ b/data_manager/add_ctat_resource_lib.xml	Fri Oct 26 11:17:43 2018 -0400
@@ -1,6 +1,8 @@
 <tool id="ctat_genome_resource_libs_data_manager"
     name="CTAT Genome Resource Libraries Data Manager"
     version="2.0.0" tool_type="manage_data">
+    <!-- This Data Manager tool was written by Cicada Dennis of Indiana University for the Broad Institute.
+    -->
     <description>Retrieve, and/or specify the location of, a CTAT Genome Resource Library.
     </description>
     <requirements>
@@ -60,16 +62,6 @@
         ]]>
     </command>
     <inputs>
-        <!-- The following are left in here, just as examples of various ways of doing options.
-            <param name="force_download" type="boolean" checked="false"
-                truevalue="- -force_download" falsevalue="" label="Force New Download? (yes/no)" />
-            <param name="download" type="select" label="Need to Download?">
-                <option value="single" selected="true">Single Dataset</option>
-                <option value="paired_collection">Paired Collection</option>
-            <when value="paired_collection">
-                 <param name="fastq_input" format="fastqsanger" type="data_collection" collection_type="paired" label="Select dataset pair" help="Specify paired dataset collection containing paired reads"/>
-            </when>
-        -->
         <conditional name="genome_resource_library">
             <param name="build_type" type="select" label="Download CTAT Genome Resource Library?">
                 <option value="download_and_build" selected="true">Download from CTAT and build if needed</option>
@@ -85,7 +77,7 @@
                      this data_manager.
                 This is the dynamic way to get the options filled.
                 <param name="filename" type="select" label="Select File" display="radio"
-                    dynamic_options="get_ctat_genome_filenames()"
+                    dynamic_options="get_ctat_genome_urls()"
                     help="Select a CTAT Genome Resource Library to Download." />
                 Here is the static method for what is online in April 2017:
                 <param name="filename" type="select" label="Choose which library to download.">
@@ -157,7 +149,6 @@
                 <param name="cosmic_files_location" type="text" label="Location of the COSMIC files (See Tool Notes)." />
             </when>
         </conditional>
-        -->
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json" />
@@ -180,7 +171,7 @@
         All options allow the user to do a gmap_build on the library
         and also to integrate ctat-mutation resources into the library.

-        You will need approximately 62GB of space for this library, once it is built,
+        You will need approximately 62GB of space for a human genome resource library, once it is built,
         but if downloading and building, to be safe provide at least 75GB.

         The installation of this tool takes some time, due to building a conda environment for the dependencies.
@@ -195,8 +186,8 @@
         Neither the "source_data" nor the "plug-n-play" versions have had their gmap index built. If you are not going to be
         using gmap_fusion, then you can uncheck the gmap_build check box and save the space and time building the index consumes.

-        Neither the "source_data" nor the "plug-n-play" versions have mutation resources included. Those must be downloaded
-        separately. By default the Mutation Resources are not integrated into the Library. If you are going to be using the
+        Neither the "source_data" nor the "plug-n-play" versions have Mutation Resources included.
+        Those must be downloaded separately and integrated into the Library. If you are going to be using the
         ctat_mutations tool, check the Download Mutation Library check box.

         In order to integrate the Mutation Resources into a CTAT Genome Resource Library, you must have previously downloaded
@@ -210,7 +201,7 @@
         If the Reference Genome Display Name is left empty a name will be created,
         but any text that will best guide the user can be entered here.
         It will be the text that is used for selecting the library in pull down lists
-        requiring a Genome Reference Library resource (ctat_genome_resource_libs).
+        requiring a Genome Reference Library resource (These are stored in the ctat_genome_resource_libs table).

         For more information on CTAT Genome Resource Libraries,
         see https://github.com/FusionFilter/FusionFilter/wiki