annotate data_manager/add_ctat_resource_lib.py @ 66:792cfdb8e2f1 draft default tip

Uploaded
author trinity_ctat
date Wed, 28 Nov 2018 23:16:11 -0500
parents 5e233acec659
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
62
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
3
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
4 # Written by H.E. Cicada Brokaw Dennis of Indiana University for the Broad Institute.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
5 # Initial starting point was some code downloaded from the toolshed and
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
6 # other example code on the web.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
7 # That code has however been extensively modified and augmented.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
8
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
9 # This is part of Data Manager code to be used within a Galaxy.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
10 # This Data Manager allows users to add entries to the ctat_genome_resource_libs table.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
11
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
12 # This code allows downloading of a user selected Genome Reference Library
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
13 # from the CTAT Genome Resource Library website.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
14 # It also provides for building libraries from source, doing a gmap_build over,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
15 # and/or integrating mutation resources with, a Genome Reference Library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
16 # For more information on CTAT Genome Resource Libraries,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
17 # see https://github.com/FusionFilter/FusionFilter/wiki
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
18 # Users can create or download their own libraries and use this Data Manager to add them
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
19 # if they don't want to add them by hand.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
20
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
21 import sys
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
22 # The many calls to sys.stdout.flush() are done in order to get the output to be synchronized.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
23 # Otherwise output from subprocesses can get streamed to stdout in a disjunct manner from
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
24 # the output of the process running this code.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
25 # This is particularly evident in the stdout stream when running within a Galaxy instance.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
26 import argparse
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
27 import os
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
28 import shutil
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
29 import tarfile
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
30 import hashlib
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
31 import urllib
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
32 import urlparse
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
33 import contextlib
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
34 import subprocess
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
35
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
36 # One can comment out the following line when testing without galaxy package.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
37 # In that case, also comment out the last line in main(). That is, the line that uses to_json_string.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
38 from galaxy.util.json import to_json_string
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
39
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
40 # The following is not being used, but leaving here as info
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
41 # in case one ever wants to get input values using json.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
42 # from galaxy.util.json import from_json_string
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
43 # However in this datamanager, the command line arguments are used instead.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
44
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
45 # datetime.now() is used to create the unique_id
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
46 from datetime import datetime
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
47
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
48 # The Data Manager uses a subclass of HTMLParser to look through a web page's html
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
49 # searching for the filenames within anchor tags.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
50 import urllib2
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
51 from HTMLParser import HTMLParser
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
52
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
# Web directories whose HTML listings are scanned for downloadable tar.gz archives.
_CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
_CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'
# Directory names, display-name pieces and filenames used when handling a library.
# NOTE(review): their exact use is in code outside this section - confirm there.
_CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
_CTAT_MutationLibDirname = 'ctat_mutation_lib'
_CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
_CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
_CTAT_RefGenome_Filename = 'ref_genome.fa'
_CTAT_MouseGenome_Prefix = 'Mouse'
_CTAT_HumanGenome_Prefix = 'GRCh'
_COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz'
_COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz'

# Disk space estimates, in bytes (the "Gigabytes" below are multiples of 2**30).
# FIX - The following numbers need to be checked and other numbers for gmap, etc. need to be determined.
# Values for each genome should be determined, so we can get more precise values for each genome.
_NumBytesNeededForSourceDataExtraction = 10737418240 # 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB.
_NumBytesNeededForPlugNPlayExtraction = 48318382080 # 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB.
# Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB.
# FIX - double check what amount is needed when the library is gmap'ed.
_NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct.
_NumBytesNeededForMutationResources = 4294967296 # 4 Gigabytes. Actually need about 3.8GB.
# Once built, the downloaded archive could be deleted to reduce the amount used, but with the archive
# there and the Cosmic files and the built ctat_mutation_library, 3.8GB is needed.
# If the archive files are deleted after the integration of the library, only 1.8GB would be used at that point.
# This program does not currently provide a method for deleting the mutation resource archive files.
# Filenames of small marker/test files.
# NOTE(review): presumably each *SuccessFile is written when the matching step completes - confirm.
_Write_TestFile = 'write_testfile.txt'
_DownloadSuccessFile = 'download_succeeded.txt'
_ExtractionSuccessFile = 'extraction_succeeded.txt'
_LibBuiltSuccessFile = 'build_succeeded.txt'
_GmapSuccessFile = 'gmap_succeeded.txt'
_MutationDownloadSuccessFile = 'mutation_download_succeeded.txt'
_MutationIntegrationSuccessFile = 'mutation_integration_succeeded.txt'
# Library type strings; they match the type component of the archive filenames
# (e.g. "...CTAT_lib_Feb092018.plug-n-play.tar.gz").
_LIBTYPE_SOURCE_DATA = 'source_data'
_LIBTYPE_PLUG_N_PLAY = 'plug-n-play'
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
87
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
class resumable_URL_opener(urllib.FancyURLopener):
    """URL opener used for downloads that can be resumed.

    A download that was interrupted part way through can be restarted from
    the point where it left off. When that happens the server replies with
    HTTP status 206, meaning that a partial file is being sent. That is the
    expected outcome here, so the 206 handler is overridden to do nothing.

    The class and the code using it are based on:
    http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
    """

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        """Deliberately ignore status 206 (partial content); returns None."""
        return None
# End of class resumable_URL_opener
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
99
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
class FileListParser(HTMLParser):
    """Collect the href values of tar.gz links found in an HTML page.

    The FileListParser object is used by get_ctat_genome_urls() and
    get_mutation_resource_urls(), which can be called by the Data Manager
    interface (.xml file) to get the filenames that are available online at
    broadinstitute.org. Apparently creating dynamic option lists this way is
    deprecated (it is believed to be considered a security risk), but no
    other method exists for getting the options dynamically from the web.

    After feed() has been called, the attribute `urls` is a set holding the
    href value of every anchor tag whose href contains "tar.gz" and does not
    contain "md5". These are assumed to be the downloadable archives.
    """

    def __init__(self):
        # Have to use a direct call to the super class rather than
        #     super(FileListParser, self).__init__()
        # because in Python 2 HTMLParser is an "old style" class and its
        # inheritance chain does not include object.
        HTMLParser.__init__(self)
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag when it refers to a tar.gz file.

        tag is the lower-cased tag name and attrs is a list of (name, value)
        pairs, as supplied by HTMLParser.
        """
        if tag != "a":
            # Only anchor tags carry the file references we want.
            return
        for name, value in attrs:
            # An attribute written without a value (e.g. <a href>) is reported
            # with a value of None; skip it rather than fail on the "in" test.
            if name != "href" or not value:
                continue
            # Keep archives, but not their accompanying md5 checksum files.
            if ("tar.gz" in value) and ("md5" not in value):
                self.urls.add(value)
# End of class FileListParser
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
129
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def get_ctat_genome_urls():
    """Return the select options for the CTAT Genome Resource Library archives.

    The web page at _CTAT_ResourceLib_URL is retrieved and scanned with
    FileListParser for links to tar.gz files. If the page cannot be
    retrieved, or is empty, a built-in list of known filenames is used.

    Returns:
        A list of (display_name, value, selected) tuples, as needed for
        dynamic options, sorted by display name:
          display_name - the filename shown in the option list.
          value        - the full url put into the associated parameter.
          selected     - True for exactly one entry (when the list is not
                         empty): the preferred default archive when present,
                         otherwise the first entry of the sorted list.
    """
    default_url_filename = "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz"

    # urllib2.urlopen() reports failure by raising, not by returning None,
    # so the fall back to the default list is driven by the exception.
    theHTML = None
    try:
        with contextlib.closing(urllib2.urlopen(_CTAT_ResourceLib_URL)) as resource:
            theHTML = resource.read()
    except (urllib2.URLError, IOError) as err:
        print("Unable to retrieve the file list from {:s}: {:s}".format(_CTAT_ResourceLib_URL, str(err)))

    if theHTML:
        filelist_parser = FileListParser()
        filelist_parser.feed(theHTML)
        urls_to_return = set(filelist_parser.urls)
    else:
        # These are the filenames for what was there at least until 2018/10/09.
        urls_to_return = set((
            "GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz",
            "GRCh37_v19_CTAT_lib_Feb092018.source_data.tar.gz",
            "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
            "GRCh38_v27_CTAT_lib_Feb092018.source_data.tar.gz",
            "Mouse_M16_CTAT_lib_Feb202018.plug-n-play.tar.gz",
            "Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz",
        ))

    # The urls should look like:
    # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # But in actuality, they are coming in looking like:
    # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # urljoin() handles both: a url with its own scheme (http:, ftp:, ...) is
    # kept, anything else is taken as relative to the page location.
    # Sorting puts the list in alphabetical order.
    entries = sorted(
        (os.path.basename(url), urlparse.urljoin(_CTAT_ResourceLib_URL, url))
        for url in urls_to_return)

    # Choose the default after sorting, so that the choice does not depend
    # on the arbitrary iteration order of a set.
    names = [name for name, _ in entries]
    if default_url_filename in names:
        selected_index = names.index(default_url_filename)
    else:
        selected_index = 0
    options = [(name, full_url_path, index == selected_index)
               for index, (name, full_url_path) in enumerate(entries)]

    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    sys.stdout.flush()
    return options
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
191
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def get_mutation_resource_urls():
    """Return the select options for the CTAT mutation resource archives.

    The web page at _CTAT_Mutation_URL is retrieved and scanned with
    FileListParser for links to tar.gz files. If the page cannot be
    retrieved, or is empty, a built-in list of known filenames is used.
    Only files whose name starts with "mutation_lib." are offered.

    FIX - Perhaps rather than letting the user choose a mutation resource url,
    should we download the correct one for the chosen library?
    Not sure about this. In that case we wouldn't provide a pull down
    interface that would call this.

    Returns:
        A list of (display_name, value, selected) tuples, as needed for
        dynamic options, sorted by display name:
          display_name - the filename shown in the option list.
          value        - the full url put into the associated parameter.
          selected     - True for the first entry of the list, else False.
    """
    # urllib2.urlopen() reports failure by raising, not by returning None,
    # so the fall back to the default list is driven by the exception.
    theHTML = None
    try:
        with contextlib.closing(urllib2.urlopen(_CTAT_Mutation_URL)) as resource:
            theHTML = resource.read()
    except (urllib2.URLError, IOError) as err:
        print("Unable to retrieve the file list from {:s}: {:s}".format(_CTAT_Mutation_URL, str(err)))

    if theHTML:
        filelist_parser = FileListParser()
        filelist_parser.feed(theHTML)
        urls_to_return = set(filelist_parser.urls)
    else:
        # These are the filenames for what was there at least until 2018/10/09.
        urls_to_return = set((
            "mutation_lib.hg19.tar.gz",
            "mutation_lib.hg38.tar.gz",
        ))

    # The urls should look like:
    # https://data.broadinstitute.org/Trinity/CTAT/mutation/mutation_lib.hg19.tar.gz
    # But in actuality, they are coming in looking like:
    # mutation_lib.hg19.tar.gz
    #
    # On 2018/10/06, the following tar.gz files were present:
    # mutation_lib.hg19.tar.gz
    # mutation_lib.hg38.tar.gz
    # mc-7.tar.gz
    # ctat_mutation_demo.tar.gz
    #
    # urljoin() handles both forms: a url with its own scheme (http:, ftp:,
    # ...) is kept, anything else is taken as relative to the page location.
    entries = []
    for url in urls_to_return:
        filename = os.path.basename(url)
        # As of 2018_10_09, the only ones supported have mutation_lib as the
        # first part of the name.
        if filename.split(".")[0] == "mutation_lib":
            entries.append((filename, urlparse.urljoin(_CTAT_Mutation_URL, url)))
    entries.sort()  # So the list will be in alphabetical order.

    # Mark the first entry of the final list as selected. Deciding this
    # before filtering and sorting could leave no entry selected at all.
    options = [(filename, full_url_path, index == 0)
               for index, (filename, full_url_path) in enumerate(entries)]

    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    sys.stdout.flush()
    return options
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
252
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
253 # The following was used by the example program to get input parameters through the json.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
254 # Just leaving here for reference.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
255 # We are getting all of our parameter values through command line arguments.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
256 #def get_reference_id_name(params):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
257 # genome_id = params['param_dict']['genome_id']
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
258 # genome_name = params['param_dict']['genome_name']
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
259 # return genome_id, genome_name
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
260 #
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
261 #def get_url(params):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
262 # trained_url = params['param_dict']['trained_url']
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
263 # return trained_url
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
264
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def print_directory_contents(dir_path, num_levels):
    """Print a long listing of a directory, descending into subdirectories.

    This procedure is used to help with debugging and for user information.

    Args:
        dir_path: path of the directory to list.
        num_levels: how many directory levels to print. A value of 1 lists
            only dir_path itself; 0 or less prints nothing.

    Returns:
        None. The listing is produced by running "ls -la", whose output goes
        to this process's standard output.
    """
    if num_levels <= 0:
        return
    # os.path.isdir() is False for a path that does not exist.
    if not os.path.isdir(dir_path):
        print("Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path))
        sys.stdout.flush()
        return
    print("\nDirectory {:s}:".format(dir_path))
    # Flush before starting the subprocess so that our header is written
    # ahead of the output of ls.
    sys.stdout.flush()
    # The path is passed as a separate argument, without a shell, so that
    # spaces or shell metacharacters in it cannot alter the command.
    # "--" stops a path starting with "-" from being read as an option.
    subprocess.call(["ls", "-la", "--", dir_path], stderr=subprocess.STDOUT)
    if num_levels > 1:
        for filename in os.listdir(dir_path):
            filename_path = os.path.join(dir_path, filename)
            if os.path.isdir(filename_path):
                print_directory_contents(filename_path, num_levels - 1)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
284
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def which(file):
    """Return the location of a program found in the PATH, or None.

    This procedure is similar to the linux "which" command. However this
    implementation does not check whether the program's file is executable;
    the first existing path is returned.

    Args:
        file: name of the program to look for.

    Returns:
        The path formed by joining the first matching PATH directory with
        file, or None when no PATH directory contains it.
    """
    # Fall back to the platform default search path when PATH is not set,
    # instead of failing with a KeyError.
    search_path = os.environ.get("PATH", os.defpath)
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, file)
        if os.path.exists(candidate):
            return candidate
    return None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
293
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def size_of_file_at(file_url):
    """Return the size of the file at file_url, as an integer.

    The url has to be opened in order to find out how big the file is;
    the size is read from the Content-Length header of the response.
    The connection is closed again before returning.
    """
    opener = resumable_URL_opener()
    remote_file = opener.open(file_url)
    try:
        return int(remote_file.headers['Content-Length'])
    finally:
        remote_file.close()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
301
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def md5sum_for(filename, blocksize=2**20):
    # Computes the md5 checksum of the file named filename.
    # The file is read in binary mode, blocksize bytes at a time, so the
    # whole file never needs to be held in memory at once.
    # Returns the checksum as a string of hexadecimal digits.
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        # iter() keeps calling read() until it returns the empty string,
        # which is what read() gives back at end of file.
        for chunk in iter(lambda: stream.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
314
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def ctat_library_type(filepath):
    # Extracts the library type from the name of a library archive file.
    # The type is the second dot-separated field of the file's base name,
    # for example "plug-n-play" in
    #     GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # A file whose type indicates source_data, rather than plug-n-play,
    # holds a library that has to be built after it is downloaded.
    name_fields = os.path.basename(filepath).split(".")
    return name_fields[1]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
324
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def find_genome_name_in_path(path, raise_error=False):
    # Searches the os.sep separated elements of path for a genome name.
    # An element is taken to be a genome name when it starts with either
    # _CTAT_MouseGenome_Prefix or _CTAT_HumanGenome_Prefix.
    # The form of the genome name in directory names (if present in the path) looks like:
    #     GRCh37_v19_CTAT_lib_Feb092018
    #     GRCh38_v27_CTAT_lib_Feb092018
    #     Mouse_M16_CTAT_lib_Feb202018
    # Everything from the first "." of the matching element onward is dropped,
    # which removes any extension that might be in a filename.
    # If several elements match, the last one in the path is the one returned.
    #
    # Returns the genome name, or None if there is none in the path
    # (which includes path being None or the empty string).
    # When raise_error is True, a ValueError will be raised, instead of None
    # being returned, if there is no genome name in the given path.
    genome_name = None
    if path:
        genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
        for element in path.split(os.sep):
            # print "Looking for genome name in {:s}.".format(element)
            if element.startswith(genome_prefixes):
                # Remove any extension that might be in the filename.
                genome_name = element.split(".")[0]
    if raise_error and not genome_name:
        # str() is used because path may be None when we get here.
        raise ValueError("Cannot find genome name in the given filename path:\n\t{:s}".format(str(path)))
    return genome_name
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
342
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def bytes_needed_to_extract(archive_filepath):
    # Returns the number of bytes of disk space to allow for extracting
    # the archive at archive_filepath.
    # FIX -- This should be replaced by statements that return the right value for each archive.
    # The two constants used for now are estimates for the human genome, and so
    # are big enough for the mouse genome as well, so they are ok for now.
    # FIX --
    if ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA:
        return _NumBytesNeededForSourceDataExtraction
    # Anything that is not source data is assumed to be a plug-n-play archive.
    return _NumBytesNeededForPlugNPlayExtraction
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
353
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def bytes_needed_to_build(source_data_filepath):
    # Returns the number of bytes of disk space to allow for building a library.
    # FIX - This should be replaced by statements that return the right value for each archive.
    # For now source_data_filepath is not looked at; the same constant, an estimate
    # of the largest size needed, is returned every time. It is probably not correct.
    build_estimate = _NumBytesNeededForBuild
    return build_estimate
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
358
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def create_success_file(full_file_path, contents=None):
    # Writes the file that marks some operation as having succeeded.
    # full_file_path is where the file gets written. The file is not expected
    #     to be there already, but if it is, it is overwritten.
    # contents is optional text to put into the file. When it is None the file
    #     is still created, it is just left empty.
    # If the file cannot be written, a message is printed and the IOError is re-raised.
    marker_text = "" if contents is None else contents
    try:
        with open(full_file_path, "w") as marker_file:
            marker_file.write(marker_text)
    except IOError:
        failure_note = "The success indication file could not be created: " + \
                       "{:s}".format(full_file_path)
        print(failure_note)
        sys.stdout.flush()
        raise
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
375
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def download_file_from_url(file_url, dest_dir, resume_download=True):
    # Some of the code used in this procedure was downloaded and modified for our needs.
    # That code was at: http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
    #
    # Given a file_url, downloads that file to dest_dir.
    # The url must specify a file to download, because the name of the destination file
    # is taken from the end of the url's path.
    # It is best to fully specify dest_dir. Otherwise the dest_dir will be opened relative to whatever cwd is.
    # If resume_download is True (the default), the function will attempt to resume the download where it left off,
    # if, for example, a previous download was interrupted. This is done by asking for only the
    # bytes beyond the current size of the destination file, and appending them to it.
    # If resume_download is False, any existing download of the file is overwritten by a new download.
    #
    # Returns the path of the downloaded file (dest_dir joined with the file's name).
    # Raises OSError if the device of dest_dir does not have room for the part still to be downloaded.
    # Raises IOError if the download fails, or if the destination file does not end up
    # the same size as the source file.

    # DOWNLOAD_BLOCK_SIZE = 65536 # 64KB. Old number was 8192 or 8KB.
    DOWNLOAD_BLOCK_SIZE = 1048576 # 1 MB
    download_complete = False
    existing_size = 0
    bytes_read = 0
    file_retriever = resumable_URL_opener()
    dest_filename = os.path.basename(file_url)
    dest_fullpath = os.path.join(dest_dir, dest_filename)
    source_filesize = size_of_file_at(file_url)
    print("Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize))
    print("Destination file for the download is {:s}".format(dest_fullpath))
    sys.stdout.flush()

    # If the file exists and resume_download is requested, then only download the remainder
    if resume_download and os.path.exists(dest_fullpath):
        existing_size = os.path.getsize(dest_fullpath)
        print("The destination file exists and is {:d} bytes in size.".format(existing_size))
        if source_filesize == existing_size:
            # The file exists, and we already have the whole thing, so don't download again.
            print("The file has already been completely downloaded:\n\t{:s}".format(dest_fullpath))
            download_complete = True
        else:
            header = ("Range", "bytes={:s}-".format(str(existing_size)))
            # str() is applied explicitly; a tuple cannot be relied on to accept the {:s} format.
            print("Adding header to resume download:\n\t{:s}".format(str(header)))
            file_retriever.addheader(*header)
        # We open even if download is complete, to avoid adding code to determine whether to close.
        output_file = open(dest_fullpath, "ab")
    else:
        if os.path.exists(dest_fullpath):
            print("The destination file exists:\n\t{:s}".format(dest_fullpath))
            print("However a new download has been requested.")
            print("The download will overwrite the existing file.")
        else:
            print("The destination file does not exist yet.")
        existing_size = 0
        output_file = open(dest_fullpath, "wb")
    sys.stdout.flush()

    # Set before the try so that the finally clause can tell whether the url was ever opened.
    source_file = None
    try:
        # Check whether there is enough space on the device for the rest of the file to download.
        statvfs = os.statvfs(dest_dir)
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        # num_avail_bytes is the number of free bytes that ordinary users
        # are allowed to use (excl. reserved space)
        # Perhaps should subtract some padding amount from num_avail_bytes
        # rather than raising only if there is less than exactly what is needed.
        if num_avail_bytes < (source_filesize - existing_size):
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
                          " on the device of the destination directory for the download: " + \
                          "{:s}".format(dest_dir))

        # The url is opened even when the download is already complete,
        # because the response headers are printed below.
        source_file = file_retriever.open(file_url)
        while not download_complete:
            data = source_file.read(DOWNLOAD_BLOCK_SIZE)
            if data:
                output_file.write(data)
                bytes_read = bytes_read + len(data)
            else:
                download_complete = True
    except IOError:
        print("Error while attempting to download {:s}".format(file_url))
        sys.stdout.flush()
        raise
    finally:
        # Both ends get closed whether the download succeeded or not.
        output_file.close()
        if source_file is not None:
            source_file.close()

    for k, v in source_file.headers.items():
        print("{:s} = {:s}".format(str(k), str(v)))
    print("Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url)))
    dest_filesize = os.path.getsize(dest_fullpath)
    print("{:s} {:s}".format(str(dest_filesize), str(dest_fullpath)))
    sys.stdout.flush()
    if source_filesize != dest_filesize:
        raise IOError("Download error:\n\t" + \
            "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) + \
            "and the destination file\n\t\t{:d}\t{:s}\n\t".format(dest_filesize, dest_fullpath) + \
            "are different sizes.")
    return dest_fullpath
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
465
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def ensure_we_can_write_numbytes_to(destination, numbytes):
    # Makes sure that numbytes of data can be written into the destination directory:
    #   - the destination directory is created if it does not exist yet,
    #   - a small test file is written into it, and then removed again,
    #   - the space available on the destination's device is compared to numbytes.
    # An error is raised if any of these steps cannot be done:
    #   ValueError if the destination exists but is not a directory,
    #   OSError if it cannot be created or there is too little space,
    #   IOError if the test file cannot be written.
    #
    # Returns the destination as an absolute, fully specified path
    # (destination after os.path.realpath has been applied to it).
    resolved_dir = os.path.realpath(destination)
    if not os.path.exists(resolved_dir):
        # It is not there yet, so it has to be created.
        try:
            os.makedirs(resolved_dir)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(resolved_dir))
            sys.stdout.flush()
            raise
    elif not os.path.isdir(resolved_dir):
        raise ValueError("The destination is not a directory: " + \
                         "{:s}".format(resolved_dir))
    # Whichever way we got here, the directory is supposed to exist now.
    # If it still does not, something is wrong.
    if not os.path.exists(resolved_dir):
        raise OSError("The destination directory could not be created: " + \
                      "{:s}".format(resolved_dir))
    # Try writing a file into the directory, named after the directory itself.
    probe_name = "{:s}.{:s}".format(os.path.basename(resolved_dir), _Write_TestFile)
    probe_path = os.path.join(resolved_dir, probe_name)
    try:
        with open(probe_path, "w") as probe_file:
            probe_file.write("Testing writing to this file.")
        if os.path.exists(probe_path):
            os.remove(probe_path)
    except IOError:
        print("The destination directory could not be written into:\n\t" + \
              "{:s}".format(resolved_dir))
        sys.stdout.flush()
        raise
    # Compare numbytes to what is available on the directory's device.
    # f_bavail counts the free blocks that ordinary users are allowed
    # to use (it excludes reserved space).
    fs_stats = os.statvfs(resolved_dir)
    free_bytes = fs_stats.f_frsize * fs_stats.f_bavail
    if free_bytes < numbytes:
        shortage_msg = "There is insufficient space ({:s} bytes)".format(str(free_bytes))
        shortage_msg += " on the device of the destination directory:\n\t"
        shortage_msg += "{:s}\n\t{:d} bytes are needed.".format(resolved_dir, numbytes)
        raise OSError(shortage_msg)

    return resolved_dir
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
519
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
520 def download_genome_archive(source_url, destination, force_new_download=False):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
521 # This function downloads but does not extract the archive at source_url.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
522 # This function can be called on a file whose download was interrupted, and if force_new_download
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
523 # is False, the download will proceed where it left off.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
524 # If download does not succeed, an IOError is raised.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
525 # The function checks whether there is enough space at the destination for the expanded library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
526 # It raises an OSError if not.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
527 # ValueError can also be raised by this function.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
528
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
529 # Input Parameters
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
530 # source_url is the full URL of the file we want to download.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
531 # It should look something like:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
532 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
533 # If only the filename is given, it is assumed to reside at _CTAT_ResourceLib_URL.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
534 # destination is the location (directory) where a copy of the source file will be placed.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
535 # Relative paths are expanded using the current working directory, so within Galaxy,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
536 # it is best to send in absolute fully specified path names so you know to where
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
537 # the source file is going to be copied.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
538 # force_new_download if True, will cause a new download to occur, even if the file has been downloaded previously.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
539 #
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
540 # Returns the canonical path to the file that was downloaded.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
541
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
542 dest_fullpath = None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
543 url_parts = urlparse.urlparse(source_url)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
544 source_filename = os.path.basename(url_parts.path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
545 if url_parts.scheme == "":
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
546 # Then we were given a source_url without a leading https: or similar.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
547 # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
548 source_url = urlparse.urljoin(_CTAT_ResourceLib_URL, source_url)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
549 # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
550
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
551 print "Downloading:\n\t{:s}".format(str(source_url))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
552 print "to:\n\t{:s}".format(destination)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
553 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
554 # The next is done so that if the source_url does not have a genome name in it, an error will be raised.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
555 find_genome_name_in_path(source_url, raise_error=True)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
556 cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
557
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
558 # Get the list of files in the directory,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
559 # We use it to check for a previous download.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
560 orig_files_in_destdir = set(os.listdir(cannonical_destination))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
561 # See whether the file has been downloaded already.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
562 download_success_filename = "{:s}.{:s}".format(source_filename, _DownloadSuccessFile)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
563 download_success_full_file_path = os.path.join(cannonical_destination, download_success_filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
564 if ((download_success_filename not in orig_files_in_destdir) \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
565 or force_new_download):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
566 if (download_success_filename in orig_files_in_destdir):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
567 # Since we are redoing the download,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
568 # the success file needs to be removed
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
569 # until the download has succeeded.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
570 os.remove(download_success_full_file_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
571 # The following raises an error if the download fails for some reason.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
572 dest_fullpath = download_file_from_url(source_url, cannonical_destination, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
573 resume_download=(not force_new_download))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
574 # Check the md5sum of the cannonical_destination file to ensure the data in the file is correct.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
575 file_retriever = resumable_URL_opener()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
576 md5_url = "{:s}.md5".format(source_url)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
577 print "Checking the md5sum of the downloaded file."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
578 try:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
579 md5_file = file_retriever.open(md5_url, "r")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
580 md5sum_from_web = md5_file.readlines()[0].strip().split()[0]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
581 md5_file.close()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
582 md5sum_from_file = md5sum_for(dest_fullpath)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
583 except IOError:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
584 print "Error while attempting to check the md5sum for {:s}".format(dest_fullpath)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
585 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
586 raise
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
587 if md5sum_from_web != md5sum_from_file:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
588 raise IOError("Download error:\n\t" + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
589 "The md5 sum for\n\t\t({:s})\n\t".format(dest_fullpath) + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
590 "does not match the value read from the web:\n\t\t" + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
591 "({:s} != {:s})".format(md5sum_from_file, md5sum_from_web))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
592 print "Check of md5sum succeeded."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
593 create_success_file(download_success_full_file_path, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
594 "Download of:\n\t{:s}\n".format(source_url) + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
595 "to:\n\t{:s}\nsucceeded.".format(dest_fullpath))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
596 elif download_success_filename in orig_files_in_destdir:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
597 print "The download success file exists, so no download is being attempted:"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
598 print "\t{:s}".format(download_success_full_file_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
599 print "Remove the file or set <Force New Download> if you want a new download to occur."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
600 dest_filename = os.path.basename(source_url)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
601 dest_fullpath = os.path.join(cannonical_destination, dest_filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
602 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
603 print "download_genome_archive(): This code should never be printed. Something is wrong."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
604 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
605
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
606 # Some code to help us if errors occur.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
607 print "\n*******************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
608 print "* Finished download. *"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
609 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
610 print_directory_contents(cannonical_destination, 1)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
611 print "*******************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
612 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
613
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
614 return dest_fullpath
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
615
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def extract_archive(archive_filepath, destination, force_new_extraction=False):
    """Extract a tar archive into destination unless that was already done.

    archive_filepath is the archive to unpack. It is opened with tarfile
        mode "r:*", so tarfile picks the compression itself.
    destination is the directory to extract into. It is first passed to
        ensure_we_can_write_numbytes_to() together with the size reported by
        bytes_needed_to_extract(), and the path returned by that call is
        the one actually used.
    force_new_extraction causes the extraction to occur even if the file
        indicating a previous successful extraction exists. In that case
        the success file is removed before extracting.

    Returns None.

    This procedure does not write the extraction success file, because some
    error checking is dependent on the file being extracted. The calling
    procedure can/should write the success file after doing error checking.
    """
    cannonical_destination = ensure_we_can_write_numbytes_to(
        destination, bytes_needed_to_extract(archive_filepath))

    # Create the name of the file used to indicate prior success of the file's extraction.
    extraction_success_filename = "{:s}.{:s}".format(
        os.path.basename(archive_filepath), _ExtractionSuccessFile)
    extraction_success_full_file_path = os.path.join(
        cannonical_destination, extraction_success_filename)

    orig_files_in_destination = set(os.listdir(cannonical_destination))
    previously_extracted = extraction_success_filename in orig_files_in_destination

    # There are only two real cases: skip (marker present and not forced) or
    # extract. The former third "should never be printed" branch could not
    # be reached, so it is gone.
    if previously_extracted and not force_new_extraction:
        # The archive was successfully extracted before so we do not do it again.
        print("The extraction success file exists, so no new extraction was attempted:")
        print("\t{:s}".format(extraction_success_full_file_path))
        print("Remove the success file or set <force new extraction> if you want a new extraction to occur.")
    else:
        if previously_extracted:
            # Since we are redoing the extraction, the success file needs
            # to be removed until the extraction has succeeded.
            os.remove(extraction_success_full_file_path)
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive (absolute paths, "..", links). Only use this on archives
        # from a trusted source, or inspect the members first.
        with tarfile.open(archive_filepath, mode="r:*") as archive_file:
            archive_file.extractall(path=cannonical_destination)
    sys.stdout.flush()

    # Some code to help us if errors occur.
    print("\n*******************************************************")
    print("* Finished extraction. Destination directory listing. *")
    sys.stdout.flush()
    print_directory_contents(cannonical_destination, 1)
    print("*******************************************************\n")
    sys.stdout.flush()
    return
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
658
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
    """Extract a CTAT Genome Reference Library archive file.

    archive_filepath is the archive to extract. It is best if it is an
        absolute, fully specified filepath, not a relative one.
    destination is the directory to which the archive will be extracted.
    force_new_extraction can be used to cause extraction to occur, even if
        the file was extracted before.
    keep_archive, when False, causes the archive file to be deleted once the
        extracted directory has been found.

    Returns extracted_directory
        The full path of the top level directory that is
        created by the extraction of the files from the archive.

    Raises ValueError if the extracted directory cannot be found in the
    destination directory. find_genome_name_in_path() is called with
    raise_error=True, so presumably it raises when no genome name can be
    derived from archive_filepath -- confirm against that function.
    """
    print("Extracting:\n\t {:s}".format(str(archive_filepath)))
    print("to:\n\t{:s}".format(destination))
    sys.stdout.flush()
    cannonical_destination = ensure_we_can_write_numbytes_to(
        destination, bytes_needed_to_extract(archive_filepath))
    # Get the root filename of the Genome Directory from the source file's name.
    # That should also be the name of the extracted directory.
    genome_dirname = find_genome_name_in_path(archive_filepath, raise_error=True)

    # Snapshot the directory before and after so we can tell what was created.
    orig_files_in_destination = set(os.listdir(cannonical_destination))
    extract_archive(archive_filepath, destination, force_new_extraction)
    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destination

    if genome_dirname not in newfiles_in_destdir:
        # Perhaps it has a different name than what we expect it to be.
        # It will be a sub-directory that was not in the directory
        # before we did the download and extraction.
        found_filename = None
        if len(newfiles_in_destdir) == 1:
            # newfiles_in_destdir is a set, which cannot be indexed,
            # so take its single element through an iterator.
            found_filename = next(iter(newfiles_in_destdir))
        else:
            # In most cases, there will only be one new file, but some OS's
            # might have created other files in the directory.
            # Look for the directory that was extracted. The correct file's
            # name should be a substring of the archive's file name.
            src_filename = os.path.basename(archive_filepath)
            # sorted() makes the choice deterministic when several names
            # match; the last match in sorted order wins.
            for filename in sorted(newfiles_in_destdir):
                # make sure it is a directory
                if filename in src_filename \
                        and os.path.isdir(os.path.join(cannonical_destination, filename)):
                    found_filename = filename
        if found_filename is not None:
            genome_dirname = found_filename

    extracted_directory = os.path.join(cannonical_destination, genome_dirname)
    if not os.path.exists(extracted_directory):
        raise ValueError("ERROR: Could not find the extracted directory in the destination directory:" + \
            "\n\t{:s}".format(cannonical_destination))

    # Create the name of the file used to indicate prior success of the file's extraction.
    extraction_success_filename = "{:s}.{:s}".format(
        os.path.basename(archive_filepath), _ExtractionSuccessFile)
    extraction_success_full_file_path = os.path.join(
        cannonical_destination, extraction_success_filename)
    create_success_file(extraction_success_full_file_path, \
        "Extraction of:\n\t{:s}\n".format(archive_filepath) + \
        "to:\n\t{:s}\nsucceeded.".format(extracted_directory))

    if not keep_archive:
        # We are done extracting, so remove the archive file.
        # If it is already gone it was removed previously, so there is
        # nothing to do.
        if os.path.exists(archive_filepath):
            print("Removing the archive file:\n\t{:s}".format(archive_filepath))
            sys.stdout.flush()
            os.remove(archive_filepath)
    return extracted_directory
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
720
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def get_gmap_success_filename(genome_build_directory):
    """Return the name (not the full path) of the gmap success marker file.

    The name is "<genome name>.<_GmapSuccessFile>". The genome name comes
    from find_genome_name_in_path(genome_build_directory); when that returns
    None, the basename of genome_build_directory is used instead.

    This function exists because the success filename is created in two
    places, and sharing it makes sure that both use the same name.
    """
    # FIX - We could use a static string like "gmap_build" as the first part
    #       of the name, rather than the genome name, and maybe that would be
    #       more logical. The name in that case would not be different in
    #       different libraries. Left as is to avoid another round of testing.
    name_stem = find_genome_name_in_path(genome_build_directory)
    name_stem = os.path.basename(genome_build_directory) if name_stem is None else name_stem
    return "{:s}.{:s}".format(name_stem, _GmapSuccessFile)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
732
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def gmap_the_library(genome_build_directory, force_new_gmap=False):
    """Run gmap_build over a genome library unless that was already done.

    This is the processing that needs to happen for the ctat_gmap_fusion
    tool to work.

    genome_build_directory is the library directory; it must contain
        ref_genome.fa. It should normally be a fully specified path,
        though this function should work even if it is relative.
    force_new_gmap causes gmap_build to be run even if the gmap success
        file exists. In that case the success file is removed first and
        only re-created after gmap_build succeeds.

    Returns None.

    Raises subprocess.CalledProcessError if the gmap_build command exits
    with a non-zero status.

    The gmap_build command prints messages out to stderr, even when there
    is not an error, so stderr is routed to stdout.
    """
    # Create the name of the file used to indicate prior success of gmap.
    gmap_success_filename = get_gmap_success_filename(genome_build_directory)
    gmap_success_full_file_path = os.path.join(genome_build_directory, gmap_success_filename)

    orig_files_in_build_dir = set(os.listdir(genome_build_directory))
    previously_gmapped = gmap_success_filename in orig_files_in_build_dir

    # Guard clause: nothing to do when the marker exists and no rebuild was
    # requested. The former third "should never be printed" branch could
    # not be reached, so it is gone.
    if previously_gmapped and not force_new_gmap:
        print("The gmap success file exists, so no gmap is being attempted:")
        print("\t{:s}".format(gmap_success_full_file_path))
        print("Remove the file or set <force new gmap> if you want a new gmap to occur.")
        sys.stdout.flush()
        return

    # Do the gmap.
    if previously_gmapped:
        # Since we are redoing the gmap, the success file needs to be
        # removed until the gmap has succeeded.
        os.remove(gmap_success_full_file_path)
    # NOTE(review): the directory is interpolated unquoted into a shell
    # command, so a path containing spaces or shell metacharacters will
    # not work as intended.
    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format( \
        genome_build_directory, genome_build_directory)
    print("Doing a gmap_build with the following command:\n\t{:s}\n".format(command))
    sys.stdout.flush()
    try: # to send the gmap_build command.
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        sys.stdout.flush()
        raise
    finally:
        sys.stdout.flush()
        # Some code to help us if errors occur. It sits in the finally
        # clause so the listing is printed on failure as well as success.
        print("\n*******************************\nAfter running gmap_build.")
        sys.stdout.flush()
        print_directory_contents(genome_build_directory, 2)
        print("*******************************\n")
        sys.stdout.flush()
    # Only reached when gmap_build succeeded.
    create_success_file(gmap_success_full_file_path, \
        "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    sys.stdout.flush()
    return
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
781
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
782
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
783 def build_the_library(genome_source_directory, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
784 genome_build_directory, force_new_build=False, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
785 gmap_build=False, force_gmap_build=False):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
786 """ genome_source_directory is the location of the source_data needed to build the library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
787 Normally it is fully specified, but could be relative.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
788 genome_build_directory is the location where the library will be built.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
789 It can be relative to the current working directory or an absolute path.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
790 build specifies whether to run prep_genome_lib.pl even if it was run before.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
791 gmap_build specifies whether to run gmap_build or not.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
792 The prep_genome_lib.pl command can send messages out to stderr, even when there is not an error,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
793 so I route stderr to stdout.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
794
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
795 Following was the old way to do it. Before FusionFilter 0.5.0.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
796 prep_genome_lib.pl \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
797 --genome_fa ref_genome.fa \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
798 --gtf ref_annot.gtf \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
799 --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
800 --fusion_annot_lib fusion_lib.dat.gz
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
801 --output_dir ctat_genome_lib_build_dir
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
802 index_pfam_domain_info.pl \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
803 --pfam_domains PFAM.domtblout.dat.gz \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
804 --genome_lib_dir ctat_genome_lib_build_dir
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
805 gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
806 """
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
807
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
808 if (genome_source_directory is None) or (genome_source_directory == "" ) or not os.path.exists(genome_source_directory):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
809 raise ValueError("Cannot build the CTAT Genome Resource Library. " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
810 "The source directory does not exist:\n\t{:s}".format(str(genome_source_directory)))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
811 cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
812 bytes_needed_to_build(genome_source_directory))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
813 print "Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
814 print "The Destination directory is at:\n\t{:s}".format(str(cannonical_destination))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
815 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
816
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
817 # Get the root filename of the Genome Directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
818 src_filename = os.path.basename(genome_source_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
819 # See whether the library has been built already. The success file is written into the source directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
820 files_in_sourcedir = set(os.listdir(genome_source_directory))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
821 build_success_filename = "{:s}.{:s}".format(src_filename, _LibBuiltSuccessFile)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
822 build_success_file_path = os.path.join(genome_source_directory, build_success_filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
823 if (build_success_filename not in files_in_sourcedir) or force_new_build:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
824 os.chdir(genome_source_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
825 if (build_success_filename in files_in_sourcedir):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
826 # Since we are redoing the build,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
827 # the success file needs to be removed
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
828 # until the build has succeeded.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
829 os.remove(build_success_file_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
830 # Create the command that builds the Genome Resource Library from the source data.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
831 command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
832 "--pfam_db PFAM.domtblout.dat.gz " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
833 "--output_dir {:s} ".format(cannonical_destination)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
834 found_HumanFusionLib = False
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
835 HumanFusionLib_filename = "NoFileFound"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
836 for filename in os.listdir(genome_source_directory):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
837 # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
838 # We only check the prefix, in case other versions are used later.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
839 # I assume there is only one in the directory, but if there are more than one,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
840 # the later one, alphabetically, will be used.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
841 if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
842 found_HumanFusionLib = True
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
843 filename_of_HumanFusionLib = filename
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
844 if found_HumanFusionLib:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
845 # The mouse genomes do not have a fusion_annot_lib
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
846 # so only add the following for Human genomes.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
847 command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
848 "--annot_filter_rule AnnotFilterRule.pm "
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
849 if gmap_build:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
850 command += "--gmap_build "
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
851 # Send stderr of the command to stdout, because some functions may write to stderr,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
852 # even though no error has occurred. We will depend on error code return in order
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
853 # to know if an error occurred.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
854 command += " 2>&1"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
855 print "About to run the following command:\n\t{:s}".format(command)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
856 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
857 try: # to send the prep_genome_lib command.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
858 subprocess.check_call(command, shell=True)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
859 except subprocess.CalledProcessError:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
860 print "ERROR: While trying to run the prep_genome_lib.pl command " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
861 "on the CTAT Genome Resource Library:\n\t{:s}".format(command)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
862 raise
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
863 finally:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
864 # Some code to help us if errors occur.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
865 print "\n*******************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
866 print "Contents of Genome Source Directory {:s}:".format(genome_source_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
867 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
868 print_directory_contents(genome_source_directory, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
869 print "\nContents of Genome Build Directory {:s}:".format(cannonical_destination)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
870 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
871 print_directory_contents(cannonical_destination, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
872 print "*******************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
873 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
874 create_success_file(build_success_file_path, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
875 "Build of:\n\t{:s}\n".format(genome_source_directory) + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
876 "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
877 if gmap_build:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
878 # Create the gmap success file.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
879 gmap_success_filename = get_gmap_success_filename(cannonical_destination)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
880 gmap_success_full_file_path = os.path.join(cannonical_destination, gmap_success_filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
881 create_success_file(gmap_success_full_file_path, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
882 "gmap of:\n\t{:s}\nsucceeded.".format(cannonical_destination))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
883 elif (build_success_filename in files_in_sourcedir):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
884 print "The build success file exists, so no build is being attempted:"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
885 print "\t{:s}".format(build_success_file_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
886 print "Remove the file or set <force new build> if you want a new build to occur."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
887 # We might still need to do a gmap_build.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
888 if gmap_build:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
889 print "Checking if we need to gmap the library."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
890 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
891 gmap_the_library(cannonical_destination, force_gmap_build)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
892 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
893 # gmap_the_library creates a gmap success file if it succeeds.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
894 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
895 print "build_the_library(): This code should never be printed. Something is wrong."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
896 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
897 return
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
898 # End of build_the_library()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
899
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def find_path_to_mutation_lib_integration():
    """Return the expected path of the ctat-mutation-lib-integration.py script.

    The lookup assumes we are running inside a conda environment: the
    ctat_mutations command is located with which(), and the "share" directory
    is taken to sit beside the directory that contains that command.
    Inside "share", an entry whose name contains "ctat-mutations" is used as
    the ctat_mutations home. If several entries match, the last one returned
    by os.listdir() is used.

    The returned path is constructed only; it is not checked for existence.

    Raises:
        ValueError: if ctat_mutations cannot be found, or if no entry of the
            share directory contains "ctat-mutations" in its name.
    """
    ctat_mutations_exe = which("ctat_mutations")
    if ctat_mutations_exe in (None, ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resource processing.")
    # Two dirname() calls: strip the command name, then its containing directory.
    env_root_dir = os.path.dirname(os.path.dirname(ctat_mutations_exe))
    share_dir = os.path.join(env_root_dir, "share")
    matching_entries = [entry for entry in os.listdir(share_dir) if "ctat-mutations" in entry]
    if not matching_entries:
        raise ValueError("Unable to find the home of ctat_mutations.\n" + \
                         "It should be in the share directory:\n\t{:s}.".format(share_dir))
    # Keep the last match, in os.listdir() order.
    return os.path.join(share_dir,
                        matching_entries[-1],
                        "mutation_lib_prep",
                        "ctat-mutation-lib-integration.py")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
922
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def find_path_to_picard_home():
    """Return the value to use for PICARD_HOME.

    The ctat_mutations command (located with which()) is read line by line,
    looking for a line that contains both "export" and "PICARD_HOME=".
    The text following "PICARD_HOME=" is taken as the value, with surrounding
    whitespace and quotes removed. If several lines match, the last one wins.
    (The original author notes that the ctat_mutations shell script is
    expected to define PICARD_HOME.)

    If no non-empty value is found that way, the "share" directory beside
    the directory containing ctat_mutations is searched for an entry whose
    name contains "picard"; the last such entry in os.listdir() order is used.

    Raises:
        ValueError: if ctat_mutations cannot be found, or if PICARD_HOME can
            be determined neither from the script nor from the share directory.
    """
    picard_home = None
    path_to_ctat_mutations = which("ctat_mutations")
    if (path_to_ctat_mutations is None) or (path_to_ctat_mutations == ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resources processing.")
    # Use a context manager so the file is closed even if reading fails.
    with open(path_to_ctat_mutations, "r") as ctat_mutations_file:
        for line in ctat_mutations_file:
            if ("export" in line) and ("PICARD_HOME=" in line):
                # Take everything after "PICARD_HOME=" (not after the first "=" on the
                # line, which would truncate values that themselves contain "=").
                # Strip the trailing newline, then enclosing quotes if present,
                # then any whitespace that was inside the quotes.
                value = line.partition("PICARD_HOME=")[2]
                picard_home = value.strip().strip("\"'").strip()
    if (picard_home is None) or (picard_home == ""):
        # We didn't find it in the ctat_mutations file. Search for it.
        conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
        share_dir = os.path.join(conda_root_dir, "share")
        for filename in os.listdir(share_dir):
            if "picard" in filename:
                picard_home = os.path.join(share_dir, filename)
        if (picard_home is None) or (picard_home == ""):
            raise ValueError("Unable to find PICARD_HOME.\n" +
                             "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return picard_home
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
947
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def download_and_integrate_mutation_resources(source_url, genome_build_directory, cosmic_resources_location=None, \
                                              force_new_download=False, force_new_integration=False):
    r"""Download a mutation resources archive and integrate it into a genome library.

    Parameters:
        source_url: the url of the mutation resources archive to download.
            If it has no scheme (no leading "https:" or similar), it is assumed
            to be a filename and is joined onto _CTAT_Mutation_URL.
        genome_build_directory: the location where the archive will be placed
            and which is passed to the integration script as --genome_lib_dir.
            It is assumed that this is a valid genome build directory.
        cosmic_resources_location: directory where the two Cosmic files are
            presumed to exist. If None or "", the files are assumed to exist
            in the genome build directory.
        force_new_download: if True, the archive is downloaded again even if
            a download success file from a previous run exists.
        force_new_integration: if True, the resources are integrated again
            even if an integration success file from a previous run exists.

    Returns:
        None.

    Raises:
        IOError: if either Cosmic file is missing. Per the original author's
            note, download_file_from_url() also raises IOError if the download
            fails.
        ValueError: from find_path_to_picard_home() or
            find_path_to_mutation_lib_integration() if the tools are not found.
        subprocess.CalledProcessError: if the integration command returns a
            non-zero exit status.

    The ctat-mutation-lib-integration command may print messages to stderr
    even when there is no error, so its stderr is routed to stdout and only
    the return code is used to detect failure.

    Background, from https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep

    Step 1 (after CTAT Genome Resource Library is built)
        download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018
        or
        download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018
        (mouse genome is not yet supported)

    Step 2: Cosmic files download - User must perform this step prior to
    running this code. We check if files are present.

        Next download COSMIC resources required in this directory. Depending
        on the version of genome you need you can install either COSMIC's hg38
        or COSMIC's hg19. You will need to download 2 sets of files: COSMIC
        Mutation Data (CosmicMutantExport.tsv.gz) and COSMIC Coding Mutation
        VCF File (CosmicCodingMuts.vcf.gz). Please note, for download to
        succeed you will need to register and login to their service.

        Open question from the original author: is there a way the user can
        give their credentials through the Data Manager interface as a part of
        specifying Mutation parameters, so that the files can be downloaded
        programmatically? Or does the interface instead need to carry
        instructions for the user to download the files, with the user then
        specifying the absolute path to where those files are?

    Step 3: Mutation lib integration

        Once you have downloaded CosmicMutantExport.tsv.gz AND
        CosmicCodingMuts.vcf.gz (hg38 or hg19), proceed with mutation lib
        integration step which will integrate the mutation resource with
        CTAT_GENOME_LIB (This corresponds to "GRCh37_v19_CTAT_lib_Feb092018"
        or "GRCh38_v27_CTAT_lib_Feb092018" downloaded in Step 1). You will
        find this script in ctat-mutations repo in 'src' directory.

        #Keep Picard in PICARD_HOME environmental variable like so
        export PICARD_HOME=/path/to/picard

        #Integrate CTAT mutations lib with CTAT genome library
        python ctat-mutations/mutation_lib_prep/ctat-mutation-lib-integration.py \
               --CosmicMutantExport CosmicMutantExport.tsv.gz \
               --CosmicCodingMuts CosmicCodingMuts.vcf.gz \
               --genome_lib_dir GRCh37_v19_CTAT_lib_Feb092018/ # OR GRCh38_v27_CTAT_lib_Feb092018/

    Now you are all set to run the ctat-mutations pipeline
    """
    # NOTE: every print below takes a single parenthesized string, which behaves
    # identically under the Python 2 print statement used by the rest of this file.
    print("\n***********************************")
    print("* Integrating Mutation Resources. *")
    print("***********************************\n")
    sys.stdout.flush()
    # It is assumed that this procedure is only called with a valid genome_build_directory.
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_Mutation_URL.
        source_url = urlparse.urljoin(_CTAT_Mutation_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, _NumBytesNeededForMutationResources)
    print("Download and Integrate a Mutation Resource Archive.")
    print("The source URL is:\n\t{:s}".format(str(source_url)))
    print("The destination is:\n\t{:s}".format(str(cannonical_destination)))
    sys.stdout.flush()
    # Get the list of files in the directory,
    # We use it to check for a previous download or extraction among other things.
    # This is a snapshot taken before anything is changed by this call.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))

    # DOWNLOAD SECTION
    # See whether the archive has been downloaded already.
    download_success_file = "{:s}.{:s}".format(source_filename, _MutationDownloadSuccessFile)
    download_success_file_path = os.path.join(cannonical_destination, download_success_file)
    if ((download_success_file not in orig_files_in_destdir) or force_new_download):
        # DO THE DOWNLOAD
        if (download_success_file in orig_files_in_destdir):
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # The following raises an IOError if the download fails for some reason.
        # A partial download is only resumed when a new download is not being forced.
        download_file_from_url(source_url, cannonical_destination, resume_download=(not force_new_download))
        create_success_file(download_success_file_path, \
                            "Download of the mutation resource archive:\n\t{:s}\n".format(source_url) + \
                            "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    elif (download_success_file in orig_files_in_destdir):
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_file_path))
        print("Remove the file or set <new_mutation_download> if you want a new download to occur.")
    else:
        # Defensive only: the two conditions above cover every case.
        print("download_and_integrate_mutation_resources() - Download: This code should never be printed. Something is wrong.")
    sys.stdout.flush()

    # INTEGRATION SECTION
    integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile)
    integration_success_file_path = os.path.join(cannonical_destination, integration_success_file)
    if ((integration_success_file not in orig_files_in_destdir) or force_new_integration):
        # INTEGRATE THE LIBRARY
        if (integration_success_file in orig_files_in_destdir):
            # Since we are redoing the integration,
            # the success file needs to be removed
            # until the integration has succeeded.
            os.remove(integration_success_file_path)
        mutation_lib_dirpath = os.path.join(cannonical_destination, _CTAT_MutationLibDirname)
        # If we do not remove the directory, then the old files will exist and a new integration does not occur.
        # Also, with the Cosmic files, when the integrated file is created, if there is a previous one, gzip
        # asks a question of the user, and this program is not prepared to respond to a question from a subprocess:
        # [bgzip] /path/to/ctat_mutation_lib/cosmic.vcf.gz already exists; do you wish to overwrite (y or n)?
        if os.path.exists(mutation_lib_dirpath):
            shutil.rmtree(mutation_lib_dirpath)
        # Check for Cosmic resources. User has to place these files into the correct location.
        if (cosmic_resources_location is None) or (cosmic_resources_location == ""):
            cosmic_resources_loc_full_path = cannonical_destination
            end_err_msg = "These files must be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
        else:
            cosmic_resources_loc_full_path = os.path.realpath(cosmic_resources_location)
            end_err_msg = "This function was told they would be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
        cosmic_mutant_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Mutant_Filename)
        cosmic_coding_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Coding_Filename)
        if not (os.path.exists(cosmic_mutant_full_path) and os.path.exists(cosmic_coding_full_path)):
            raise IOError("Either one or both of Cosmic Resources are missing:\n\t" + \
                          "{:s}\nand/or\n\t{:s}\n".format(cosmic_mutant_full_path, cosmic_coding_full_path) + \
                          "Unable to integrate mutation resources.\n{:s}".format(end_err_msg))
        # Create the integration command. We also must define PICARD_HOME for the command to work.
        # NOTE(review): the paths are interpolated into a shell string unquoted, so
        # paths containing spaces or shell metacharacters will not work.
        picard_home = find_path_to_picard_home()
        integration_command = find_path_to_mutation_lib_integration()
        command = "export PICARD_HOME={:s} && python {:s} ".format(picard_home, integration_command) + \
                  "--CosmicMutantExport {:s} ".format(cosmic_mutant_full_path) + \
                  "--CosmicCodingMuts {:s} ".format(cosmic_coding_full_path) + \
                  "--genome_lib_dir {:s}".format(cannonical_destination)
        # The integration command may write to stderr even though no error has occurred.
        # Route stderr to stdout, as is done for the other commands in this file, and
        # depend on the error code return in order to know if an error occurred.
        command += " 2>&1"
        try: # to send the ctat-mutation-lib-integration command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command))
            sys.stdout.flush()
            raise
        finally:
            # Some code to help us if errors occur.
            print("\n*********************************************************")
            print("* After download and integration of Mutation Resources. *")
            sys.stdout.flush()
            print_directory_contents(cannonical_destination, 2)
            print("*********************************************************\n")
            sys.stdout.flush()
        # Only reached when the integration command succeeded.
        create_success_file(integration_success_file_path, \
                            "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) + \
                            "to:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    elif (integration_success_file in orig_files_in_destdir):
        print("The mutation resources integration success file exists, so no integration is being attempted:")
        print("\t{:s}".format(integration_success_file_path))
        print("Remove the file or set <new_mutation_integration> if you want a new integration to occur.")
    else:
        # Defensive only: the two conditions above cover every case.
        print("download_and_integrate_mutation_resources() - Integration: This code should never be printed. Something is wrong.")
    sys.stdout.flush()
    return
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1098
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1099 def search_for_genome_build_dir(top_dir_path):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1100 # If we do not download the directory, the topdir_path could be the
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1101 # location of the genome resource library, but we also want to allow the
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1102 # user to give the same value for top_dir_path that they do when a
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1103 # build happens, so we need to handle all three cases:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1104 # 1) Is the top_dir_path the build directory,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1105 # 2) or is it inside of the given directory,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1106 # 3) or is it inside a subdirectory of the given directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1107 # The source_data downloads are built to a directory named _CTAT_Build_dirname,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1108 # and the plug-n-play downloads contain a directory with a single sub-directory named _CTAT_Build_dirname.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1109 # So the conventional structure has all the library files in .../GenomeName/_CTAT_Build_dirname
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1110
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1111 top_dir_full_path = os.path.realpath(top_dir_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1112 genome_build_directory = None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1113 genome_name_from_dirname = None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1114 print_warning = False
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1115
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1116 if not os.path.exists(top_dir_full_path):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1117 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1118 "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1119 elif not os.path.isdir(top_dir_full_path):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1120 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1121 "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1122 if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1123 print "Build directory is: {:s}".format(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1124 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1125 # The top_dir_path is the path to the genome_build_directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1126 genome_build_directory = top_dir_full_path
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1127 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1128 # Look for it inside of the top_dir_path directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1129 print "Looking inside of: {:s}".format(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1130 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1131 top_dir_contents = os.listdir(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1132 if (_CTAT_Build_dirname in top_dir_contents):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1133 # The genome_build_directory is inside of the top_dir_path directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1134 print "1. Found it."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1135 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1136 genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1137 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1138 # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1139 # Look down the directory tree two levels.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1140 build_dirs_in_subdirs = list()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1141 subdirs_with_genome_files = list()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1142 build_dirs_in_sub_subdirs = list()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1143 sub_subdirs_with_genome_files = list()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1144 subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1145 for subdir in subdirs:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1146 subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1147 subdir_path_contents = os.listdir(subdir_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1148 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1149 if (_CTAT_Build_dirname in subdir_path_contents):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1150 # The genome_build_directory is inside of the subdir_path directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1151 print "2a, Found one."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1152 build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1153 if (_CTAT_RefGenome_Filename in subdir_path_contents):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1154 subdirs_with_genome_files.append(subdir_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1155 # Since we are already looping, loop through all dirs one level deeper as well.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1156 sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1157 for sub_subdir in sub_subdirs:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1158 sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1159 sub_subdir_path_contents = os.listdir(sub_subdir_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1160 # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1161 if (_CTAT_Build_dirname in sub_subdir_path_contents):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1162 # The genome_build_directory is inside of the sub_subdir_path directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1163 print "3a. Found one."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1164 build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1165 if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1166 sub_subdirs_with_genome_files.append(sub_subdir_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1167 # Hopefully there is one and only one found build directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1168 # If none are found we check for a directory containing the genome reference file,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1169 # but the build process sometimes causes more than one directory to have a copy,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1170 # so finding that file is not a sure thing.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1171 if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1172 print "\n***************************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1173 print "Found multiple CTAT Genome Resource Libraries " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1174 "in the given directory:\n\t{:s}".format(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1175 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1176 print_directory_contents(top_dir_full_path, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1177 print "***************************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1178 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1179 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1180 "in the given directory:\n\t{:s}".format(top_dir_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1181 elif len(build_dirs_in_subdirs) == 1:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1182 # The genome_build_directory is inside of the subdir_path directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1183 print "2b, Found it."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1184 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1185 genome_build_directory = build_dirs_in_subdirs[0]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1186 elif len(build_dirs_in_sub_subdirs) == 1:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1187 # The genome_build_directory is inside of the subdir_path directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1188 print "3b, Found it."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1189 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1190 genome_build_directory = build_dirs_in_sub_subdirs[0]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1191 elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1192 print "\n***************************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1193 print "Unable to find CTAT Genome Resource Library " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1194 "in the given directory:\n\t{:s}".format(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1195 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1196 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1197 print_directory_contents(top_dir_full_path, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1198 print "***************************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1199 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1200 raise ValueError("Unable to find CTAT Genome Resource Library " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1201 "in the given directory:\n\t{:s}".format(top_dir_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1202 elif (len(sub_subdirs_with_genome_files) == 1):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1203 print "3c, Maybe found it."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1204 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1205 genome_build_directory = sub_subdirs_with_genome_files[0]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1206 print_warning = True
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1207 elif (len(subdirs_with_genome_files) == 1):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1208 print "2c, Maybe found it."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1209 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1210 genome_build_directory = subdirs_with_genome_files[0]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1211 print_warning = True
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1212 elif (_CTAT_RefGenome_Filename in top_dir_contents):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1213 print "1c. Maybe found it."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1214 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1215 genome_build_directory = top_dir_full_path
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1216 print_warning = True
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1217 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1218 print "\n***************************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1219 print "Unable to find CTAT Genome Resource Library " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1220 "in the given directory:\n\t{:s}".format(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1221 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1222 print_directory_contents(top_dir_full_path, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1223 print "***************************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1224 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1225 raise ValueError("Unable to find CTAT Genome Resource Library " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1226 "in the given directory:\n\t{:s}".format(top_dir_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1227 # end else
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1228 # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1229 if (genome_build_directory is None):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1230 print "\n***************************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1231 print "Cannot find the CTAT Genome Resource Library " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1232 "in the given directory:\n\t{:s}".format(top_dir_full_path)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1233 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1234 print_directory_contents(top_dir_full_path, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1235 print "***************************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1236 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1237 raise ValueError("Cannot find the CTAT Genome Resource Library " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1238 "in the given directory:\n\t{:s}".format(top_dir_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1239 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1240 if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1241 print "\n***************************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1242 print "\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1243 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1244 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1245 print_directory_contents(genome_build_directory, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1246 print "***************************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1247 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1248 if print_warning and genome_build_directory:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1249 print "\n***************************************"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1250 print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1251 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1252 print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1253 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1254 print_directory_contents(genome_build_directory, 2)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1255 print "***************************************\n"
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1256 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1257 return genome_build_directory
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1258
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
def build_directory_from_build_location(src_filename, build_location):
    """Return the directory in which the library build should be placed.

    This function is used to make sure our builds follow the convention of
    placing the build in a directory named _CTAT_Build_dirname, which is
    normally inside of a directory named for the genome name.
    However, if the user passes a build_location named _CTAT_Build_dirname,
    that directory will be used, regardless of the name of the enclosing
    directory.

    Args:
        src_filename: Path that is searched first, using
            find_genome_name_in_path(), for a genome name.
        build_location: Directory under which (or in which) the build is to
            be placed. It is also searched for a genome name when
            src_filename does not yield one.

    Returns:
        The path of the build directory, which is one of:
          - build_location itself, when it is already named
            _CTAT_Build_dirname;
          - build_location/_CTAT_Build_dirname, when build_location is
            already named for the genome or no genome name is known;
          - build_location/<genome name>/_CTAT_Build_dirname otherwise.
    """
    genome_dir_name = find_genome_name_in_path(src_filename)
    if (genome_dir_name is None) or (genome_dir_name == ""):
        # Maybe it is in the path of the build_location.
        genome_dir_name = find_genome_name_in_path(build_location)

    # os.path.basename() returns '' for a path that ends with a separator,
    # so strip trailing separators first. Otherwise a build_location such as
    # "/libs/GRCh38/" or "/libs/ctat_genome_lib_build_dir/" would not be
    # recognized and a redundant nested directory would be appended to it.
    location_basename = os.path.basename(build_location.rstrip(os.sep))

    if location_basename == genome_dir_name:
        # The build_location is already the directory named for the genome.
        return os.path.join(build_location, _CTAT_Build_dirname)
    if location_basename == _CTAT_Build_dirname:
        # The build_location is itself the build directory.
        return build_location
    if (genome_dir_name is None) or (genome_dir_name == ""):
        # This can be the case if neither the src_filename nor the
        # build_location contains a directory named for the genome.
        return os.path.join(build_location, _CTAT_Build_dirname)
    return os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1279
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1280 def main():
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1281 # Regarding the command line, there are three basic ways to use this tool:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1282 # 1) Download and Build the CTAT Genome Resource Library from an archive;
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1283 # 2) Build the library from source data files that are already downloaded;
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1284 # 3) Specify the location of an already built library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1285 # Any of these methods can incorporate or be followed by a gmap build.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1286 # Any of these methods can be followed by a mutation resources download and/or integration.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1287 # Choose arguments for only one method.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1288 # Do not use arguments in a mixed manner. I am not writing code to handle that at this time.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1289 parser = argparse.ArgumentParser()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1290 # Arguments for all methods:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1291 parser.add_argument('-o', '--output_filename', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1292 help='Name of the output file, where the json dictionary will be written.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1293 parser.add_argument('-y', '--display_name',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1294 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1295 help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1296 parser.add_argument('-g', '--gmap_build', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1297 help='Will do a gmap_build on the Genome Resource Library, if it has not previously been gmapped.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1298 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1299 parser.add_argument('-f', '--force_gmap_build', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1300 help='Will force gmap_build of the Genome Resource Library, even if previously gmapped.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1301 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1302 parser.add_argument('-m', '--download_mutation_resources_url',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1303 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1304 help='Value should be the url of the zipped up mutation resources. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1305 'These are located at: https://data.broadinstitute.org/Trinity/CTAT/mutation/.' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1306 'Will download mutation resources and integrate them into the Genome Resource Library.' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1307 'Cosmic resources must previously have beeen downloaded (https://cancer.sanger.ac.uk/cosmic/download).' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1308 'Cosmic resources can be placed directly into the Genome Resource Library ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1309 'or you can set the --cosmic_resources_location argument.' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1310 'See https://github.com/NCIP/ctat-mutations/tree/no_sciedpiper/mutation_lib_prep for more info. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1311 'If a previous download and integration was not completed, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1312 'calling with this option set will attempt to finish the integration.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1313 parser.add_argument('-l', '--new_mutation_download', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1314 help='Forces the mutation resources to be downloaded, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1315 'even if previously downloaded into this Genome Resource Library.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1316 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1317 parser.add_argument('-i', '--new_mutation_integration', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1318 help='Forces the mutation resources to be integrated, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1319 'even if previously integrated into this Genome Resource Library.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1320 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1321 parser.add_argument('-c', '--cosmic_resources_location',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1322 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1323 help='Specify a non-default location where the Cosmic files reside. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1324 'Normally they are assumed to reside in the build directory, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1325 'but if that directory has not been created yet when this program ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1326 'is called, you can specify the full path to the directory where they reside.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1327 parser.add_argument('-t', '--cravat_tissues_filepath',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1328 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1329 help='Specify a non-default location where the Cosmic files reside. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1330 'Normally they are assumed to reside in the build directory, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1331 'but if that directory has not been created yet when this program ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1332 'is called, you can specify the full path to the directory where they reside.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1333 # Method 1) arguments - Download and Build.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1334 # - One can optionally utilize --build_location argument with this group of arguments.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1335 download_and_build_args = parser.add_argument_group('Download and Build arguments')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1336 download_and_build_args.add_argument('-u', '--download_url',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1337 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1338 help='This is the url of an archive file containing the library files. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1339 'These are located at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1340 'Works with both source-data and plug-n-play archives.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1341 download_and_build_args.add_argument('-d', '--download_location',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1342 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1343 help='Full path of the CTAT Resource Library download location, where the download will be placed. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1344 'If the archive file has already had been successfully downloaded, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1345 'it will only be downloaded again if --new_archive_download is selected. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1346 'If --build_location is not set, then the archive will be built in place at the download_location. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1347 'If a previous download and build was started but not completed at this or a specified build_location, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1348 'calling with this and the previous option set, but not --new_archive_download, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1349 'will attempt to finish the download and build.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1350 download_and_build_args.add_argument('-a', '--new_archive_download', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1351 help='Forces a new download (and build if needed) of the Genome Resource Library, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1352 'even if previously downloaded and built.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1353 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1354 download_and_build_args.add_argument('-k', '--keep_archive', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1355 help='The archive will not be deleted after it is extracted.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1356 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1357 # Method 2) arguments - Specify source and build locations.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1358 # - One can optionally utilize --build_location argument with this group of arguments.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1359 specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1360 specify_source_and_build_args.add_argument('-s', '--source_location',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1361 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1362 help='Full path to the directory containing CTAT Resource Library source-data files ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1363 'or the full path to a CTAT Resource Library archive file (.tar.gz). ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1364 'If the --build_location option is not set, the reference library will be built in the source_location directory.' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1365 'If a previous download and build was started but not completed at this location, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1366 'calling with this option set, but not --new_library_build, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1367 'will attempt to finish the build.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1368 specify_source_and_build_args.add_argument('-r', '--new_library_build', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1369 help='Forces build of the CTAT Genome Resource Library, even if previously built. ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1370 'The --source_location must be a source-data archive or directory, or this is a no-op.',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1371 action='store_true')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1372 # Method 3) arguments - Specify the location of a built library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1373 built_lib_location_arg = parser.add_argument_group('Specify location of built library arguments')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1374 built_lib_location_arg.add_argument('-b', '--build_location',
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1375 default='', \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1376 help='Full path to the location of a built CTAT Genome Resource Library, ' + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1377 'either where it is, or where it will be placed.')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1378
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1379 args = parser.parse_args()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1380
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1381 # Apparently, Galaxy writes all of the input parameters to the output file prior to
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1382 # this program being called.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1383 # But I do not get input values from the json file, but rather from command line.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1384 # Just leaving the following code as a comment, in case it might be useful to someone later.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1385 # params = from_json_string(open(filename).read())
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1386 # target_directory = params['output_data'][0]['extra_files_path']
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1387 # os.mkdir(target_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1388
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1389 lib_was_built = False
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1390 extracted_directory = None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1391 source_data_directory = None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1392 genome_build_directory = None
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1393 download_url_is_set = (args.download_url is not None) and (args.download_url != "")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1394 download_location_is_set = (args.download_location is not None) and (args.download_location != "")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1395 source_location_is_set = (args.source_location is not None) and (args.source_location != "")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1396 build_location_is_set = (args.build_location is not None) and (args.build_location != "")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1397 mutation_url_is_set = (args.download_mutation_resources_url is not None) \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1398 and (args.download_mutation_resources_url != "")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1399
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1400 if download_url_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1401 print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1402 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1403 if source_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1404 raise ValueError("Argument --source_location cannot be used in combination with --download_url.")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1405 if not download_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1406 raise ValueError("Argument --download_url requires that --download_location be specified.")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1407 downloaded_filename_full_path = \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1408 download_genome_archive(source_url=args.download_url, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1409 destination=args.download_location, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1410 force_new_download=args.new_archive_download)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1411 print "\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1412 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1413
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1414 if ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_SOURCE_DATA:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1415 print "It is source data."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1416 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1417 # If it is source_data, extract to download_location (the directory where the download was placed).
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1418 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1419 destination=args.download_location, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1420 force_new_extraction=args.new_archive_download, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1421 keep_archive=args.keep_archive)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1422 source_data_directory = extracted_directory
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1423 if build_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1424 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1425 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1426 # We will build within a subdirectory of the source_data_directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1427 # The name of the build directory will be the default _CTAT_Build_dirname.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1428 # This _CTAT_Build_dirname directory will not exist until the library is built.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1429 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1430
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1431 elif ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_PLUG_N_PLAY:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1432 print "It is plug-n-play data."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1433 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1434 if build_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1435 # Extract to the build location. The library is already built.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1436 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1437 destination=args.build_location, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1438 force_new_extraction=args.new_archive_download, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1439 keep_archive=args.keep_archive)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1440 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1441 # Extract to the download location.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1442 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1443 destination=args.download_location, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1444 force_new_extraction=args.new_archive_download, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1445 keep_archive=args.keep_archive)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1446 # There is no source_data_directory, so its value stays as None.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1447
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1448 # Look for the build directory. It should be inside the extracted_directory
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1449 if len(os.listdir(extracted_directory)) == 1:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1450 # Then that one file is a subdirectory that should be the build_directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1451 # That is how the plug-n-play directories are structured.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1452 subdir_filename = os.listdir(extracted_directory)[0]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1453 genome_build_directory = os.path.join(extracted_directory, subdir_filename)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1454 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1455 # We need to search for the build directory, since there is more than one file.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1456 genome_build_directory = search_for_genome_build_dir(extracted_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1457 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1458 raise ValueError("Unexpected CTAT Library type. Neither plug-n-play nor source_data:\n\t" + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1459 "{:s}".format(downloaded_filename_full_path))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1460 elif source_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1461 # Then the user wants to build the directory from the source data.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1462 source_data_directory = os.path.realpath(args.source_location)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1463 print "\nThe program is being told that the source data is in:\n\t{:s}.\n".format(str(source_data_directory))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1464 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1465 if build_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1466 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1467 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1468 # We will build within a subdirectory of the source_data_directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1469 # The name of the build directory will be the default _CTAT_Build_dirname.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1470 # This _CTAT_Build_dirname directory will not exist until the library is built.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1471 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1472 elif build_location_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1473 genome_build_directory = args.build_location
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1474
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1475 if (genome_build_directory is None) or (genome_build_directory == ""):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1476 raise ValueError("At least one of --download_url, --source_location, or --build_location must be specified.")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1477
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1478 print "\nThe location where the CTAT Genome Resource Library exists " + \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1479 "or will be built is {:s}.\n".format(str(genome_build_directory))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1480 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1481
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1482 # To take out builds for testing, comment out the lines that do the building.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1483 # The command that builds the ctat genome library also has an option for building the gmap indexes.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1484 # That is why the gmap_build values are sent to build_the_library(), but if we are not building the
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1485 # library, the user might still be asking for a gmap_build. That is done after rechecking for the
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1486 # genome_build_directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1487 if source_data_directory is not None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1488 build_the_library(source_data_directory, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1489 genome_build_directory, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1490 args.new_library_build, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1491 args.gmap_build, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1492 args.force_gmap_build)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1493 lib_was_built = True
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1494
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1495 # The following looks to see if the library actually exists after the build,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1496 # and raises an error if it cannot find the library files.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1497 # The reassignment of genome_build_directory can be superfluous,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1498 # since many times the genome_build_directory will already point to the correct directory.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1499 # There are cases, however, where a user specifies a location that contains the
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1500 # genome_build_directory, rather than being the genome_build_directory itself.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1501 genome_build_directory = search_for_genome_build_dir(genome_build_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1502
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1503 if (args.gmap_build and not lib_was_built):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1504 # If we did not build the genome resource library
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1505 # the user might still be asking for a gmap_build.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1506 gmap_the_library(genome_build_directory, args.force_gmap_build)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1507 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1508
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1509 if mutation_url_is_set:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1510 download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1511 genome_build_directory=genome_build_directory, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1512 cosmic_resources_location=args.cosmic_resources_location, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1513 force_new_download=args.new_mutation_download, \
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1514 force_new_integration=args.new_mutation_integration)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1515
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1516 # Need to get the genome name.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1517 genome_name = find_genome_name_in_path(args.download_url)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1518 if genome_name is None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1519 genome_name = find_genome_name_in_path(genome_build_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1520 if genome_name is None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1521 genome_name = find_genome_name_in_path(extracted_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1522 if genome_name is None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1523 genome_name = find_genome_name_in_path(args.source_location)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1524 if genome_name is None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1525 genome_name = find_genome_name_in_path(args.download_location)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1526 if genome_name is None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1527 genome_name = find_genome_name_in_path(args.display_name)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1528 if genome_name is None:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1529 genome_name = _CTAT_ResourceLib_DefaultGenome
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1530 print "WARNING: We could not find a genome name in any of the directory paths."
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1531 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1532
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1533 # Determine the display_name for the library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1534 if (args.display_name is None) or (args.display_name == ""):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1535 # Create the display_name from the genome_name.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1536 display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1537 else:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1538 display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1539 display_name = display_name.replace(" ","_")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1540
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1541 # Create a unique_id for the library.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1542 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1543 unique_id = genome_name + "." + datetime_stamp
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1544
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1545 print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1546 print "Its unique_id will be set to: {:s}\n".format(unique_id)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1547 print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1548 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1549
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1550 data_manager_dict = {}
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1551 data_manager_dict['data_tables'] = {}
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1552 data_manager_dict['data_tables']['ctat_genome_resource_libs'] = []
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1553 data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1554 data_manager_dict['data_tables']['ctat_genome_resource_libs'].append(data_table_entry)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1555
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1556 # Create the data table for the cravat_tissues, if the file is given:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1557 print "The cravat tissues file is: {:s}".format(str(args.cravat_tissues_filepath))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1558 if (args.cravat_tissues_filepath is not None) and (args.cravat_tissues_filepath != ""):
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1559 data_manager_dict['data_tables']['ctat_cravat_tissues'] = []
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1560 cravat_file = open(args.cravat_tissues_filepath, 'r')
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1561 for line in cravat_file:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1562 # print line
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1563 if line[0] != '#':
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1564 # The line is not a comment, so parse it.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1565 items = [item.strip() for item in line.split("\t")]
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1566 print items
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1567 data_table_entry = dict(value=items[0], name=items[1], code=items[2], date=items[3])
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1568 data_manager_dict['data_tables']['ctat_cravat_tissues'].append(data_table_entry)
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1569
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1570 # Temporarily the output file's dictionary is written for debugging:
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1571 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1572 sys.stdout.flush()
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1573 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1574 # which then puts it into the correct .loc file (I think).
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1575 # One can comment out the following line when testing without galaxy package.
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1576 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
1577
63f45d5fdda9 Uploaded
trinity_ctat
parents:
diff changeset
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()