Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3
annotate data_manager/add_ctat_resource_lib.py @ 32:9b7dc7d09fda draft
Fixing some indentation errors in build_the_library.
author | trinity_ctat |
---|---|
date | Thu, 25 Oct 2018 10:31:19 -0400 |
parents | 0df7a729910d |
children | 91319ae21a16 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/ | |
3 | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
4 # Rewritten by H.E. Cicada Brokaw Dennis from code downloaded from the toolshed and |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
5 # other example code on the web. It has however been extensively modified and augmented. |
0 | 6 # This now allows downloading of a user selected library |
7 # but only from the CTAT Genome Resource Library website. | |
8 # Ultimately we might want to allow the user to specify any location | |
9 # from which to download. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
10 # Users can create or download other libraries and use this Data Manager to add them |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
11 # if they don't want to add them by hand. |
0 | 12 |
13 import argparse | |
14 import os | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
15 import shutil |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
16 import tarfile |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
17 import hashlib |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
18 import urllib |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
19 import urlparse |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
20 import contextlib |
0 | 21 import subprocess |
22 | |
23 # Comment out the following line when testing without galaxy package. | |
24 from galaxy.util.json import to_json_string | |
25 # The following is not being used, but leaving as info | |
26 # in case we ever want to get input values using json. | |
27 # from galaxy.util.json import from_json_string | |
28 | |
29 # datetime.now() is used to create the unique_id | |
30 from datetime import datetime | |
31 | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
32 # The Data Manager uses a subclass of HTMLParser to look through a web page's html |
0 | 33 # searching for the filenames within anchor tags. |
34 import urllib2 | |
35 from HTMLParser import HTMLParser | |
36 | |
37 _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
38 _CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/' |
6 | 39 _CTAT_Build_dirname = 'ctat_genome_lib_build_dir' |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
40 _CTAT_MutationLibDirname = 'ctat_mutation_lib' |
0 | 41 _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_' |
42 _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome' | |
6 | 43 _CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib' |
44 _CTAT_RefGenome_Filename = 'ref_genome.fa' | |
7 | 45 _CTAT_MouseGenome_Prefix = 'Mouse' |
46 _CTAT_HumanGenome_Prefix = 'GRCh' | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
47 _COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
48 _COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
49 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
50 # FIX - The following numbers need to be checked and other numbers for gmap, etc. need to be determined. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
51 # Values for each genome should be determined, so we can get more precise values for each genome. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
52 _NumBytesNeededForSourceDataExtraction = 10737418240 # 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
53 _NumBytesNeededForPlugNPlayExtraction = 48318382080 # 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
54 # Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
55 # Fix - check amount with gmap. |
8 | 56 _NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct. |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
57 _NumBytesNeededForMutationResources = 4294967296 # 4 Gigabytes. Actually need about 3.8GB. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
58 # Once built the downloaded archive could be deleted to reduce the amount used, but with the archive |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
59 # there and the Cosmic files and the built ctat_mutation_library, 3.8GB is needed. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
60 # If the archive files are deleted after the integration of the library, only 1.8GB would be used at that point. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
61 _Write_TestFile = 'write_testfile.txt' |
0 | 62 _DownloadSuccessFile = 'download_succeeded.txt' |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
63 _ExtractionSuccessFile = 'extraction_succeeded.txt' |
8 | 64 _LibBuiltSuccessFile = 'build_succeeded.txt' |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
65 _GmapSuccessFile = 'gmap_succeeded.txt' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
66 _MutationDownloadSuccessFile = 'mutation_download_succeeded.txt' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
67 _MutationIntegrationSuccessFile = 'mutation_integration_succeeded.txt' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
68 _LIBTYPE_SOURCE_DATA = 'source_data' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
69 _LIBTYPE_PLUG_N_PLAY = 'plug-n-play' |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
70 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
71 class resumable_URL_opener(urllib.FancyURLopener): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
72 # This class is used to do downloads that can restart a download from |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
73 # the point where it left off after a partial download was interrupted. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
74 # This class and code using it was found online: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
75 # http://code.activestate.com/recipes/83208-resuming-download-of-a-file/ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
76 # A sub-class is created in order to override error 206. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
77 # This error means a partial file is being sent, |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
78 # which is ok in this case. Do nothing with this error. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
79 def http_error_206(self, url, fp, errcode, errmsg, headers, data=None): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
80 pass |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
81 # End of class resumable_URL_opener |
0 | 82 |
83 class FileListParser(HTMLParser): | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
84 # The FileListParser object is used by get_ctat_genome_urls() and get_mutation_resource_urls(), |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
85 # which can be called by the Data Manager interface (.xml file) to get |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
86 # the filenames that are available online at broadinstitute.org |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
87 # Apparently creating dynamic option lists this way is deprecated, but no |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
88 # other method exists by which I can get the options dynamically from the web. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
89 # I believe that it is considered a security risk. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
90 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
91 # This HTMLParser facilitates getting url's of tar.gz links in an HTML page. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
92 # These are assumed to be files that can be downloaded and are the files we |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
93 # are particularly interested in for this Data Manager. |
0 | 94 def __init__(self): |
95 # Have to use direct call to super class rather than using super(): | |
96 # super(FileListParser, self).__init__() | |
97 # because HTMLParser is an "old style" class and its inheritance chain does not include object. | |
98 HTMLParser.__init__(self) | |
99 self.urls = set() | |
100 def handle_starttag(self, tag, attrs): | |
101 # Look for filename references in anchor tags and add them to urls. | |
102 if tag == "a": | |
103 # The tag is an anchor tag. | |
104 for attribute in attrs: | |
105 # print "Checking: {:s}".format(str(attribute)) | |
106 if attribute[0] == "href": | |
107 # Does the href have a tar.gz in it? | |
108 if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]): | |
109 # Add the value to urls. | |
110 self.urls.add(attribute[1]) | |
111 # End of class FileListParser | |
112 | |
113 def get_ctat_genome_urls(): | |
114 # open the url and retrieve the urls of the files in the directory. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
115 # If we can't get the list, send a default list. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
116 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
117 build_default_list = False |
0 | 118 resource = urllib2.urlopen(_CTAT_ResourceLib_URL) |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
119 if resource is None: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
120 build_default_list = True |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
121 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
122 theHTML = resource.read() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
123 if (theHTML is None) or (theHTML == ""): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
124 build_default_list = True |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
125 if build_default_list: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
126 # These are the filenames for what was there at least until 2018/10/09. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
127 urls_to_return = set() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
128 urls_to_return.add("GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
129 urls_to_return.add("GRCh37_v19_CTAT_lib_Feb092018.source_data.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
130 urls_to_return.add("GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
131 urls_to_return.add("GRCh38_v27_CTAT_lib_Feb092018.source_data.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
132 urls_to_return.add("Mouse_M16_CTAT_lib_Feb202018.plug-n-play.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
133 urls_to_return.add("Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
134 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
135 filelist_parser = FileListParser() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
136 filelist_parser.feed(theHTML) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
137 urls_to_return = filelist_parser.urls |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
138 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
139 # For dynamic options we need to return an iterable whose contents are tuples with 3 items. |
0 | 140 # Item one is a string that is the display name put into the option list. |
141 # Item two is the value that is put into the parameter associated with the option list. | |
142 # Item three is a True or False value, indicating whether the item is selected. | |
143 options = [] | |
144 for i, url in enumerate(filelist_parser.urls): | |
5 | 145 # The urls should look like: |
0 | 146 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz |
147 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz | |
8 | 148 # But in actuality, they are coming in looking like: |
5 | 149 # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz |
150 # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz | |
151 # Write code to handle both situations, or an ftp: url. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
152 url_parts = urlparse.urlparse(url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
153 if (url_parts.scheme != ""): |
5 | 154 full_url_path = url |
155 else: | |
156 # Assume the path is relative to the page location. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
157 full_url_path = os.path.join(_CTAT_ResourceLib_URL, url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
158 filename = os.path.basename(url) |
8 | 159 # if filename.split("_")[0] != _CTAT_MouseGenome_Prefix: |
160 # # Don't put in the mouse genome options for now. | |
161 # # The mouse genome option is not handled correctly yet | |
162 # options.append((filename, full_url_path, i == 0)) | |
163 # Mouse genomes should work now (we hope) - FIX - still not tested. | |
164 options.append((filename, full_url_path, i == 0)) | |
165 options.sort() # So the list will be in alphabetical order. | |
166 # Return the sorted list of (display name, url, selected) option tuples. | |
167 print "The list being returned as options is:" | |
168 print "{:s}\n".format(str(options)) | |
169 return options | |
5 | 170 |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
171 def get_mutation_resource_urls(): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
172 # FIX - Rather than letting user choose mutation resource url, |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
173 # download the correct one for the chosen library? |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
174 # Not sure about this. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
175 # In that case don't provide a pull down interface for this. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
176 # FIX - |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
177 build_default_list = False |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
178 resource = urllib2.urlopen(_CTAT_Mutation_URL) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
179 if resource is None: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
180 build_default_list = True |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
181 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
182 theHTML = resource.read() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
183 if (theHTML is None) or (theHTML == ""): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
184 build_default_list = True |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
185 if build_default_list: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
186 # These are the filenames for what was there at least until 2018/10/09. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
187 urls_to_return = set() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
188 urls_to_return.add("mutation_lib.hg19.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
189 urls_to_return.add("mutation_lib.hg38.tar.gz") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
190 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
191 filelist_parser = FileListParser() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
192 filelist_parser.feed(theHTML) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
193 urls_to_return = filelist_parser.urls |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
194 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
195 # For dynamic options we need to return an iterable whose contents are tuples with 3 items. |
8 | 196 # Item one is a string that is the display name put into the option list. |
197 # Item two is the value that is put into the parameter associated with the option list. | |
198 # Item three is a True or False value, indicating whether the item is selected. | |
199 options = [] | |
200 for i, url in enumerate(filelist_parser.urls): | |
201 # The urls should look like: | |
202 # https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz | |
203 # https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz | |
204 # But in actuality, they are coming in looking like: | |
205 # hg19.tar.gz | |
206 # mc7.tar.gz | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
207 # |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
208 # On 2018/10/06, the following tar.gz files were present: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
209 # mutation_lib.hg19.tar.gz |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
210 # mutation_lib.hg38.tar.gz |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
211 # mc-7.tar.gz |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
212 # ctat_mutation_demo.tar.gz |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
213 # |
8 | 214 # Write code to handle both situations, or an ftp: url. |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
215 url_parts = urlparse.urlparse(url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
216 if (url_parts.scheme != ""): |
8 | 217 full_url_path = url |
218 else: | |
219 # Assume the path is relative to the page location. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
220 full_url_path = os.path.join(_CTAT_Mutation_URL, url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
221 filename = os.path.basename(url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
222 if (filename.split(".")[0] == "mutation_lib"): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
223 # As of 2018_10_09, the only ones supported have mutation_lib as the first part of the name. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
224 options.append((filename, full_url_path, i == 0)) |
5 | 225 options.sort() # So the list will be in alphabetical order. |
0 | 226 # Return the sorted list of (display name, url, selected) option tuples. |
4 | 227 print "The list being returned as options is:" |
228 print "{:s}\n".format(str(options)) | |
0 | 229 return options |
230 | |
231 # The following was used by the example program to get input parameters through the json. | |
232 # Just leaving here for reference. | |
233 # We are getting all of our parameter values through command line arguments. | |
234 #def get_reference_id_name(params): | |
235 # genome_id = params['param_dict']['genome_id'] | |
236 # genome_name = params['param_dict']['genome_name'] | |
237 # return genome_id, genome_name | |
238 # | |
239 #def get_url(params): | |
240 # trained_url = params['param_dict']['trained_url'] | |
241 # return trained_url | |
242 | |
6 | 243 def print_directory_contents(dir_path, num_levels): |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
244 # This procedure is used to help with debugging and for user information. |
6 | 245 if num_levels > 0: |
246 if os.path.exists(dir_path) and os.path.isdir(dir_path): | |
247 print "\nDirectory {:s}:".format(dir_path) | |
248 subprocess.call("ls -la {:s} 2>&1".format(dir_path), shell=True) | |
249 else: | |
250 print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path) | |
251 if num_levels > 1: | |
8 | 252 if os.path.exists(dir_path) and os.path.isdir(dir_path): |
253 for filename in os.listdir(dir_path): | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
254 filename_path = os.path.join(dir_path, filename) |
8 | 255 if os.path.exists(filename_path) and os.path.isdir(filename_path): |
256 print_directory_contents(filename_path, num_levels-1) | |
257 else: | |
258 print "Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path) | |
6 | 259 |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
260 def which(file): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
261 # This procedure is similar to the linux "which" command. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
262 # It is used to find the location of an executable program that is in the PATH. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
263 # However this implementation does not check whether the program's file is executable. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
264 for path in os.environ["PATH"].split(os.pathsep): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
265 if os.path.exists(os.path.join(path, file)): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
266 return os.path.join(path, file) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
267 return None |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
268 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
269 def size_of_file_at(file_url): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
270 # Returns the size of the file at file_url. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
271 # We have to open the file, in order to find out how big it is. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
272 file_retriever = resumable_URL_opener() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
273 with contextlib.closing(file_retriever.open(file_url)) as filelike_object: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
274 filesize = int(filelike_object.headers['Content-Length']) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
275 return filesize |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
276 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
277 def md5sum_for(filename, blocksize=2**20): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
278 # I got this code for this function off the web, but don't remember where. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
279 m = hashlib.md5() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
280 finished = False |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
281 with open(filename, "rb" ) as f: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
282 while not finished: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
283 buf = f.read(blocksize) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
284 if buf: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
285 m.update( buf ) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
286 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
287 finished = True |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
288 return m.hexdigest() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
289 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def ctat_library_type(filepath):
    """Return the library-type field embedded in a CTAT archive file name.

    The type is the second dot-separated field of the file's base name, for
    example "plug-n-play" in
    "GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz".
    If the file name indicates source_data, as opposed to plug-n-play,
    the library has to be built after it is downloaded.

    Raises IndexError if the base name contains no "." at all.
    """
    name_fields = os.path.basename(filepath).split(".")
    # Field 0 is the genome/library name; field 1 is the library type.
    return name_fields[1]
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
299 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def find_genome_name_in_path(path, raise_error=False):
    """Return the CTAT genome name found in one of the components of path.

    The form of the genome name in directory or file names (if present in
    the path) looks like:
        GRCh37_v19_CTAT_lib_Feb092018
        GRCh38_v27_CTAT_lib_Feb092018
        Mouse_M16_CTAT_lib_Feb202018
    A path component is treated as a genome name when it starts with
    _CTAT_MouseGenome_Prefix or _CTAT_HumanGenome_Prefix. Anything from the
    first "." onwards is dropped, so archive file names work too.
    If several components match, the last one in the path wins.

    path        -- the path to search; may be None or "".
    raise_error -- if True, raise instead of returning an empty result.

    Returns the genome name, or None if none was found.
    Raises ValueError if no genome name was found and raise_error is True.
    """
    genome_name = None
    if path:
        genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
        for element in path.split(os.sep):
            if element.startswith(genome_prefixes):
                # Remove any extension that might be in the filename.
                genome_name = element.split(".")[0]
    if not genome_name and raise_error:
        # The placeholder was missing originally, so the path was never
        # shown. str() keeps the message usable when path is None.
        raise ValueError(
            "Cannnot find genome name in the given filename path:\n\t{:s}".format(str(path)))
    return genome_name
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
317 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def bytes_needed_to_extract(archive_filepath):
    """Return an estimate of the bytes needed to extract the given archive.

    The estimate depends only on the library type encoded in the archive's
    file name: source_data archives use
    _NumBytesNeededForSourceDataExtraction, everything else is assumed to be
    a plug-n-play archive and uses _NumBytesNeededForPlugNPlayExtraction.
    """
    # FIX -- This should be replaced by statements that return the right
    # value for each archive. The numbers used now are estimates for the
    # human genome, and so are big enough for the mouse genome, so ok for now.
    # But now we are also using this for the mutation resource files,
    # so we really need to FIX this.
    if ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA:
        return _NumBytesNeededForSourceDataExtraction
    # Assume otherwise that it is a plug-n-play archive.
    return _NumBytesNeededForPlugNPlayExtraction
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
329 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def bytes_needed_to_build(source_data_filepath):
    """Return an estimate of the bytes needed to build a library from source data.

    source_data_filepath is currently not examined; the same module-level
    estimate, _NumBytesNeededForBuild, is returned for every archive.
    """
    # FIX - This should be replaced by statements that return the right value
    # for each archive. The number used now is meant to estimate the largest
    # size needed. Also, it is probably not correct.
    estimated_bytes = _NumBytesNeededForBuild
    return estimated_bytes
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
334 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def create_success_file(full_file_path, contents=None):
    """Create (or overwrite) a file that marks an operation as successful.

    full_file_path -- the path to the file to write. It should not exist
                      before calling this function, but if it does,
                      it will be overwritten.
    contents       -- optional text to write into the file. If it is None,
                      nothing is written, but the (empty) file is still
                      created.

    Raises IOError (after printing a message) if the file cannot be written.
    """
    try:
        with open(full_file_path, "w") as success_file:
            if contents is not None:
                success_file.write(contents)
            # else nothing is written into it, but we still will have created the file.
    except IOError:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("The success indication file could not be created: " +
              "{:s}".format(full_file_path))
        raise
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
350 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def download_file_from_url(file_url, dest_dir, resume_download=True):
    """Download the file at file_url into dest_dir, resuming if possible.

    Some of the code used in this procedure was downloaded and modified for our needs.
    That code was at: http://code.activestate.com/recipes/83208-resuming-download-of-a-file/

    file_url        -- must specify a file to download; the destination file
                       name is taken from the end of the url's path.
    dest_dir        -- directory that receives the file. It is best to fully
                       specify it, otherwise it is interpreted relative to
                       whatever the current working directory is.
    resume_download -- if True (the default), and the destination file
                       already exists, only the missing remainder is
                       requested (via a "Range" header) and appended, e.g.
                       after a previous download was interrupted. If False,
                       any existing copy is overwritten by a new download.

    Returns the full path of the downloaded file.
    Raises OSError if the device holding dest_dir does not have room for the
    part of the file still to be downloaded.
    Raises IOError if the download fails, or if the size of the destination
    file differs from the size of the source file afterwards.
    """
    # DOWNLOAD_BLOCK_SIZE = 65536 # 64KB. Old number was 8192 or 8KB.
    DOWNLOAD_BLOCK_SIZE = 1048576  # 1 MB
    download_complete = False
    existing_size = 0
    bytes_read = 0
    file_retriever = resumable_URL_opener()
    dest_filename = os.path.basename(file_url)
    dest_fullpath = os.path.join(dest_dir, dest_filename)
    source_filesize = size_of_file_at(file_url)
    print("Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize))
    print("Destination file for the download is {:s}".format(dest_fullpath))

    # If the file exists and resume_download is requested, then only download the remainder
    if resume_download and os.path.exists(dest_fullpath):
        existing_size = os.path.getsize(dest_fullpath)
        print("The destination file exists and is {:d} bytes in size.".format(existing_size))
        if source_filesize == existing_size:
            # The file exists and we already have the whole thing, don't download again.
            print("The file has already been completely downloaded:\n\t{:s}".format(dest_fullpath))
            download_complete = True
        else:
            range_header = ("Range", "bytes={:s}-".format(str(existing_size)))
            # str() is required: formatting a tuple with "{:s}" is an error on Python 3.
            print("Adding header to resume download:\n\t{:s}".format(str(range_header)))
            file_retriever.addheader(*range_header)
        # We open even if download is complete, to avoid adding code to determine whether to close.
        open_mode = "ab"
    else:
        if os.path.exists(dest_fullpath):
            print("resume_download is set to False. Download will overwrite an existing file.")
        else:
            print("The destination file does not exist yet.")
        existing_size = 0
        open_mode = "wb"

    source_file = None
    output_file = open(dest_fullpath, open_mode)
    try:
        # Check whether there is enough space on the device for the rest of the file to download.
        statvfs = os.statvfs(dest_dir)
        # num_avail_bytes is the number of free bytes that ordinary users
        # are allowed to use (excl. reserved space)
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        # Perhaps should subtract some padding amount from num_avail_bytes
        # rather than raising only if there is less than exactly what is needed.
        if num_avail_bytes < (source_filesize - existing_size):
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                          " on the device of the destination directory for the download: " +
                          "{:s}".format(os.path.realpath(dest_dir)))

        if not download_complete:
            # Only contact the server when there is something left to fetch.
            source_file = file_retriever.open(file_url)
            while True:
                data = source_file.read(DOWNLOAD_BLOCK_SIZE)
                if not data:
                    break
                output_file.write(data)
                bytes_read = bytes_read + len(data)
    except IOError:
        print("Error while attempting to download {:s}".format(file_url))
        raise
    finally:
        # Release both handles, even if the download failed part way through.
        try:
            output_file.close()
        finally:
            if source_file is not None:
                source_file.close()

    if source_file is not None:
        # Show the response headers of the request that was made.
        for k, v in source_file.headers.items():
            print("{} = {}".format(k, v))
    print("Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url)))
    dest_filesize = os.path.getsize(dest_fullpath)
    print("{:s} {:s}".format(str(dest_filesize), str(dest_fullpath)))
    if source_filesize != dest_filesize:
        raise IOError("Download error:\n\t" +
                      "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) +
                      "and the destination file\n\t\t{:d}\t{:s}\n\t".format(dest_filesize, dest_fullpath) +
                      "are different sizes.")
    return dest_fullpath
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
433 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def ensure_we_can_write_numbytes_to(destination, numbytes):
    """Make sure destination is a writable directory with numbytes of free space.

    Attempts to create the destination directory if it does not exist.
    Tests whether a file can be written to that directory.
    Tests whether there is numbytes space on the device of the destination.

    destination -- the directory to check; relative paths are resolved with
                   os.path.realpath().
    numbytes    -- the number of bytes that must be available to ordinary
                   users on the destination's device.

    Returns the full specification of the destination path.
    Raises ValueError if destination exists but is not a directory.
    Raises OSError if the directory cannot be created or there is not
    enough space, and IOError if the directory cannot be written into.
    """
    # We want to make sure that destination is an absolute fully specified path.
    cannonical_destination = os.path.realpath(destination)
    if os.path.exists(cannonical_destination):
        if not os.path.isdir(cannonical_destination):
            raise ValueError("The destination is not a directory: " +
                             "{:s}".format(cannonical_destination))
        # else all is good. It is a directory.
    else:
        # We need to create it since it does not exist.
        try:
            os.makedirs(cannonical_destination)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(cannonical_destination))
            raise
    # Make sure the directory now exists and we can write to it.
    if not os.path.exists(cannonical_destination):
        # It should have been created, but if it doesn't exist at this point
        # in the code, something is wrong. Raise an error.
        raise OSError("The destination directory could not be created: " +
                      "{:s}".format(cannonical_destination))
    test_writing_filename = "{:s}.{:s}".format(os.path.basename(cannonical_destination), _Write_TestFile)
    test_writing_filepath = os.path.join(cannonical_destination, test_writing_filename)
    probe_file_opened = False
    try:
        with open(test_writing_filepath, "w") as test_writing_file:
            probe_file_opened = True
            test_writing_file.write("Testing writing to this file.")
    except IOError:
        print("The destination directory could not be written into:\n\t" +
              "{:s}".format(cannonical_destination))
        raise
    finally:
        # Remove the probe file even when the write failed after the file
        # was created, so a failed check does not leave a stray file behind.
        if probe_file_opened and os.path.exists(test_writing_filepath):
            os.remove(test_writing_filepath)
    # Check whether there are numbytes available on cannonical_destination's device.
    statvfs = os.statvfs(cannonical_destination)
    # fs_size = statvfs.f_frsize * statvfs.f_blocks          # Size of filesystem in bytes
    # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree    # Actual number of free bytes
    # Number of free bytes that ordinary users are allowed to use (excl. reserved space)
    num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
    if num_avail_bytes < numbytes:
        raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                      " on the device of the destination directory:\n\t" +
                      "{:s}\n\t{:d} bytes are needed.".format(cannonical_destination, numbytes))

    return cannonical_destination
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
485 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
486 def download_genome_archive(source_url, destination, force_new_download=False): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
487 # This function downloads but does not extract the archive at source_url. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
488 # This function can be called on a file whose download was interrupted, and if force_new_download |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
489 # is False, the download will proceed where it left off. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
490 # If download does not succeed, an IOError is raised. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
491 # The function checks whether there is enough space at the destination for the expanded library. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
492 # It raises an OSError if not. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
493 # ValueError can also be raised by this function. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
494 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
495 # Input Parameters |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
496 # source_url is the full URL of the file we want to download. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
497 # It should look something like: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
498 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
499 # If only the filename is given, it is assumed to reside at _CTAT_ResourceLib_URL. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
500 # destination is the location (directory) where a copy of the source file will be placed. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
501 # Relative paths are expanded using the current working directory, so within Galaxy, |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
502 # it is best to send in absolute fully specified path names so you know to where |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
503 # the source file is going to be copied. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
504 # force_new_download if True, will cause a new download to occur, even if the file has been downloaded previously. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
505 # |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
506 # Returns the canonical path to the file that was downloaded. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
507 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
508 dest_fullpath = None |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
509 url_parts = urlparse.urlparse(source_url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
510 source_filename = os.path.basename(url_parts.path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
511 if url_parts.scheme == "": |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
512 # Then we were given a source_url without a leading https: or similar. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
513 # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
514 source_url = urlparse.urljoin(_CTAT_ResourceLib_URL, source_url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
515 # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
516 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
517 print "Downloading:\n\t{:s}".format(str(source_url)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
518 print "to:\n\t{:s}".format(destination) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
519 # The next is done so that if the source_url does not have a genome name in it, an error will be raised. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
520 find_genome_name_in_path(source_url, raise_error=True) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
521 cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url)) |
0 | 522 |
523 # Get the list of files in the directory, | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
524 # We use it to check for a previous download. |
0 | 525 orig_files_in_destdir = set(os.listdir(cannonical_destination)) |
526 # See whether the file has been downloaded already. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
527 download_success_filename = "{:s}.{:s}".format(source_filename, _DownloadSuccessFile) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
528 download_success_full_file_path = os.path.join(cannonical_destination, download_success_filename) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
529 if ((download_success_filename not in orig_files_in_destdir) \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
530 or force_new_download): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
531 if (download_success_filename in orig_files_in_destdir): |
0 | 532 # Since we are redoing the download, |
533 # the success file needs to be removed | |
534 # until the download has succeeded. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
535 os.remove(download_success_full_file_path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
536 # The following raises an error if the download fails for some reason. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
537 dest_fullpath = download_file_from_url(source_url, cannonical_destination, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
538 resume_download=(not force_new_download)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
539 # Check the md5sum of the cannonical_destination file to ensure the data in the file is correct. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
540 file_retriever = resumable_URL_opener() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
541 md5_url = "{:s}.md5".format(source_url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
542 print "Checking the md5sum of the downloaded file." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
543 try: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
544 md5_file = file_retriever.open(md5_url, "r") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
545 md5sum_from_web = md5_file.readlines()[0].strip().split()[0] |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
546 md5_file.close() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
547 md5sum_from_file = md5sum_for(dest_fullpath) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
548 except IOError: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
549 print "Error while attempting to check the md5sum for {:s}".format(dest_fullpath) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
550 raise |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
551 if md5sum_from_web != md5sum_from_file: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
552 raise IOError("Download error:\n\t" + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
553 "The md5 sum for\n\t\t({:s})\n\t".format(dest_fullpath) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
554 "does not match the value read from the web:\n\t\t" + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
555 "({:s} != {:s})".format(md5sum_from_file, md5sum_from_web)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
556 print "Check of md5sum succeeded." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
557 create_success_file(download_success_full_file_path, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
558 "Download of:\n\t{:s}\n".format(source_url) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
559 "to:\n\t{:s}\nsucceeded.".format(dest_fullpath)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
560 elif download_success_filename in orig_files_in_destdir: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
561 print "The download success file exists, so no download is being attempted:" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
562 print "\t{:s}".format(download_success_full_file_path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
563 print "Remove the file or set <Force New Download> if you want a new download to occur." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
564 dest_filename = os.path.basename(source_url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
565 dest_fullpath = os.path.join(cannonical_destination, dest_filename) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
566 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
567 print "download_genome_archive(): This code should never be printed. Something is wrong." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
568 |
0 | 569 # Some code to help us if errors occur. |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
570 print "\n*******************************" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
571 print "* Finished download. *" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
572 print_directory_contents(cannonical_destination, 1) |
6 | 573 print "*******************************\n" |
0 | 574 |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
575 return dest_fullpath |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
576 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def extract_archive(archive_filepath, destination, force_new_extraction=False):
    """Extract a tar archive into destination unless a prior extraction is recorded.

    archive_filepath is the archive to open with tarfile (any compression
    tarfile can detect, because the mode is "r:*").
    destination is handed to ensure_we_can_write_numbytes_to() together with
    bytes_needed_to_extract(archive_filepath); the value that call returns is
    the directory that the archive is extracted into.
    force_new_extraction, when True, extracts again even if the extraction
    success file is already present in that directory.

    The extraction success file is never written here. It is only removed
    (when an extraction is being redone), so that the caller can write it
    after doing its own checks on what was extracted.

    Returns None.
    """
    target_dir = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))

    # Name and location of the marker file that records a prior successful extraction.
    marker_name = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    marker_path = os.path.join(target_dir, marker_name)

    previously_extracted = marker_name in os.listdir(target_dir)

    if previously_extracted and not force_new_extraction:
        # The archive was successfully extracted before so we do not do it again.
        print("The extraction success file exists, so no new extraction was attempted:")
        print("\t{:s}".format(marker_name))
        print("Remove the success file or set <force new extraction> if you want a new extraction to occur.")
    else:
        if previously_extracted:
            # A redo is in progress, so the marker must not exist
            # again until the new extraction has succeeded.
            os.remove(marker_path)
        with tarfile.open(archive_filepath, mode="r:*") as tar_handle:
            tar_handle.extractall(path=target_dir)

    # Directory listing to help with diagnosis if errors occur.
    print("\n*******************************************************")
    print("* Finished extraction. Destination directory listing. *")
    print_directory_contents(target_dir, 1)
    print("*******************************************************\n")
    return
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
616 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
    """Extract a CTAT Genome Reference Library archive file.

    archive_filepath is the archive to extract. It is best if it is an
    absolute, fully specified filepath, not a relative one.
    destination is the directory to which the archive will be extracted.
    force_new_extraction can be used to cause extraction to occur, even if
    the file was extracted before.
    keep_archive, when False, causes the archive file to be removed once the
    extracted directory has been found.

    Returns extracted_directory: the full path of the top level directory
    that is created by the extraction of the files from the archive.

    Raises ValueError if that directory cannot be found in the destination.
    find_genome_name_in_path() is called with raise_error=True, so it may
    also raise if no genome name is found in archive_filepath.
    """
    print("Extracting:\n\t {:s}".format(str(archive_filepath)))
    print("to:\n\t{:s}".format(destination))
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
    # Get the root filename of the Genome Directory from the source file's name.
    # That should also be the name of the extracted directory.
    genome_dirname = find_genome_name_in_path(archive_filepath, raise_error=True)

    # Compare directory listings taken before and after the extraction
    # to learn which entries the extraction created.
    orig_files_in_destination = set(os.listdir(cannonical_destination))
    extract_archive(archive_filepath, destination, force_new_extraction)
    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destination

    if genome_dirname not in newfiles_in_destdir:
        # Perhaps it has a different name than what we expect it to be.
        # It will be a sub-directory that was not in the directory
        # before we did the download and extraction.
        found_filename = None
        if len(newfiles_in_destdir) == 1:
            # A set cannot be indexed, so take its only element via an iterator.
            found_filename = next(iter(newfiles_in_destdir))
        else:
            # In most cases, there will only be one new file, but some OS's might have
            # created other files in the directory.
            # Look for the directory that was extracted. The correct name should be
            # a substring of the name of the archive file that was extracted.
            archive_filename = os.path.basename(archive_filepath)
            # Sorted so that the choice is deterministic when several entries match.
            for filename in sorted(newfiles_in_destdir):
                if filename in archive_filename \
                        and os.path.isdir(os.path.join(cannonical_destination, filename)):
                    found_filename = filename
        if found_filename is not None:
            genome_dirname = found_filename

    extracted_directory = os.path.join(cannonical_destination, genome_dirname)
    if not os.path.exists(extracted_directory):
        raise ValueError("ERROR: Could not find the extracted directory in the destination directory:" + \
                         "\n\t{:s}".format(cannonical_destination))

    # Record the success only now that the extracted directory is known to exist.
    extraction_success_filename = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    extraction_success_full_file_path = os.path.join(cannonical_destination, extraction_success_filename)
    create_success_file(extraction_success_full_file_path, \
                        "Extraction of:\n\t{:s}\n".format(archive_filepath) + \
                        "to:\n\t{:s}\nsucceeded.".format(extracted_directory))

    if not keep_archive:
        # We are done extracting, so remove the archive file.
        if os.path.exists(archive_filepath):
            print("Removing the archive file:\n\t{:s}".format(archive_filepath))
            os.remove(archive_filepath)
        # else: It was removed previously, so we don't need to remove it again.
    return extracted_directory
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
676 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def gmap_the_library(genome_build_directory, force_new_gmap=False):
    """Run gmap_build on the library, which gmap-fusion needs in order to work.

    genome_build_directory should normally be a fully specified path,
    though this function should work even if it is relative.
    force_new_gmap, when True, runs gmap_build even if the gmap success
    file is already present in genome_build_directory.

    The success file is written inside genome_build_directory after
    gmap_build finishes without error. subprocess.CalledProcessError is
    re-raised if the command fails.

    Returns None.
    """
    # Name and location of the marker file that records a prior successful gmap.
    success_name = "{:s}.{:s}".format(os.path.basename(genome_build_directory), _GmapSuccessFile)
    success_path = os.path.join(genome_build_directory, success_name)

    previously_gmapped = success_name in os.listdir(genome_build_directory)

    if previously_gmapped and not force_new_gmap:
        print("The gmap success file exists, so no gmap is being attempted:")
        print("\t{:s}".format(success_path))
        print("Remove the file or set <force new gmap> if you want a new gmap to occur.")
        return

    if previously_gmapped:
        # A redo is in progress, so the marker must not exist
        # again until the new gmap has succeeded.
        os.remove(success_path)

    # The command prints messages out to stderr, even when there is not an error,
    # so route stderr to stdout. Otherwise, galaxy thinks an error occurred.
    # NOTE(review): the directory is interpolated unquoted into a shell command,
    # so a path containing spaces or shell metacharacters would break it - confirm
    # callers never pass such a path.
    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format(
        genome_build_directory, genome_build_directory)
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        raise
    finally:
        # Directory listing to help with diagnosis if errors occur.
        print("\n*******************************\nAfter running gmap_build.")
        print_directory_contents(genome_build_directory, 2)
        print("*******************************\n")

    create_success_file(success_path,
                        "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    return
0 | 718 |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
719 |
32
9b7dc7d09fda
Fixing some indentation errors in build_the_library.
trinity_ctat
parents:
31
diff
changeset
|
def build_the_library(genome_source_directory, genome_build_directory, force_new_build=False, gmap_build=False):
    """Build a CTAT Genome Resource Library by running prep_genome_lib.pl.

    genome_source_directory is the location of the source data needed to
        build the library. Normally it is fully specified, but it may be
        relative to the current working directory; it is resolved to an
        absolute path before the working directory is changed.
    genome_build_directory is the location where the library will be built.
        It can be relative to the current working directory or an absolute path.
    force_new_build specifies whether to run prep_genome_lib.pl even if the
        build success file shows that it was run before. When no build is
        run, it is passed on to gmap_the_library().
    gmap_build specifies whether to run gmap_build or not.

    Raises ValueError if genome_source_directory is None, empty or missing.
    Re-raises subprocess.CalledProcessError if prep_genome_lib.pl fails.

    Side effect: when a build is run, the current working directory of the
    process is changed to the source directory and is not restored.

    Following was the old way to do it. Before FusionFilter 0.5.0.
        prep_genome_lib.pl
           --genome_fa ref_genome.fa
           --gtf ref_annot.gtf
           --blast_pairs blast_pairs.gene_syms.outfmt6.gz
           --fusion_annot_lib fusion_lib.dat.gz
           --output_dir ctat_genome_lib_build_dir
        index_pfam_domain_info.pl
            --pfam_domains PFAM.domtblout.dat.gz
            --genome_lib_dir ctat_genome_lib_build_dir
        gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa
    """
    if (genome_source_directory is None) or (genome_source_directory == "") \
            or not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " +
                         "The source directory does not exist:\n\t{:s}".format(str(genome_source_directory)))

    # The name of the success file is derived from the path exactly as it was given,
    # so that marker files written by earlier runs are still recognized.
    src_filename = os.path.basename(genome_source_directory)
    # We chdir into the source directory below. A relative path would no longer
    # point at the right place after that, so resolve it once, up front.
    genome_source_directory = os.path.abspath(genome_source_directory)

    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory,
                                                             bytes_needed_to_build(genome_source_directory))
    print("Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory)))
    print("The Destination directory is at:\n\t{:s}".format(str(cannonical_destination)))

    # See whether the library has been built already. The success file is written into the source directory.
    files_in_sourcedir = set(os.listdir(genome_source_directory))
    build_success_filename = "{:s}.{:s}".format(src_filename, _LibBuiltSuccessFile)
    build_success_file_path = os.path.join(genome_source_directory, build_success_filename)
    already_built = build_success_filename in files_in_sourcedir

    if already_built and not force_new_build:
        print("The build success file exists, so no build is being attempted:")
        print("\t{:s}".format(build_success_file_path))
        print("Remove the file or set <force new build> if you want a new build to occur.")
        # We might still need to do a gmap_build.
        if gmap_build:
            print("Checking if we need to gmap the library.")
            # gmap_the_library creates a gmap success file if it succeeds.
            gmap_the_library(cannonical_destination, force_new_build)
        return

    # prep_genome_lib.pl is given file names relative to the source directory.
    os.chdir(genome_source_directory)
    if already_built:
        # Since we are redoing the build, the success file needs to be removed
        # until the build has succeeded.
        os.remove(build_success_file_path)

    # Create the command that builds the Genome Resource Library from the source data.
    # It is an argument list rather than a shell string, so that paths containing
    # spaces or shell metacharacters are passed through unchanged.
    command = ["prep_genome_lib.pl",
               "--genome_fa", "ref_genome.fa",
               "--gtf", "ref_annot.gtf",
               "--pfam_db", "PFAM.domtblout.dat.gz",
               "--output_dir", str(cannonical_destination)]

    # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
    # We only check the prefix, in case other versions are used later.
    # I assume there is only one in the directory, but if there are more than one,
    # the later one, alphabetically, will be used. The listing is sorted to
    # guarantee that, because os.listdir() returns names in arbitrary order.
    fusion_lib_candidates = sorted(filename for filename in os.listdir(genome_source_directory)
                                   if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix)
    if fusion_lib_candidates:
        # The mouse genomes do not have a fusion_annot_lib
        # so only add the following for Human genomes.
        command += ["--fusion_annot_lib", fusion_lib_candidates[-1],
                    "--annot_filter_rule", "AnnotFilterRule.pm"]
    if gmap_build:
        command.append("--gmap_build")

    command_text = " ".join(command)
    print("About to run the following command:\n\t{:s}".format(command_text))
    try:  # to send the prep_genome_lib command.
        # Send stderr of the command to stdout, because some functions may write to stderr,
        # even though no error has occurred. We will depend on error code return in order
        # to know if an error occurred.
        subprocess.check_call(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the prep_genome_lib.pl command " +
              "on the CTAT Genome Resource Library:\n\t{:s}".format(command_text))
        raise
    finally:
        # Some code to help us if errors occur.
        print("\n*******************************")
        print("Contents of Genome Source Directory {:s}:".format(genome_source_directory))
        print_directory_contents(genome_source_directory, 2)
        print("\nContents of Genome Build Directory {:s}:".format(cannonical_destination))
        print_directory_contents(cannonical_destination, 2)
        print("*******************************\n")

    # Only reached when the command succeeded; a failure was re-raised above.
    create_success_file(build_success_file_path,
                        "Build of:\n\t{:s}\n".format(genome_source_directory) +
                        "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    if gmap_build:
        # Create the gmap success file.
        gmap_success_filename = "{:s}.{:s}".format(src_filename, _GmapSuccessFile)
        gmap_success_full_file_path = os.path.join(cannonical_destination, gmap_success_filename)
        create_success_file(gmap_success_full_file_path,
                            "gmap of:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    return
    # End of build_the_library()
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
825 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def find_path_to_mutation_lib_integration():
    """Return the expected path of ctat-mutation-lib-integration.py.

    We are assuming that we exist inside of a conda environment and that the
    directory that we want is in the share directory, one level up from the
    bin directory that contains the ctat_mutations command. The entry of the
    share directory whose name contains "ctat-mutations" is used; if there is
    more than one, the alphabetically last one is chosen.

    The returned path is constructed, not checked for existence.

    Raises ValueError if ctat_mutations cannot be found, or if the share
    directory is missing or has no entry containing "ctat-mutations".
    """
    path_to_ctat_mutations = which("ctat_mutations")
    if not path_to_ctat_mutations:
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resource processing.")
    conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
    share_dir = os.path.join(conda_root_dir, "share")
    # os.listdir() returns names in arbitrary order, so sort to make the choice
    # deterministic. A missing share directory is treated the same as "not found",
    # so callers get the ValueError below instead of a bare OSError.
    candidates = []
    if os.path.isdir(share_dir):
        candidates = sorted(filename for filename in os.listdir(share_dir)
                            if "ctat-mutations" in filename)
    if not candidates:
        raise ValueError("Unable to find the home of ctat_mutations.\n" +
                         "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return os.path.join(share_dir,
                        candidates[-1],
                        "mutation_lib_prep",
                        "ctat-mutation-lib-integration.py")
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
848 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def find_path_to_picard_home():
    """Return the value to use for PICARD_HOME.

    The ctat_mutations shell script defines PICARD_HOME, so that file is
    scanned first for an "export ... PICARD_HOME=..." line. If several lines
    match, the last one wins. If none is found, the share directory one level
    up from the bin directory that contains ctat_mutations is searched for an
    entry whose name contains "picard"; the alphabetically last match is used.

    Raises ValueError if ctat_mutations cannot be found, or if PICARD_HOME
    can be determined neither from the script nor from the share directory.
    """
    path_to_ctat_mutations = which("ctat_mutations")
    if not path_to_ctat_mutations:
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resources processing.")

    picard_home = None
    # Use a context manager so the script file is closed even if reading fails.
    with open(path_to_ctat_mutations, "r") as ctat_mutations_file:
        for line in ctat_mutations_file:
            if ("export" in line) and ("PICARD_HOME=" in line):
                # Get everything after "PICARD_HOME=" (so a value that itself contains
                # an equal sign is not cut short) and strip off the newline at the end.
                # Then strip off quotes at begin and end if they are there.
                # And then strip off any other whitespace that might have been inside of stripped off quotes.
                picard_home = line.partition("PICARD_HOME=")[2].strip().strip('"').strip()

    if not picard_home:
        # We didn't find it in the ctat_mutations file. Search for it.
        conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
        share_dir = os.path.join(conda_root_dir, "share")
        # Sorted because os.listdir() order is arbitrary; a missing share directory
        # simply yields no candidates and is reported by the ValueError below.
        candidates = []
        if os.path.isdir(share_dir):
            candidates = sorted(filename for filename in os.listdir(share_dir)
                                if "picard" in filename)
        if not candidates:
            raise ValueError("Unable to find PICARD_HOME.\n" +
                             "It should be in the share directory:\n\t{:s}.".format(share_dir))
        picard_home = os.path.join(share_dir, candidates[-1])
    return picard_home
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
873 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
874 def download_and_integrate_mutation_resources(source_url, genome_build_directory, cosmic_resources_location=None, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
875 force_new_download=False, force_new_integration=False): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
876 # source_url is the url of the mutation resources archive to download. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
877 # genome_build_dir is the location where the archive will be placed. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
878 # If cosmic_files_location is set, that is the location where the files are presumed to exist. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
879 # If cosmic_files_location is not set, the files will assumed to exist in genome_build_directory. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
880 # If force_new_download is True, then even if the archive has previously been downloaded, |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
881 # it will be downloaded again. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
882 """ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
883 From https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
884 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
885 Step 1 (after CTAT Genome Resource Library is built) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
886 download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
887 or |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
888 download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
889 or |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
890 download mc-7.tar.gz into Mouse_M16_CTAT_lib_Feb202018 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
891 (Need to ask about support for mouse, since there is not info about Cosmic mouse genome files in instracutions.) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
892 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
893 Step 2: Cosmic files download - User must perform this step prior to running this code. We check if files are present. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
894 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
895 Next download COSMIC resources required in this directory. Depending on the version of genome you need you can install either COSMIC's hg38 or COSMIC's hg19. You will need to download 2 sets of files: COSMIC Mutation Data (CosmicMutantExport.tsv.gz) and COSMIC Coding Mutation VCF File (CosmicCodingMuts.vcf.gz). Please note, for download to succeed you will need to register and login to their service. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
896 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
897 So is there a way the user can give their credentials through the Data Manager interface as a part of specifying Mutation parameters and then I can programatically use those credentials to download the file, or maybe instead, the interface needs to have the intructions for the user to download the files, then the use needs to specify the absolute path to where those files are. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
898 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
899 Step 3: Mutation lib integration |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
900 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
901 Once you have downloaded CosmicMutantExport.tsv.gz AND CosmicCodingMuts.vcf.gz (hg38 or hg19), proceed with mutation lib integration step which will integrate the mutation resource with CTAT_GENOME_LIB (This corresponds to "GRCh37_v19_CTAT_lib_Feb092018" or "GRCh38_v27_CTAT_lib_Feb092018" downloaded in Step 1). You will find this script in ctat-mutations repo in 'src' directory. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
902 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
903 #Keep Picard in PICARD_HOME environmental variable like so |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
904 export PICARD_HOME=/path/to/picard |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
905 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
906 #Integrate CTAT mutations lib with CTAT genome library |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
907 python ctat-mutations/mutation_lib_prep/ctat-mutation-lib-integration.py \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
908 --CosmicMutantExport CosmicMutantExport.tsv.gz \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
909 --CosmicCodingMuts CosmicCodingMuts.vcf.gz \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
910 --genome_lib_dir GRCh37_v19_CTAT_lib_Feb092018/ # OR GRCh38_v27_CTAT_lib_Feb092018/ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
911 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
912 Now you are all set to run the ctat-mutations pipeline |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
913 """ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
914 print "\n***********************************" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
915 print "* Integrating Mutation Resources. *" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
916 print "***********************************\n" |
8 | 917 # It is assumed that this procedure is only called with a valid genome_build_directory. |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
918 url_parts = urlparse.urlparse(source_url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
919 source_filename = os.path.basename(url_parts.path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
920 if url_parts.scheme == "": |
8 | 921 # Then we were given a source_url without a leading https: or similar. |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
922 # Assume we only were given the filename and that it exists at _CTAT_Mutation_URL. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
923 source_url = urlparse.urljoin(_CTAT_Mutation_URL, source_url) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
924 # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
925 cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, _NumBytesNeededForMutationResources) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
926 print "Download and Integrate a Mutation Resource Archive." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
927 print "The source URL is:\n\t{:s}".format(str(source_url)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
928 print "The destination is:\n\t{:s}".format(str(cannonical_destination)) |
8 | 929 # Get the list of files in the directory, |
930 # We use it to check for a previous download or extraction among other things. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
931 orig_files_in_destdir = set(os.listdir(cannonical_destination)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
932 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
933 # DOWNLOAD SECTION |
8 | 934 # See whether the index file has been downloaded already. |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
935 download_success_file = "{:s}.{:s}".format(source_filename, _MutationDownloadSuccessFile) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
936 download_success_file_path = os.path.join(cannonical_destination, download_success_file) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
937 if ((download_success_file not in orig_files_in_destdir) or force_new_download): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
938 # DO THE DOWNLOAD |
8 | 939 if (download_success_file in orig_files_in_destdir): |
940 # Since we are redoing the download, | |
941 # the success file needs to be removed | |
942 # until the download has succeeded. | |
943 os.remove(download_success_file_path) | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
944 # The following raises an IOError if the download fails for some reason. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
945 archive_fullpath = download_file_from_url(source_url, cannonical_destination, resume_download=(not force_new_download)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
946 create_success_file(download_success_file_path, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
947 "Download of the mutation resource archive:\n\t{:s}\n".format(source_url) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
948 "to:\n\t{:s}\nsucceeded.".format(cannonical_destination)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
949 elif (download_success_file in orig_files_in_destdir): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
950 print "The download success file exists, so no download is being attempted:" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
951 print "\t{:s}".format(download_success_file_path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
952 print "Remove the file or set <new_mutation_download> if you want a new download to occur." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
953 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
954 print "download_and_integrate_mutation_resources() - Download: This code should never be printed. Something is wrong." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
955 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
956 # INTEGRATION SECTION |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
957 integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
958 integration_success_file_path = os.path.join(cannonical_destination, integration_success_file) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
959 if ((integration_success_file not in orig_files_in_destdir) or force_new_integration): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
960 # INTEGRATE THE LIBRARY |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
961 if (integration_success_file in orig_files_in_destdir): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
962 # Since we are redoing the integration, |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
963 # the success file needs to be removed |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
964 # until the integration has succeeded. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
965 os.remove(integration_success_file_path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
966 mutation_lib_dirpath = os.path.join(cannonical_destination, _CTAT_MutationLibDirname) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
967 # If we do not remove the directory, then the old files will exist and a new integration does not occur. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
968 # Also, with the Cosmic files, when the integrated file is created, if there is a previous one, gzip |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
969 # asks a question of the user, and this program is not prepared to respond to a question from a subprocess: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
970 # [bgzip] /path/to/ctat_mutation_lib/cosmic.vcf.gz already exists; do you wish to overwrite (y or n)? |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
971 if os.path.exists(mutation_lib_dirpath): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
972 shutil.rmtree(mutation_lib_dirpath) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
973 # Check for Cosmic resources. User has to place these files into the correct location. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
974 if (cosmic_resources_location is None) or (cosmic_resources_location == ""): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
975 cosmic_resources_loc_full_path = cannonical_destination |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
976 end_err_msg = "These files must be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path) |
8 | 977 else: |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
978 cosmic_resources_loc_full_path = os.path.realpath(cosmic_resources_location) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
979 end_err_msg = "This function was told they would be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
980 cosmic_mutant_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Mutant_Filename) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
981 cosmic_coding_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Coding_Filename) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
982 if not (os.path.exists(cosmic_mutant_full_path) and os.path.exists(cosmic_coding_full_path)): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
983 raise IOError("Either one or both of Cosmic Resources are missing:\n\t" + \ |
31
0df7a729910d
Fixing an error message to use correct filename.
trinity_ctat
parents:
11
diff
changeset
|
984 "{:s}\nand/or\n\t{:s}\n".format(cosmic_mutant_full_path, cosmic_coding_full_path) + \ |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
985 "Unable to integrate mutation resources.\n{:s}".format(end_err_msg)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
986 # Create the integration command. We also must define PICARD_HOME for the command to work. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
987 picard_home = find_path_to_picard_home() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
988 integration_command = find_path_to_mutation_lib_integration() |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
989 command = "export PICARD_HOME={:s} && python {:s} ".format(picard_home, integration_command) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
990 "--CosmicMutantExport {:s} ".format(cosmic_mutant_full_path) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
991 "--CosmicCodingMuts {:s} ".format(cosmic_coding_full_path) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
992 "--genome_lib_dir {:s}".format(cannonical_destination) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
993 try: # to send the ctat-mutation-lib-integration command. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
994 subprocess.check_call(command, shell=True) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
995 except subprocess.CalledProcessError: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
996 print "ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
997 raise |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
998 finally: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
999 # Some code to help us if errors occur. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1000 print "/n*********************************************************" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1001 print "* After download and integration of Mutation Resources. *" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1002 print_directory_contents(cannonical_destination, 2) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1003 print "*********************************************************\n" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1004 create_success_file(integration_success_file_path, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1005 "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1006 "to:\n\t{:s}\nsucceeded.".format(genome_build_directory)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1007 elif (integration_success_file in orig_files_in_destdir): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1008 print "The mutation resources integration success file exists, so no integration is being attempted:" |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1009 print "\t{:s}".format(integration_success_file_path) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1010 print "Remove the file or set <new_mutation_integration> if you want a new integration to occur." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1011 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1012 print "download_and_integrate_mutation_resources() - Integration: This code should never be printed. Something is wrong." |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1013 return |
0 | 1014 |
def _raise_library_search_error(message, top_dir_full_path, extra_line=None):
    # Print a banner-framed report of a failed library search and then
    # raise ValueError(message). The report contains the message, an optional
    # extra line, and the output of print_directory_contents() for the
    # directory that was searched.
    print("\n***************************************")
    print(message)
    if extra_line is not None:
        print(extra_line)
    print_directory_contents(top_dir_full_path, 2)
    print("***************************************\n")
    raise ValueError(message)


def search_for_genome_build_dir(top_dir_path):
    """Return the path of the CTAT Genome Resource Library build directory.

    The caller may pass the same value for top_dir_path that is used when a
    build happens, so three layouts are handled:
      1) top_dir_path is itself the build directory
         (its basename is _CTAT_Build_dirname),
      2) the build directory is directly inside of top_dir_path,
      3) the build directory is inside a subdirectory of top_dir_path
         (the search goes two directory levels down).
    If no directory named _CTAT_Build_dirname is found, a single directory
    that contains _CTAT_RefGenome_Filename is accepted instead, and a warning
    is printed because that may not be the correct directory.

    Parameters:
        top_dir_path: path of the directory in which to start the search.

    Returns:
        The path of the directory chosen as the genome build directory.

    Raises:
        ValueError: if top_dir_path does not exist or is not a directory,
            if more than one candidate directory is found, or if no
            candidate is found at all.
    """
    # Single-argument print(...) calls are used throughout; they produce the
    # same output under the Python 2 print statement and the print function.
    top_dir_full_path = os.path.realpath(top_dir_path)
    genome_build_directory = None
    print_warning = False

    if not os.path.exists(top_dir_full_path):
        raise ValueError("Cannot find the CTAT Genome Resource Library. " +
                         "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
    if not os.path.isdir(top_dir_full_path):
        raise ValueError("Cannot find the CTAT Genome Resource Library. " +
                         "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))

    if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
        # The top_dir_path is the path to the genome_build_directory.
        print("Build directory is: {:s}".format(top_dir_full_path))
        genome_build_directory = top_dir_full_path
    else:
        # Look for it inside of the top_dir_path directory.
        print("Looking inside of: {:s}".format(top_dir_full_path))
        top_dir_contents = os.listdir(top_dir_full_path)
        if _CTAT_Build_dirname in top_dir_contents:
            # The genome_build_directory is inside of the top_dir_path directory.
            print("1. Found it.")
            genome_build_directory = os.path.join(top_dir_full_path, _CTAT_Build_dirname)
        else:
            # Find all subdirectories containing the _CTAT_Build_dirname or
            # the _CTAT_RefGenome_Filename. Look down the directory tree two levels.
            build_dirs_in_subdirs = []
            subdirs_with_genome_files = []
            build_dirs_in_sub_subdirs = []
            sub_subdirs_with_genome_files = []
            for subdir in top_dir_contents:
                subdir_path = os.path.join(top_dir_full_path, subdir)
                if not os.path.isdir(subdir_path):
                    continue
                subdir_path_contents = os.listdir(subdir_path)
                if _CTAT_Build_dirname in subdir_path_contents:
                    # A build directory is inside of the subdir_path directory.
                    print("2a, Found one.")
                    build_dirs_in_subdirs.append(os.path.join(subdir_path, _CTAT_Build_dirname))
                if _CTAT_RefGenome_Filename in subdir_path_contents:
                    subdirs_with_genome_files.append(subdir_path)
                # Since we are already looping, look one level deeper as well.
                for sub_subdir in subdir_path_contents:
                    sub_subdir_path = os.path.join(subdir_path, sub_subdir)
                    if not os.path.isdir(sub_subdir_path):
                        continue
                    sub_subdir_path_contents = os.listdir(sub_subdir_path)
                    if _CTAT_Build_dirname in sub_subdir_path_contents:
                        # A build directory is inside of the sub_subdir_path directory.
                        print("3a. Found one.")
                        build_dirs_in_sub_subdirs.append(
                            os.path.join(sub_subdir_path, _CTAT_Build_dirname))
                    if _CTAT_RefGenome_Filename in sub_subdir_path_contents:
                        sub_subdirs_with_genome_files.append(sub_subdir_path)

            # Hopefully there is one and only one found build directory.
            # If none are found we check for a directory containing the genome
            # reference file, but the build process sometimes causes more than
            # one directory to have a copy, so finding that file is not a sure thing.
            num_build_dirs = len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)
            num_genome_dirs = len(subdirs_with_genome_files) + len(sub_subdirs_with_genome_files)
            if num_build_dirs > 1:
                _raise_library_search_error(
                    "Found multiple CTAT Genome Resource Libraries " +
                    "in the given directory:\n\t{:s}".format(top_dir_full_path),
                    top_dir_full_path)
            elif len(build_dirs_in_subdirs) == 1:
                # The genome_build_directory is inside of a subdirectory.
                print("2b, Found it.")
                genome_build_directory = build_dirs_in_subdirs[0]
            elif len(build_dirs_in_sub_subdirs) == 1:
                # The genome_build_directory is inside of a sub-subdirectory.
                print("3b, Found it.")
                genome_build_directory = build_dirs_in_sub_subdirs[0]
            elif num_genome_dirs > 1:
                _raise_library_search_error(
                    "Unable to find CTAT Genome Resource Library " +
                    "in the given directory:\n\t{:s}".format(top_dir_full_path),
                    top_dir_full_path,
                    extra_line="And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename))
            elif len(sub_subdirs_with_genome_files) == 1:
                print("3c, Maybe found it.")
                genome_build_directory = sub_subdirs_with_genome_files[0]
                print_warning = True
            elif len(subdirs_with_genome_files) == 1:
                print("2c, Maybe found it.")
                genome_build_directory = subdirs_with_genome_files[0]
                print_warning = True
            elif _CTAT_RefGenome_Filename in top_dir_contents:
                print("1c. Maybe found it.")
                genome_build_directory = top_dir_full_path
                print_warning = True
            else:
                _raise_library_search_error(
                    "Unable to find CTAT Genome Resource Library " +
                    "in the given directory:\n\t{:s}".format(top_dir_full_path),
                    top_dir_full_path)

    # Defensive guard: every branch above either sets genome_build_directory
    # or raises, so this is not expected to trigger.
    if genome_build_directory is None:
        _raise_library_search_error(
            "Cannot find the CTAT Genome Resource Library " +
            "in the given directory:\n\t{:s}".format(top_dir_full_path),
            top_dir_full_path)

    # Check if the CTAT Genome Resource Lib has the genome reference file in it.
    # A missing file is only reported; it is not treated as an error here.
    if _CTAT_RefGenome_Filename not in os.listdir(genome_build_directory):
        print("\n***************************************")
        print("\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) +
              "in the genome build directory:\n\t{:s}".format(genome_build_directory))
        print_directory_contents(genome_build_directory, 2)
        print("***************************************\n")
    if print_warning and genome_build_directory:
        # The directory was chosen only because it holds the reference genome file.
        print("\n***************************************")
        print("\nWARNING: Cannot find the CTAT Genome Resource Library, " +
              "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename))
        print("This may not be the correct directory:\n\t{:s}".format(genome_build_directory))
        print_directory_contents(genome_build_directory, 2)
        print("***************************************\n")
    return genome_build_directory
1154 | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
def build_directory_from_build_location(src_filename, build_location):
    """Return the build directory path that corresponds to build_location.

    The genome directory name is obtained from
    find_genome_name_in_path(src_filename). Three cases are handled:
      * build_location is the genome directory: the build directory is the
        _CTAT_Build_dirname directory inside of it.
      * build_location is already a directory named _CTAT_Build_dirname:
        it is returned as given.
      * otherwise: the build directory is
        build_location/<genome directory name>/_CTAT_Build_dirname.

    Parameters:
        src_filename: value passed to find_genome_name_in_path() to get the
            genome directory name.
        build_location: path given as the location for the build.

    Returns:
        The path of the build directory.
    """
    genome_dir_name = find_genome_name_in_path(src_filename)
    # os.path.basename() returns "" for a path that ends with a separator,
    # which would make both comparisons below fail and nest the genome and
    # build directory names a second time. Normalize the path for the
    # comparison only; the returned paths still start from the caller's value.
    location_name = os.path.basename(os.path.normpath(build_location))
    if location_name == genome_dir_name:
        return os.path.join(build_location, _CTAT_Build_dirname)
    if location_name == _CTAT_Build_dirname:
        return build_location
    # NOTE(review): assumes find_genome_name_in_path() always returns a
    # string here - confirm against its definition.
    return os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
7 | 1165 |
0 | 1166 def main(): |
10 | 1167 #Parse Command Line. There are three basic ways to use this tool. |
1168 # 1) Download and Build the CTAT Genome Resource Library from an archive. | |
1169 # 2) Build the library from source data files that are already downloaded. | |
1170 # 3) Specify the location of an already built library. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1171 # Any of these methods can incorporate or be followed by a gmap build. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1172 # Any of these methods can be followed by a mutation resources download and/or integration. |
10 | 1173 # Choose arguments for only one method. |
1174 # Do not use arguments in a mixed manner. I am not writing code to handle that at this time. | |
0 | 1175 parser = argparse.ArgumentParser() |
10 | 1176 # Arguments for all methods: |
0 | 1177 parser.add_argument('-o', '--output_filename', \ |
1178 help='Name of the output file, where the json dictionary will be written.') | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1179 parser.add_argument('-y', '--display_name', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1180 default='', \ |
10 | 1181 help='Is used as the display name for the entry of this Genome Resource Library in the data table.') |
8 | 1182 parser.add_argument('-g', '--gmap_build', \ |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1183 help='Will do a gmap_build on the Genome Resource Library, if it has not previously been gmapped.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1184 action='store_true') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1185 parser.add_argument('-f', '--force_gmap_build', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1186 help='Will force gmap_build of the Genome Resource Library, even if previously gmapped.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1187 action='store_true') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1188 parser.add_argument('-m', '--download_mutation_resources_url', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1189 default='', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1190 help='Value should be the url of the zipped up mutation resources. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1191 'These are located at: https://data.broadinstitute.org/Trinity/CTAT/mutation/.' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1192 'Will download mutation resources and integrate them into the Genome Resource Library.' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1193 'Cosmic resources must previously have beeen downloaded (https://cancer.sanger.ac.uk/cosmic/download).' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1194 'Cosmic resources can be placed directly into the Genome Resource Library ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1195 'or you can set the --cosmic_resources_location argument.' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1196 'See https://github.com/NCIP/ctat-mutations/tree/no_sciedpiper/mutation_lib_prep for more info. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1197 'If a previous download and integration was not completed, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1198 'calling with this option set will attempt to finish the integration.') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1199 parser.add_argument('-l', '--new_mutation_download', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1200 help='Forces the mutation resources to be downloaded, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1201 'even if previously downloaded into this Genome Resource Library.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1202 action='store_true') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1203 parser.add_argument('-i', '--new_mutation_integration', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1204 help='Forces the mutation resources to be integrated, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1205 'even if previously integrated into this Genome Resource Library.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1206 action='store_true') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1207 parser.add_argument('-c', '--cosmic_resources_location', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1208 default='', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1209 help='Specify a non-default location where the Cosmic files reside. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1210 'Normally they are assumed to reside in the build directory, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1211 'but if that directory has not been created yet when this program ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1212 'is called, you can specify the full path to the directory where they reside.') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1213 # Method 1) arguments - Download and Build. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1214 # - One can optionally use the --build_location argument with this group of arguments. |
10 | 1215 download_and_build_args = parser.add_argument_group('Download and Build arguments') |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1216 download_and_build_args.add_argument('-u', '--download_url', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1217 default='', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1218 help='This is the url of an archive file containing the library files. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1219 'These are located at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1220 'Works with both source-data and plug-n-play archives.') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1221 download_and_build_args.add_argument('-d', '--download_location', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1222 default='', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1223 help='Full path of the CTAT Resource Library download location, where the download will be placed. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1224 'If the archive file has already had been successfully downloaded, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1225 'it will only be downloaded again if --new_archive_download is selected. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1226 'If --build_location is not set, then the archive will be built in place at the download_location. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1227 'If a previous download and build was started but not completed at this or a specified build_location, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1228 'calling with this and the previous option set, but not --new_archive_download, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1229 'will attempt to finish the download and build.') |
10 | 1230 download_and_build_args.add_argument('-a', '--new_archive_download', \ |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1231 help='Forces a new download (and build if needed) of the Genome Resource Library, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1232 'even if previously downloaded and built.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1233 action='store_true') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1234 download_and_build_args.add_argument('-k', '--keep_archive', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1235 help='The archive will not be deleted after it is extracted.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1236 action='store_true') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1237 # Method 2) arguments - Specify source and build locations. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1238 specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1239 specify_source_and_build_args.add_argument('-s', '--source_location', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1240 default='', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1241 help='Full path to the directory containing CTAT Resource Library source-data files ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1242 'or the full path to a CTAT Resource Library archive file (.tar.gz). ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1243 'If the --build_location option is not set, the reference library will be built in the source_location directory.' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1244 'If a previous download and build was started but not completed at this location, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1245 'calling with this option set, but not --new_library_build, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1246 'will attempt to finish the build.') |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1247 specify_source_and_build_args.add_argument('-r', '--new_library_build', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1248 help='Forces build of the CTAT Genome Resource Library, even if previously built. ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1249 'The --source_location must be a source-data archive or directory, or this is a no-op.', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1250 action='store_true') |
10 | 1251 # Method 3) arguments - Specify the location of a built library. |
1252 built_lib_location_arg = parser.add_argument_group('Specify location of built library arguments') | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1253 built_lib_location_arg.add_argument('-b', '--build_location', |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1254 default='', \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1255 help='Full path to the location of a built CTAT Genome Resource Library, ' + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1256 'either where it is, or where it will be placed.') |
10 | 1257 |
0 | 1258 args = parser.parse_args() |
1259 | |
1260 # All of the input parameters are written by default to the output file prior to | |
1261 # this program being called. | |
1262 # However, this program takes its input values from the command line rather than from the json file. | |
1263 # Just leaving the following code as a comment, in case it might be useful to someone later. | |
1264 # params = from_json_string(open(filename).read()) | |
1265 # target_directory = params['output_data'][0]['extra_files_path'] | |
1266 # os.mkdir(target_directory) | |
1267 | |
10 | 1268 print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url)) |
4 | 1269 |
8 | 1270 lib_was_built = False |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1271 extracted_directory = None |
10 | 1272 source_data_directory = None |
0 | 1273 genome_build_directory = None |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1274 download_url_is_set = (args.download_url is not None) and (args.download_url != "") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1275 download_location_is_set = (args.download_location is not None) and (args.download_location != "") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1276 source_location_is_set = (args.source_location is not None) and (args.source_location != "") |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1277 build_location_is_set = (args.build_location is not None) and (args.build_location != "") |
0 | 1278 # FIX - need to make sure we are handling all "possible" combinations of arguments. |
1279 # Probably would be good if we could simplify/remove some of them. | |
6 | 1280 # But I think the current interface is using them all. |
10 | 1281 |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1282 if download_url_is_set: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1283 if source_location_is_set: |
10 | 1284 raise ValueError("Argument --source_location cannot be used in combination with --download_url.") |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1285 if not download_location_is_set: |
10 | 1286 raise ValueError("Argument --download_url requires that --download_location be specified.") |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1287 downloaded_filename_full_path = \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1288 download_genome_archive(source_url=args.download_url, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1289 destination=args.download_location, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1290 force_new_download=args.new_archive_download) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1291 print "\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1292 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1293 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1294 if ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_SOURCE_DATA: |
10 | 1295 print "It is source data." |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1296 # If it is source_data, extract to download_location (the directory where the download was placed). |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1297 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1298 destination=args.download_location, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1299 force_new_extraction=args.new_archive_download, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1300 keep_archive=args.keep_archive) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1301 source_data_directory = extracted_directory |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1302 if build_location_is_set: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1303 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1304 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1305 # We will build within a subdirectory of the source_data_directory . |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1306 # The name of the build directory will be the default _CTAT_Build_dirname. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1307 # This _CTAT_Build_dirname directory will not exist until the library is built. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1308 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1309 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1310 elif ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_PLUG_N_PLAY: |
10 | 1311 print "It is plug-n-play data." |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1312 if build_location_is_set: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1313 # Extract to the build location. The library is already built. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1314 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1315 destination=args.build_location, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1316 force_new_extraction=args.new_archive_download, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1317 keep_archive=args.keep_archive) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1318 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1319 # Extract to the download location. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1320 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1321 destination=args.download_location, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1322 force_new_extraction=args.new_archive_download, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1323 keep_archive=args.keep_archive) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1324 # There is no source_data_directory, so its value stays as None. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1325 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1326 # Look for the build directory. It should be inside the extracted_directory |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1327 if len(os.listdir(extracted_directory)) == 1: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1328 # Then that one file is a subdirectory that should be the build_directory. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1329 # That is how the plug-n-play directories are structured. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1330 subdir_filename = os.listdir(extracted_directory)[0] |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1331 genome_build_directory = os.path.join(extracted_directory, subdir_filename) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1332 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1333 # We need to search for the build directory, since there is more than one file. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1334 genome_build_directory = search_for_genome_build_dir(extracted_directory) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1335 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1336 raise ValueError("Unexpected CTAT Library type. Neither plug-n-play nor source_data:\n\t" + \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1337 "{:s}".format(downloaded_filename_full_path)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1338 elif source_location_is_set: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1339 # Then the user wants to build the directory from the source data. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1340 source_data_directory = os.path.realpath(args.source_location) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1341 print "\nThe user is saying the source data is in:\n\t{:s}.\n".format(str(source_data_directory)) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1342 if build_location_is_set: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1343 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1344 else: |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1345 # We will build within a subdirectory of the source_data_directory . |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1346 # The name of the build directory will be the default _CTAT_Build_dirname. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1347 # This _CTAT_Build_dirname directory will not exist until the library is built. |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1348 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname) |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1349 elif build_location_is_set: |
10 | 1350 genome_build_directory = args.build_location |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1351 |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1352 if (genome_build_directory is None) or (genome_build_directory == ""): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1353 raise ValueError("At least one of --download_url, --source_location, or --build_location must be specified.") |
10 | 1354 |
1355 print "\nThe location where the CTAT Genome Resource Library exists " + \ | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1356 "or will be built is {:s}.\n".format(str(genome_build_directory)) |
0 | 1357 |
8 | 1358 # To take out builds for testing, comment out the lines that do the building. |
1359 # The command that builds the ctat genome library also has an option for building the gmap indexes. | |
1360 # That is why the gmap_build value is sent to build_the_library(), but if we are not building the | |
1361 # library, the user might still be asking for a gmap_build. That is done after rechecking for the | |
1362 # genome_build_directory. | |
10 | 1363 if (source_data_directory is not None): |
1364 build_the_library(source_data_directory, \ | |
1365 genome_build_directory, \ | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1366 args.new_library_build, \ |
10 | 1367 args.gmap_build) |
8 | 1368 lib_was_built = True |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1369 |
6 | 1370 # The following looks to see if the library actually exists after the build, |
1371 # and raises an error if it cannot find the library files. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1372 # The reassignment of genome_build_directory is superfluous in most cases, |
8 | 1373 # since genome_build_directory should already point to the correct directory, |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1374 # except in the case where a user specifies a location that contains the |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1375 # genome_build_directory rather than being the genome_build_directory itself. |
6 | 1376 genome_build_directory = search_for_genome_build_dir(genome_build_directory) |
1377 | |
8 | 1378 if (args.gmap_build and not lib_was_built): |
1379 # If we did not build the genome resource library | |
1380 # the user might still be asking for a gmap_build. | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1381 gmap_the_library(genome_build_directory, args.force_gmap_build) |
8 | 1382 |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1383 if (args.download_mutation_resources_url != ""): |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1384 download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \ |
8 | 1385 genome_build_directory=genome_build_directory, \ |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1386 cosmic_resources_location=args.cosmic_resources_location, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1387 force_new_download=args.new_mutation_download, \ |
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1388 force_new_integration=args.new_mutation_integration) |
8 | 1389 |
7 | 1390 # Need to get the genome name. |
10 | 1391 genome_name = find_genome_name_in_path(args.download_url) |
7 | 1392 if genome_name is None: |
1393 genome_name = find_genome_name_in_path(genome_build_directory) | |
1394 if genome_name is None: | |
11
57428396c6e4
Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents:
10
diff
changeset
|
1395 genome_name = find_genome_name_in_path(extracted_directory) |
7 | 1396 if genome_name is None: |
10 | 1397 genome_name = find_genome_name_in_path(args.source_location) |
1398 if genome_name is None: | |
1399 genome_name = find_genome_name_in_path(args.download_location) | |
7 | 1400 if genome_name is None: |
1401 genome_name = find_genome_name_in_path(args.display_name) | |
1402 if genome_name is None: | |
1403 genome_name = _CTAT_ResourceLib_DefaultGenome | |
1404 print "WARNING: We could not find a genome name in any of the directory paths." | |
0 | 1405 |
1406 # Determine the display_name for the library. | |
1407 if (args.display_name is None) or (args.display_name == ""): | |
7 | 1408 # Create the display_name from the genome_name. |
1409 display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name | |
0 | 1410 else: |
1411 display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name | |
1412 display_name = display_name.replace(" ","_") | |
1413 | |
1414 # Create a unique_id for the library. | |
1415 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f") | |
8 | 1416 unique_id = genome_name + "." + datetime_stamp |
0 | 1417 |
6 | 1418 print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name) |
0 | 1419 print "Its unique_id will be set to: {:s}\n".format(unique_id) |
1420 print "Its dir_path will be set to: {:s}\n".format(genome_build_directory) | |
1421 | |
1422 data_manager_dict = {} | |
1423 data_manager_dict['data_tables'] = {} | |
1424 data_manager_dict['data_tables']['ctat_genome_resource_libs'] = [] | |
1425 data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory) | |
1426 data_manager_dict['data_tables']['ctat_genome_resource_libs'].append(data_table_entry) | |
1427 | |
1428 # Temporarily the output file's dictionary is written for debugging: | |
1429 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict)) | |
1430 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager, | |
1431 # which then puts it into the correct .loc file (I think). | |
1432 # Comment out the following line when testing without galaxy package. | |
1433 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict)) | |
1434 | |
# Script entry point: call main() only when this file is run directly, not when it is imported.
1435 if __name__ == "__main__": | |
1436 main() |