annotate data_manager/add_ctat_resource_lib.py @ 32:9b7dc7d09fda draft

Fixing some indentation errors in build_the_library.
author trinity_ctat
date Thu, 25 Oct 2018 10:31:19 -0400
parents 0df7a729910d
children 91319ae21a16
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
3
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
4 # Rewritten by H.E. Cicada Brokaw Dennis from code downloaded from the toolshed and
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
5 # other example code on the web. It has however been extensively modified and augmented.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
6 # This now allows downloading of a user selected library
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
7 # but only from the CTAT Genome Resource Library website.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
8 # Ultimately we might want to allow the user to specify any location
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
9 # from which to download.
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
10 # Users can create or download other libraries and use this Data Manager to add them
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
11 # if they don't want to add them by hand.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
12
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
13 import argparse
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
14 import os
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
15 import shutil
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
16 import tarfile
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
17 import hashlib
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
18 import urllib
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
19 import urlparse
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
20 import contextlib
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
21 import subprocess
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
22
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
23 # Comment out the following line when testing without galaxy package.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
24 from galaxy.util.json import to_json_string
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
25 # The following is not being used, but leaving as info
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
26 # in case we ever want to get input values using json.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
27 # from galaxy.util.json import from_json_string
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
28
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
29 # datetime.now() is used to create the unique_id
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
30 from datetime import datetime
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
31
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
32 # The Data Manager uses a subclass of HTMLParser to look through a web page's html
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
33 # searching for the filenames within anchor tags.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
34 import urllib2
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
35 from HTMLParser import HTMLParser
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
36
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
# Web locations that are scanned for downloadable archives.
_CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
_CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'

# Directory names, display names and filename fragments.
_CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
_CTAT_MutationLibDirname = 'ctat_mutation_lib'
_CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
_CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
_CTAT_RefGenome_Filename = 'ref_genome.fa'
_CTAT_MouseGenome_Prefix = 'Mouse'
_CTAT_HumanGenome_Prefix = 'GRCh'
_COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz'
_COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz'

# Disk space requirements, in bytes (1024 ** 3 bytes is one Gigabyte here).
# FIX - The following numbers need to be checked and other numbers for gmap, etc. need to be determined.
# Values for each genome should be determined, so we can get more precise values for each genome.

# 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB.
_NumBytesNeededForSourceDataExtraction = 10 * 1024 ** 3
# 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB.
# Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB.
# Fix - check amount with gmap.
_NumBytesNeededForPlugNPlayExtraction = 45 * 1024 ** 3
# 62 Gigabytes. FIX - This might not be correct.
_NumBytesNeededForBuild = 62 * 1024 ** 3
# 4 Gigabytes. Actually need about 3.8GB.
# Once built the downloaded archive could be deleted to reduce the amount used, but with the archive
# there and the Cosmic files and the built ctat_mutation_library, 3.8GB is needed.
# If the archive files are deleted after the integration of the library, only 1.8GB would be used at that point.
_NumBytesNeededForMutationResources = 4 * 1024 ** 3

# Names of the marker files.
_Write_TestFile = 'write_testfile.txt'
_DownloadSuccessFile = 'download_succeeded.txt'
_ExtractionSuccessFile = 'extraction_succeeded.txt'
_LibBuiltSuccessFile = 'build_succeeded.txt'
_GmapSuccessFile = 'gmap_succeeded.txt'
_MutationDownloadSuccessFile = 'mutation_download_succeeded.txt'
_MutationIntegrationSuccessFile = 'mutation_integration_succeeded.txt'

# Library type identifiers.
_LIBTYPE_SOURCE_DATA = 'source_data'
_LIBTYPE_PLUG_N_PLAY = 'plug-n-play'
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
70
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
class resumable_URL_opener(urllib.FancyURLopener):
    """URL opener intended for downloads that restart where they left off.

    This class and the code using it was found online:
    http://code.activestate.com/recipes/83208-resuming-download-of-a-file/

    The only reason for the sub-class is to override the handling of
    HTTP status 206, so that a partial-content reply is not treated as
    a failure.
    """

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        # Status 206 means a partial file is being sent, which is ok
        # when resuming an interrupted download. Deliberately do nothing.
        return None
# End of class resumable_URL_opener
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
82
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
class FileListParser(HTMLParser):
    """Collect the href values of anchor tags that reference tar.gz files.

    The FileListParser object is used by get_ctat_genome_urls() and
    get_mutation_resource_urls(), which can be called by the Data Manager
    interface (.xml file) to get the filenames that are available online
    at broadinstitute.org.
    Apparently creating dynamic option lists this way is deprecated, but no
    other method exists by which I can get the options dynamically from the
    web. I believe that it is considered a security risk.

    After feed() has been called, self.urls holds every href value (exactly
    as written in the page) that contains "tar.gz" and does not contain
    "md5". These are assumed to be files that can be downloaded and are the
    files we are particularly interested in for this Data Manager.
    """

    def __init__(self):
        # Have to use a direct call to the super class rather than super():
        #     super(FileListParser, self).__init__()
        # because in Python 2 HTMLParser is an "old style" class and its
        # inheritance chain does not include object.
        HTMLParser.__init__(self)
        # Set of href strings collected so far.
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag when it looks like a tar.gz link.

        tag   -- the lower-cased tag name supplied by HTMLParser.
        attrs -- list of (name, value) pairs; value is None for an
                 attribute written without a value (e.g. <a href>).
        """
        if tag != "a":
            # Only anchor tags carry the filename references we want.
            return
        for attr_name, attr_value in attrs:
            if attr_name != "href":
                continue
            if not attr_value:
                # A value-less or empty href cannot name a file; skipping it
                # also avoids a TypeError from testing membership in None.
                continue
            # Does the href have a tar.gz in it (and is it not a checksum)?
            if ("tar.gz" in attr_value) and ("md5" not in attr_value):
                self.urls.add(attr_value)
# End of class FileListParser
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
112
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def get_ctat_genome_urls():
    """Return the CTAT genome resource library archives as dynamic options.

    Opens _CTAT_ResourceLib_URL and uses FileListParser to retrieve the
    tar.gz links found in that page. If the page cannot be fetched or is
    empty, a built-in default list of filenames is used instead.

    Returns a list of 3-tuples, sorted alphabetically, as needed for
    dynamic options:
        item one is a string that is the display name put into the option list
        (the basename of the link);
        item two is the value that is put into the parameter associated with
        the option list (the full url);
        item three is a True or False value, indicating whether the item is
        selected. Only the first entry of the sorted list is selected.

    The list is also printed to stdout.
    """
    theHTML = None
    try:
        resource = urllib2.urlopen(_CTAT_ResourceLib_URL)
    except (urllib2.URLError, IOError):
        # urlopen signals failure by raising rather than by returning None.
        resource = None
    if resource is not None:
        try:
            theHTML = resource.read()
        except IOError:
            theHTML = None
        finally:
            resource.close()

    if theHTML:
        filelist_parser = FileListParser()
        filelist_parser.feed(theHTML)
        urls_to_return = filelist_parser.urls
    else:
        # We can't get the list, so send a default list.
        # These are the filenames for what was there at least until 2018/10/09.
        urls_to_return = set([
            "GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz",
            "GRCh37_v19_CTAT_lib_Feb092018.source_data.tar.gz",
            "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
            "GRCh38_v27_CTAT_lib_Feb092018.source_data.tar.gz",
            "Mouse_M16_CTAT_lib_Feb202018.plug-n-play.tar.gz",
            "Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz",
        ])

    name_and_url = []
    for url in urls_to_return:
        # The urls should look like:
        # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
        # But in actuality, they are coming in looking like:
        # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
        # Handle both situations, or an ftp: url.
        if urlparse.urlparse(url).scheme != "":
            full_url_path = url
        else:
            # Assume the path is relative to the page location.
            # urljoin (not os.path.join) applies url resolution rules.
            full_url_path = urlparse.urljoin(_CTAT_ResourceLib_URL, url)
        # Mouse genomes should work now (we hope) - FIX - still not tested.
        name_and_url.append((os.path.basename(url), full_url_path))

    # Sort first, so the list is in alphabetical order and the selected
    # flag lands on the first displayed entry instead of on an arbitrary
    # element of the set.
    name_and_url.sort()
    options = [(filename, full_url_path, position == 0)
               for position, (filename, full_url_path) in enumerate(name_and_url)]
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    return options
5
7f1257532b6f Uploaded
trinity_ctat
parents: 4
diff changeset
170
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def get_mutation_resource_urls():
    """Return the CTAT mutation resource archives as dynamic options.

    Opens _CTAT_Mutation_URL and uses FileListParser to retrieve the tar.gz
    links found in that page. If the page cannot be fetched or is empty, a
    built-in default list of filenames is used instead. Only files whose
    name starts with "mutation_lib." are offered.

    Returns a list of 3-tuples, sorted alphabetically, as needed for
    dynamic options:
        item one is a string that is the display name put into the option list
        (the basename of the link);
        item two is the value that is put into the parameter associated with
        the option list (the full url);
        item three is a True or False value, indicating whether the item is
        selected. Only the first entry of the sorted list is selected.

    The list is also printed to stdout.
    """
    # FIX - Rather than letting user choose mutation resource url,
    # download the correct one for the chosen library?
    # Not sure about this.
    # In that case don't provide a pull down interface for this.
    # FIX -
    theHTML = None
    try:
        resource = urllib2.urlopen(_CTAT_Mutation_URL)
    except (urllib2.URLError, IOError):
        # urlopen signals failure by raising rather than by returning None.
        resource = None
    if resource is not None:
        try:
            theHTML = resource.read()
        except IOError:
            theHTML = None
        finally:
            resource.close()

    if theHTML:
        filelist_parser = FileListParser()
        filelist_parser.feed(theHTML)
        urls_to_return = filelist_parser.urls
    else:
        # We can't get the list, so send a default list.
        # These are the filenames for what was there at least until 2018/10/09.
        urls_to_return = set([
            "mutation_lib.hg19.tar.gz",
            "mutation_lib.hg38.tar.gz",
        ])

    name_and_url = []
    for url in urls_to_return:
        # The urls should look like:
        # https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz
        # https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz
        # But in actuality, they are coming in looking like:
        # hg19.tar.gz
        # mc7.tar.gz
        #
        # On 2018/10/06, the following tar.gz files were present:
        # mutation_lib.hg19.tar.gz
        # mutation_lib.hg38.tar.gz
        # mc-7.tar.gz
        # ctat_mutation_demo.tar.gz
        #
        # Handle both situations, or an ftp: url.
        if urlparse.urlparse(url).scheme != "":
            full_url_path = url
        else:
            # Assume the path is relative to the page location.
            # urljoin (not os.path.join) applies url resolution rules.
            full_url_path = urlparse.urljoin(_CTAT_Mutation_URL, url)
        filename = os.path.basename(url)
        if filename.split(".")[0] == "mutation_lib":
            # As of 2018_10_09, the only ones supported have mutation_lib as the first part of the name.
            name_and_url.append((filename, full_url_path))

    # Sort after filtering, so the list is in alphabetical order and the
    # selected flag always lands on the first displayed entry. Flagging by
    # position in the unfiltered set could select a dropped entry, leaving
    # nothing selected.
    name_and_url.sort()
    options = [(filename, full_url_path, position == 0)
               for position, (filename, full_url_path) in enumerate(name_and_url)]
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    return options
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
230
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
231 # The following was used by the example program to get input parameters through the json.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
232 # Just leaving here for reference.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
233 # We are getting all of our parameter values through command line arguments.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
234 #def get_reference_id_name(params):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
235 # genome_id = params['param_dict']['genome_id']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
236 # genome_name = params['param_dict']['genome_name']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
237 # return genome_id, genome_name
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
238 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
239 #def get_url(params):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
240 # trained_url = params['param_dict']['trained_url']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
241 # return trained_url
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
242
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
def print_directory_contents(dir_path, num_levels):
    # This procedure is used to help with debugging and for user information.
    # It prints an "ls -la" listing of dir_path and then does the same for
    # the subdirectories below it.
    #   dir_path   - the directory to list.
    #   num_levels - how many directory levels to show: 1 lists only dir_path,
    #                2 also lists its immediate subdirectories, and so on.
    #                Nothing at all is printed when num_levels is 0 or less.
    # Returns None. A dir_path that is missing or is not a directory is
    # reported with a printed message rather than an exception.
    if num_levels <= 0:
        return
    # os.path.isdir() is already False for a path that does not exist.
    if not os.path.isdir(dir_path):
        # Reported once, no matter how many levels were requested.
        print("Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path))
        return
    print("\nDirectory {:s}:".format(dir_path))
    # The command is given as an argument list (no shell), so spaces or shell
    # metacharacters in dir_path cannot break or alter the command. The "--"
    # keeps a path starting with "-" from being read as an option.
    # stderr is merged into stdout, which is what "2>&1" did before.
    subprocess.call(["ls", "-la", "--", dir_path], stderr=subprocess.STDOUT)
    if num_levels > 1:
        for filename in os.listdir(dir_path):
            filename_path = os.path.join(dir_path, filename)
            if os.path.isdir(filename_path):
                print_directory_contents(filename_path, num_levels - 1)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
259
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def which(file):
    # This procedure is similar to the linux "which" command.
    # It is used to find the location of a program that is in the PATH.
    #   file - the name of the program to look for.
    # Returns the path of the first PATH directory entry joined with file
    # that exists, or None if there is no such entry or PATH is not set.
    # However this implementation does not check whether the program's file
    # is executable, or even that it is a regular file.
    search_path = os.environ.get("PATH")
    if search_path is None:
        # No PATH in the environment means there is nowhere to search.
        return None
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, file)
        if os.path.exists(candidate):
            return candidate
    return None
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
268
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def size_of_file_at(file_url):
    # Reports the size of the file at file_url, as an integer.
    # The url has to be opened in order to see its headers; the size is the
    # integer value of the response's Content-Length header.
    # The opened url is closed again before this function returns.
    opener = resumable_URL_opener()
    remote = opener.open(file_url)
    with contextlib.closing(remote) as response:
        return int(response.headers['Content-Length'])
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
276
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def md5sum_for(filename, blocksize=2**20):
    # Computes the MD5 checksum of the file named filename.
    #   filename  - path of the file to read (it is opened in binary mode).
    #   blocksize - how many bytes are requested from the file per read.
    # Returns the checksum as a string of hexadecimal digits.
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it hands back
        # an empty result, which is how the end of the file is signalled.
        for chunk in iter(lambda: stream.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
289
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def ctat_library_type(filepath):
    # Returns the string that indicates the library type of the file.
    # That string is the text between the first and second "." of the
    # base name of filepath.
    # When the type indicates source_data, as opposed to plug-n-play,
    # the library will have to be built after it is downloaded.
    # A base name without any "." in it raises an IndexError.
    name_pieces = os.path.basename(filepath).split(".", 2)
    return name_pieces[1]
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
299
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def find_genome_name_in_path(path, raise_error=False):
    # Looks through the elements of path (split on os.sep) for a name that
    # starts with one of the CTAT genome prefixes, and returns that name with
    # everything from its first "." onward removed.
    # The form of the genome name in directory names (if present in the path) looks like:
    #     GRCh37_v19_CTAT_lib_Feb092018
    #     GRCh38_v27_CTAT_lib_Feb092018
    #     Mouse_M16_CTAT_lib_Feb202018
    # If more than one element matches, the one nearest the end of path is used.
    #   path        - the path to search. It may be None or empty.
    #   raise_error - what to do when no genome name is found:
    #                 False (the default) returns None (or "" if the matching
    #                 element starts with "."),
    #                 True raises a ValueError that names the given path.
    genome_name = None
    if path:
        genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
        for element in path.split(os.sep):
            # print "Looking for genome name in {:s}.".format(element)
            if element.startswith(genome_prefixes):
                # Remove any extension that might be in the filename.
                genome_name = element.split(".")[0]
    if not genome_name and raise_error:
        # "{}" rather than "{:s}", so that a path of None can be reported too.
        raise ValueError("Cannot find genome name in the given filename path:\n\t{}".format(path))
    return genome_name
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
317
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def bytes_needed_to_extract(archive_filepath):
    # Returns the number of bytes to allow for extracting the given archive.
    # FIX -- This should be replaced by statements that return the right value for each archive.
    # The numbers used now are estimates for the human genome, and so are big enough
    # for the mouse genome, so ok for now.
    # But this is now also used for the mutation resource files, so this really needs a FIX.
    # FIX --
    if ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA:
        return _NumBytesNeededForSourceDataExtraction
    # Anything else is assumed to be a plug-n-play archive.
    return _NumBytesNeededForPlugNPlayExtraction
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
329
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def bytes_needed_to_build(source_data_filepath):
    # Returns the number of bytes to allow for building a library from source data.
    # FIX - This should be replaced by statements that return the right value for each archive.
    # The same estimate, meant to be the largest size needed, is returned whatever
    # source_data_filepath is. Also, it is probably not correct.
    build_space_estimate = _NumBytesNeededForBuild
    return build_space_estimate
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
334
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def create_success_file(full_file_path, contents=None):
    # Writes the file that indicates success.
    #   full_file_path - the path of the file to write. It is not expected to
    #                    exist beforehand, but if it does it is overwritten.
    #   contents       - text to put into the file. When it is None, nothing is
    #                    written, but the file is still created.
    # If the file cannot be created, a message is printed and the IOError is re-raised.
    try:
        with open(full_file_path, "w") as marker:
            if contents is None:
                # Opening the file was all that was needed to create it.
                pass
            else:
                marker.write(contents)
    except IOError:
        failure_note = "The success indication file could not be created: " + \
                       "{:s}".format(full_file_path)
        print(failure_note)
        raise
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
350
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def download_file_from_url(file_url, dest_dir, resume_download=True):
    # Some of the code used in this procedure was downloaded and modified for our needs.
    # That code was at: http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
    #
    # Given a file_url, downloads that file to dest_dir and returns the full path of the downloaded file.
    #   file_url        - must specify a file to download, because the destination filename is
    #                     taken from the end of the url's path.
    #   dest_dir        - it is best to fully specify it. Otherwise it is opened relative to
    #                     whatever the current working directory is.
    #   resume_download - if True (the default) and the destination file already exists, only the
    #                     remainder is requested (with a Range header) and appended to it, as is
    #                     wanted when a previous download was interrupted.
    #                     If False, any existing download of the file is overwritten.
    # Raises OSError if the device of dest_dir has fewer free bytes than remain to be downloaded.
    # Raises IOError if the download fails, or if the size of the destination file afterwards
    # differs from the size reported for the source file.

    DOWNLOAD_BLOCK_SIZE = 1048576 # 1 MB. Earlier values were 8192 (8KB) and 65536 (64KB).
    download_complete = False
    existing_size = 0
    bytes_read = 0
    response_headers = {}
    file_retriever = resumable_URL_opener()
    dest_filename = os.path.basename(file_url)
    dest_fullpath = os.path.join(dest_dir, dest_filename)
    source_filesize = size_of_file_at(file_url)
    print("Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize))
    print("Destination file for the download is {:s}".format(dest_fullpath))

    # If the file exists and resume_download is requested, then only download the remainder.
    if resume_download and os.path.exists(dest_fullpath):
        existing_size = os.path.getsize(dest_fullpath)
        print("The destination file exists and is {:d} bytes in size.".format(existing_size))
        if source_filesize == existing_size:
            # We already have the whole thing, so don't download again.
            print("The file has already been completely downloaded:\n\t{:s}".format(dest_fullpath))
            download_complete = True
        else:
            header = ("Range", "bytes={:s}-".format(str(existing_size)))
            # The header is a tuple, so it needs str() before it can be formatted with {:s}.
            print("Adding header to resume download:\n\t{:s}".format(str(header)))
            file_retriever.addheader(*header)
        # Appending leaves the bytes that were already downloaded in place.
        open_mode = "ab"
    else:
        if os.path.exists(dest_fullpath):
            print("resume_download is set to False. Download will overwrite an existing file.")
        else:
            print("The destination file does not exist yet.")
        existing_size = 0
        open_mode = "wb"

    # The destination is opened even if the download is already complete; in that case
    # nothing is written to it, and in append mode its contents are left untouched.
    with open(dest_fullpath, open_mode) as output_file:
        try:
            # Check whether there is enough space on the device for the rest of the file to download.
            statvfs = os.statvfs(dest_dir)
            # num_avail_bytes is the number of free bytes that ordinary users
            # are allowed to use (excl. reserved space)
            num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
            # Perhaps should subtract some padding amount from num_avail_bytes
            # rather than raising only if there is less than exactly what is needed.
            if num_avail_bytes < (source_filesize - existing_size):
                raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
                              " on the device of the destination directory for the download: " + \
                              "{:s}".format(os.path.realpath(dest_dir)))

            # The url is opened even when the download is already complete, so that the
            # response headers can still be printed below. closing() makes sure the
            # connection is released even if a read or a write fails part way through.
            with contextlib.closing(file_retriever.open(file_url)) as source_file:
                response_headers = source_file.headers
                while not download_complete:
                    data = source_file.read(DOWNLOAD_BLOCK_SIZE)
                    if data:
                        output_file.write(data)
                        bytes_read = bytes_read + len(data)
                    else:
                        download_complete = True
        except IOError:
            print("Error while attempting to download {:s}".format(file_url))
            raise

    for k, v in response_headers.items():
        print("{} = {}".format(k, v))
    print("Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url)))
    dest_filesize = os.path.getsize(dest_fullpath)
    print("{:s} {:s}".format(str(dest_filesize), str(dest_fullpath)))
    if source_filesize != dest_filesize:
        raise IOError("Download error:\n\t" + \
                      "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) + \
                      "and the destination file\n\t\t{:d}\t{:s}\n\t".format(dest_filesize, dest_fullpath) + \
                      "are different sizes.")
    return dest_fullpath
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
433
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
434 def ensure_we_can_write_numbytes_to(destination, numbytes):
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
435 # Attempts to create the destination directory if it does not exist.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
436 # Tests whether a file can be written to that directory.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
437 # Tests whether there is numbytes space on the device of the destination.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
438 # Raises errors if it cannot do any of the above.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
439 #
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
440 # Returns the full specification of the destination path.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
441 # We want to make sure that destination is an absolute fully specified path.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
442 cannonical_destination = os.path.realpath(destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
443 if os.path.exists(cannonical_destination):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
444 if not os.path.isdir(cannonical_destination):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
445 raise ValueError("The destination is not a directory: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
446 "{:s}".format(cannonical_destination))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
447 # else all is good. It is a directory.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
448 else:
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
449 # We need to create it since it does not exist.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
450 try:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
451 os.makedirs(cannonical_destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
452 except os.error:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
453 print "ERROR: Trying to create the following directory path:"
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
454 print "\t{:s}".format(cannonical_destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
455 raise
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
456 # Make sure the directory now exists and we can write to it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
457 if not os.path.exists(cannonical_destination):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
458 # It should have been created, but if it doesn't exist at this point
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
459 # in the code, something is wrong. Raise an error.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
460 raise OSError("The destination directory could not be created: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
461 "{:s}".format(cannonical_destination))
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
462 test_writing_filename = "{:s}.{:s}".format(os.path.basename(cannonical_destination), _Write_TestFile)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
463 test_writing_filepath = os.path.join(cannonical_destination, test_writing_filename)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
464 try:
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
465 with open(test_writing_filepath, "w") as test_writing_file:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
466 test_writing_file.write("Testing writing to this file.")
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
467 if os.path.exists(test_writing_filepath):
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
468 os.remove(test_writing_filepath)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
469 except IOError:
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
470 print "The destination directory could not be written into:\n\t" + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
471 "{:s}".format(cannonical_destination)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
472 raise
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
473 # Check whether there are numbytes available on cannonical_destination's device.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
474 statvfs = os.statvfs(cannonical_destination)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
475 # fs_size = statvfs.f_frsize * statvfs.f_blocks # Size of filesystem in bytes
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
476 # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree # Actual number of free bytes
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
477 num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail # Number of free bytes that ordinary users
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
478 # are allowed to use (excl. reserved space)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
479 if (num_avail_bytes < numbytes):
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
480 raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
481 " on the device of the destination directory:\n\t" + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
482 "{:s}\n\t{:d} bytes are needed.".format(cannonical_destination, numbytes))
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
483
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
484 return cannonical_destination
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
485
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def download_genome_archive(source_url, destination, force_new_download=False):
    # Download, but do not extract, the archive at source_url into destination.
    #
    # This function can be called on a file whose download was interrupted:
    # unless force_new_download is True, download_file_from_url() is called with
    # resume_download=True so the transfer can continue rather than start over.
    # After the transfer, the md5sum of the downloaded file is compared with the
    # first field of the first line of the file found at "<source_url>.md5".
    # When they agree, a download success file is written into the destination;
    # a later call that finds that file skips the download entirely.
    #
    # Input Parameters
    #     source_url is the full URL of the file we want to download.
    #         It should look something like:
    #         https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    #         If it has no scheme (e.g. only a filename is given), it is joined
    #         onto _CTAT_ResourceLib_URL.
    #     destination is the location (directory) where the file will be placed.
    #         Relative paths are expanded using the current working directory, so
    #         within Galaxy it is best to send in an absolute, fully specified path.
    #     force_new_download, if True, causes a new download to occur even if the
    #         download success file from a previous download is present.
    #
    # Returns the full path of the downloaded file inside the destination directory.
    #
    # Raises
    #     IOError if the md5 file cannot be read or is empty, or if the md5sums
    #         do not match.
    #     OSError or ValueError from ensure_we_can_write_numbytes_to() when the
    #         destination is unusable or has too little space available.
    #     find_genome_name_in_path() is called with raise_error=True, so an error
    #         is also raised if source_url does not have a genome name in it.

    dest_fullpath = None
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
        source_url = urlparse.urljoin(_CTAT_ResourceLib_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.

    print("Downloading:\n\t{:s}".format(str(source_url)))
    print("to:\n\t{:s}".format(destination))
    # The next is done so that if the source_url does not have a genome name in it, an error will be raised.
    find_genome_name_in_path(source_url, raise_error=True)
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url))

    # Get the list of files in the directory,
    # We use it to check for a previous download.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))
    download_success_filename = "{:s}.{:s}".format(source_filename, _DownloadSuccessFile)
    download_success_full_file_path = os.path.join(cannonical_destination, download_success_filename)
    previously_downloaded = download_success_filename in orig_files_in_destdir

    if force_new_download or not previously_downloaded:
        if previously_downloaded:
            # Since we are redoing the download, the success file needs to be
            # removed until the download has succeeded.
            os.remove(download_success_full_file_path)
        # The following raises an error if the download fails for some reason.
        dest_fullpath = download_file_from_url(source_url, cannonical_destination,
                                               resume_download=(not force_new_download))
        # Check the md5sum of the downloaded file to ensure the data in the file is correct.
        file_retriever = resumable_URL_opener()
        md5_url = "{:s}.md5".format(source_url)
        print("Checking the md5sum of the downloaded file.")
        try:
            # NOTE(review): if resumable_URL_opener derives from urllib's
            # URLopener, the second positional argument of open() is the POST
            # data, not a file mode - confirm and drop the "r" if so.
            md5_file = file_retriever.open(md5_url, "r")
            try:
                md5_lines = md5_file.readlines()
            finally:
                # Close the handle even when reading fails.
                md5_file.close()
            md5_fields = md5_lines[0].split() if md5_lines else []
            if not md5_fields:
                # Report an empty md5 file as a download problem (IOError)
                # instead of letting an IndexError escape.
                raise IOError("Download error:\n\t" +
                              "The md5 file is empty:\n\t\t({:s})".format(md5_url))
            md5sum_from_web = md5_fields[0]
            md5sum_from_file = md5sum_for(dest_fullpath)
        except IOError:
            print("Error while attempting to check the md5sum for {:s}".format(dest_fullpath))
            raise
        if md5sum_from_web != md5sum_from_file:
            raise IOError("Download error:\n\t" +
                          "The md5 sum for\n\t\t({:s})\n\t".format(dest_fullpath) +
                          "does not match the value read from the web:\n\t\t" +
                          "({:s} != {:s})".format(md5sum_from_file, md5sum_from_web))
        print("Check of md5sum succeeded.")
        create_success_file(download_success_full_file_path,
                            "Download of:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(dest_fullpath))
    else:
        # A previous download succeeded and a new one was not forced.
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_full_file_path))
        print("Remove the file or set <Force New Download> if you want a new download to occur.")
        dest_filename = os.path.basename(source_url)
        dest_fullpath = os.path.join(cannonical_destination, dest_filename)

    # Some code to help us if errors occur.
    print("\n*******************************")
    print("* Finished download. *")
    print_directory_contents(cannonical_destination, 1)
    print("*******************************\n")

    return dest_fullpath
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
576
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def extract_archive(archive_filepath, destination, force_new_extraction=False):
    # Extract the archive at archive_filepath into destination using tarfile.
    #
    # The extraction is skipped when the destination already holds the file
    # "<archive basename>.<_ExtractionSuccessFile>" left by an earlier run,
    # unless force_new_extraction is True. When an extraction is redone, that
    # success file is removed first.
    #
    # This procedure never writes the extraction success file itself, because
    # some error checking depends on what was extracted. The calling procedure
    # can/should write the success file after doing its own error checking.
    #
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only use this on archives from a trusted source.
    target_dir = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))

    # Name and location of the marker that records a prior successful extraction.
    marker_name = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    marker_path = os.path.join(target_dir, marker_name)

    extracted_before = marker_name in set(os.listdir(target_dir))
    if extracted_before and not force_new_extraction:
        # The archive was successfully extracted before so we do not do it again.
        print("The extraction success file exists, so no new extraction was attempted:")
        print("\t{:s}".format(marker_name))
        print("Remove the success file or set <force new extraction> if you want a new extraction to occur.")
    else:
        if extracted_before:
            # The marker must not claim success while the extraction is redone.
            os.remove(marker_path)
        with tarfile.open(archive_filepath, mode="r:*") as tar_handle:
            tar_handle.extractall(path=target_dir)

    # Some code to help us if errors occur.
    print("\n*******************************************************")
    print("* Finished extraction. Destination directory listing. *")
    print_directory_contents(target_dir, 1)
    print("*******************************************************\n")
    return
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
616
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
    # Extract a CTAT Genome Reference Library archive file.
    #
    # Input Parameters
    #     archive_filepath is the archive to extract. It is best if it is an
    #         absolute, fully specified filepath, not a relative one.
    #     destination is the directory to which the archive will be extracted.
    #     force_new_extraction can be used to cause extraction to occur, even if
    #         the file was extracted before.
    #     keep_archive, if False, causes the archive file to be removed once the
    #         extracted directory has been found.
    #
    # Returns extracted_directory
    #     The full path of the top level directory that is
    #     created by the extraction of the files from the archive.
    #
    # Raises
    #     ValueError if the extracted directory cannot be found in the destination.
    #     Errors raised by ensure_we_can_write_numbytes_to(), extract_archive() and
    #     find_genome_name_in_path() (called with raise_error=True) are not caught.

    print("Extracting:\n\t {:s}".format(str(archive_filepath)))
    print("to:\n\t{:s}".format(destination))
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
    # Get the root filename of the Genome Directory from the source file's name.
    # That should also be the name of the extracted directory.
    genome_dirname = find_genome_name_in_path(archive_filepath, raise_error=True)

    # Compare the directory listing before and after to see what was created.
    orig_files_in_destination = set(os.listdir(cannonical_destination))
    extract_archive(archive_filepath, destination, force_new_extraction)
    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destination

    if genome_dirname not in newfiles_in_destdir:
        # Perhaps it has a different name than what we expect it to be.
        # It will be a sub-directory that was not in the directory
        # before we did the extraction.
        found_filename = None
        if len(newfiles_in_destdir) == 1:
            # A set cannot be indexed, so take its only element via an iterator.
            found_filename = next(iter(newfiles_in_destdir))
        else:
            # In most cases there will only be one new entry, but some OS's might
            # have created other files in the directory. The correct entry's name
            # should be a substring of the archive's file name, and it must be a
            # directory. Sorting keeps the choice independent of set ordering.
            archive_filename = os.path.basename(archive_filepath)
            for filename in sorted(newfiles_in_destdir):
                if filename in archive_filename and \
                        os.path.isdir(os.path.join(cannonical_destination, filename)):
                    found_filename = filename
                    break
        if found_filename is not None:
            genome_dirname = found_filename

    extracted_directory = os.path.join(cannonical_destination, genome_dirname)
    if not os.path.exists(extracted_directory):
        raise ValueError("ERROR: Could not find the extracted directory in the destination directory:" +
                         "\n\t{:s}".format(cannonical_destination))

    # Record the success so that a later call can skip the extraction.
    extraction_success_filename = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    extraction_success_full_file_path = os.path.join(cannonical_destination, extraction_success_filename)
    create_success_file(extraction_success_full_file_path,
                        "Extraction of:\n\t{:s}\n".format(archive_filepath) +
                        "to:\n\t{:s}\nsucceeded.".format(extracted_directory))

    if not keep_archive:
        # We are done extracting, so remove the archive file.
        if os.path.exists(archive_filepath):
            print("Removing the archive file:\n\t{:s}".format(archive_filepath))
            os.remove(archive_filepath)
        # else: It was removed previously, so we don't need to remove it again.
    return extracted_directory
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
676
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def gmap_the_library(genome_build_directory, force_new_gmap=False):
    """Run gmap_build on a genome library, as needed for gmap-fusion to work.

    genome_build_directory is the directory holding ref_genome.fa. It should
    normally be a fully specified path, though a relative path also works.
    force_new_gmap requests a new gmap_build run even if the success file
    from an earlier run is present.

    The success file is named
        <basename of genome_build_directory>.<_GmapSuccessFile>
    and lives inside genome_build_directory. It is removed before a forced
    re-run and recreated (via create_success_file) only after gmap_build
    returns successfully.

    Raises subprocess.CalledProcessError (re-raised after printing an error
    message) when the gmap_build command exits with a non-zero status.
    """
    # shlex.quote only exists in Python 3; Python 2 offers the same function
    # as pipes.quote. Quoting leaves ordinary paths untouched, but keeps the
    # shell command valid when the path contains spaces or metacharacters.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # Create the name of the file used to indicate prior success of gmap.
    gmap_success_filename = "{:s}.{:s}".format(os.path.basename(genome_build_directory), _GmapSuccessFile)
    gmap_success_full_file_path = os.path.join(genome_build_directory, gmap_success_filename)

    orig_files_in_build_dir = set(os.listdir(genome_build_directory))
    previously_succeeded = gmap_success_filename in orig_files_in_build_dir

    if previously_succeeded and not force_new_gmap:
        print("The gmap success file exists, so no gmap is being attempted:")
        print("\t{:s}".format(gmap_success_full_file_path))
        print("Remove the file or set <force new gmap> if you want a new gmap to occur.")
        return

    if previously_succeeded:
        # Since we are redoing the gmap, the success file needs to be removed
        # until the gmap has succeeded.
        os.remove(gmap_success_full_file_path)

    # The command prints messages out to stderr, even when there is not an error,
    # so route stderr to stdout. Otherwise, galaxy thinks an error occurred.
    command = "gmap_build -D {:s} -d ref_genome.fa.gmap -k 13 {:s} 2>&1".format(
        quote("{:s}/".format(genome_build_directory)),
        quote("{:s}/ref_genome.fa".format(genome_build_directory)))
    try:  # to send the gmap_build command.
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        raise
    finally:
        # Some code to help us if errors occur.
        # This runs whether or not gmap_build succeeded.
        print("\n*******************************\nAfter running gmap_build.")
        print_directory_contents(genome_build_directory, 2)
        print("*******************************\n")

    # Only reached when gmap_build returned successfully.
    create_success_file(gmap_success_full_file_path,
                        "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    return
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
718
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
719
32
9b7dc7d09fda Fixing some indentation errors in build_the_library.
trinity_ctat
parents: 31
diff changeset
def build_the_library(genome_source_directory, genome_build_directory, force_new_build=False, gmap_build=False):
    r"""Build a CTAT Genome Resource Library from its source data.

    genome_source_directory is the location of the source_data needed to build the library.
        Normally it is fully specified, but could be relative.
    genome_build_directory is the location where the library will be built.
        It can be relative to the current working directory or an absolute path.
    force_new_build specifies whether to run prep_genome_lib.pl even if it was run before.
    gmap_build specifies whether to run gmap_build or not.

    Side effects visible in this function:
        - When a build is performed, the current working directory is changed
          to the source directory (prep_genome_lib.pl is given relative file names).
        - A build success file is written into the source directory after
          prep_genome_lib.pl succeeds.
        - When gmap_build is set, a gmap success file is written into the
          destination directory, using the same name that gmap_the_library()
          looks for, so that a later run does not repeat the gmap.

    Raises ValueError if the source directory is missing, and re-raises
    subprocess.CalledProcessError if prep_genome_lib.pl exits with an error.

    Following was the old way to do it. Before FusionFilter 0.5.0.
    prep_genome_lib.pl \
       --genome_fa ref_genome.fa \
       --gtf ref_annot.gtf \
       --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
       --fusion_annot_lib fusion_lib.dat.gz
       --output_dir ctat_genome_lib_build_dir
    index_pfam_domain_info.pl  \
        --pfam_domains PFAM.domtblout.dat.gz \
        --genome_lib_dir ctat_genome_lib_build_dir
    gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa
    """
    # shlex.quote only exists in Python 3; Python 2 offers the same function
    # as pipes.quote. Quoting leaves ordinary paths untouched, but keeps the
    # shell command valid when a path contains spaces or metacharacters.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if (genome_source_directory is None) or (genome_source_directory == "") or not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " +
            "The source directory does not exist:\n\t{:s}".format(str(genome_source_directory)))
    # Resolve the source directory now: we chdir into it below, after which a
    # relative path would no longer point at the right place.
    source_directory = os.path.abspath(genome_source_directory)
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory,
                                                             bytes_needed_to_build(genome_source_directory))
    print("Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory)))
    print("The Destination directory is at:\n\t{:s}".format(str(cannonical_destination)))

    # Get the root filename of the Genome Directory.
    # Taken from the argument as given, so the success file keeps the name
    # that earlier runs used.
    src_filename = os.path.basename(genome_source_directory)
    # See whether the library has been built already. The success file is written into the source directory.
    files_in_sourcedir = set(os.listdir(source_directory))
    build_success_filename = "{:s}.{:s}".format(src_filename, _LibBuiltSuccessFile)
    build_success_file_path = os.path.join(source_directory, build_success_filename)
    previously_built = build_success_filename in files_in_sourcedir

    if previously_built and not force_new_build:
        print("The build success file exists, so no build is being attempted:")
        print("\t{:s}".format(build_success_file_path))
        print("Remove the file or set <force new build> if you want a new build to occur.")
        # We might still need to do a gmap_build.
        if gmap_build:
            print("Checking if we need to gmap the library.")
            gmap_the_library(cannonical_destination, force_new_build)
            # gmap_the_library creates a gmap success file if it succeeds.
        return

    os.chdir(source_directory)
    if previously_built:
        # Since we are redoing the build, the success file needs to be removed
        # until the build has succeeded.
        os.remove(build_success_file_path)

    # Create the command that builds the Genome Resource Library from the source data.
    command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
              "--pfam_db PFAM.domtblout.dat.gz " + \
              "--output_dir {:s} ".format(quote(cannonical_destination))

    # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
    # We only check the prefix, in case other versions are used later.
    # There should be only one in the directory, but if there are more than one,
    # the later one, alphabetically, is used. os.listdir() returns names in
    # arbitrary order, so the candidates are sorted to make that true.
    fusion_lib_candidates = sorted(
        name for name in os.listdir(source_directory)
        if name.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix)
    if fusion_lib_candidates:
        # The mouse genomes do not have a fusion_annot_lib
        # so only add the following for Human genomes.
        command += "--fusion_annot_lib {:s} ".format(quote(fusion_lib_candidates[-1])) + \
                   "--annot_filter_rule AnnotFilterRule.pm "
    if gmap_build:
        command += "--gmap_build "
    # Send stderr of the command to stdout, because some functions may write to stderr,
    # even though no error has occurred. We will depend on error code return in order
    # to know if an error occurred.
    command += " 2>&1"
    print("About to run the following command:\n\t{:s}".format(command))
    try:  # to send the prep_genome_lib command.
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the prep_genome_lib.pl command " +
              "on the CTAT Genome Resource Library:\n\t{:s}".format(command))
        raise
    finally:
        # Some code to help us if errors occur.
        # This runs whether or not the build succeeded.
        print("\n*******************************")
        print("Contents of Genome Source Directory {:s}:".format(genome_source_directory))
        print_directory_contents(source_directory, 2)
        print("\nContents of Genome Build Directory {:s}:".format(cannonical_destination))
        print_directory_contents(cannonical_destination, 2)
        print("*******************************\n")

    # Only reached when prep_genome_lib.pl returned successfully.
    create_success_file(build_success_file_path,
                        "Build of:\n\t{:s}\n".format(genome_source_directory) +
                        "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    if gmap_build:
        # Create the gmap success file. It must carry the name that
        # gmap_the_library() checks for -- the basename of the build
        # directory, not of the source directory -- otherwise a later run
        # would not find it and would repeat the gmap.
        gmap_success_filename = "{:s}.{:s}".format(os.path.basename(cannonical_destination), _GmapSuccessFile)
        gmap_success_full_file_path = os.path.join(cannonical_destination, gmap_success_filename)
        create_success_file(gmap_success_full_file_path,
                            "gmap of:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    return
    # End of build_the_library()
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
825
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def find_path_to_mutation_lib_integration():
    """Return the expected path of ctat-mutation-lib-integration.py.

    The location is derived from where the ctat_mutations command is found:
    two directory levels up from the command, then into "share", then into
    the entry there whose name contains "ctat-mutations". This assumes we
    are running inside a conda environment laid out that way.

    The returned path is composed, not checked for existence.

    Raises ValueError if ctat_mutations cannot be located, or if the share
    directory has no entry containing "ctat-mutations".
    """
    ctat_mutations_executable = which("ctat_mutations")
    if ctat_mutations_executable in (None, ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resource processing.")

    # <env root>/bin/ctat_mutations  ->  <env root>/share
    environment_root = os.path.dirname(os.path.dirname(ctat_mutations_executable))
    share_dir = os.path.join(environment_root, "share")

    matching_entries = [entry for entry in os.listdir(share_dir) if "ctat-mutations" in entry]
    if not matching_entries:
        raise ValueError("Unable to find the home of ctat_mutations.\n" + \
            "It should be in the share directory:\n\t{:s}.".format(share_dir))

    # If several entries match, the last one listed is used.
    return os.path.join(share_dir,
                        matching_entries[-1],
                        "mutation_lib_prep",
                        "ctat-mutation-lib-integration.py")
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
848
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def find_path_to_picard_home():
    """Return the value to use for PICARD_HOME.

    The ctat_mutations shell script is expected to define PICARD_HOME in an
    "export ... PICARD_HOME=..." line, so that file is read first. If no
    usable value is found there, the "share" directory two levels up from the
    ctat_mutations command is searched for an entry whose name contains
    "picard".

    Raises ValueError if ctat_mutations cannot be located, or if neither the
    script nor the share directory yields a value.
    """
    picard_home = None
    path_to_ctat_mutations = which("ctat_mutations")
    if (path_to_ctat_mutations is None) or (path_to_ctat_mutations == ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resources processing.")
    # The ctat_mutations shell script defines PICARD_HOME. We just need to get it out of that file.
    # Use a context manager so the file is closed even if reading fails.
    with open(path_to_ctat_mutations, "r") as ctat_mutations_file:
        for line in ctat_mutations_file:
            if ("export" in line) and ("PICARD_HOME=" in line):
                # Take everything after "PICARD_HOME=" (so an "=" inside the value,
                # or another assignment earlier on the line, cannot confuse us)
                # and strip off the newline at the end of string.
                # Then strip off quotes at begin and end if they are there.
                # And then strip off any other whitespace that might have been inside of stripped off quotes.
                # If several lines match, the last one wins.
                picard_home = line.split("PICARD_HOME=", 1)[1].strip().strip('\"').strip()
    if (picard_home is None) or (picard_home == ""):
        # We didn't find it in the ctat_mutations file. Search for it.
        conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
        share_dir = os.path.join(conda_root_dir, "share")
        for filename in os.listdir(share_dir):
            if "picard" in filename:
                picard_home = os.path.join(share_dir, filename)
        if (picard_home is None) or (picard_home == ""):
            raise ValueError("Unable to find PICARD_HOME.\n" +
                "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return picard_home
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
873
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def download_and_integrate_mutation_resources(source_url, genome_build_directory, cosmic_resources_location=None,
                                              force_new_download=False, force_new_integration=False):
    r"""Download a mutation resource archive and integrate it into a CTAT genome library.

    The work is done in two restartable stages. Each stage writes a
    "success file" into the destination directory when it completes, and is
    skipped on later calls while that file exists (unless forced).

    Parameters:
        source_url: URL of the mutation resources archive to download. If it
            has no scheme (e.g. it is just a filename), it is joined onto
            _CTAT_Mutation_URL.
        genome_build_directory: Directory of the genome library build, into
            which the archive is downloaded. It is assumed that this function
            is only called with a valid genome_build_directory.
        cosmic_resources_location: Directory holding the two COSMIC files
            (_COSMIC_Mutant_Filename and _COSMIC_Coding_Filename). If None or
            empty, the files are expected in the destination directory.
        force_new_download: If True, download the archive again even if the
            download success file exists (and do not resume a partial download).
        force_new_integration: If True, run the integration again even if the
            integration success file exists.

    Returns:
        None.

    Raises:
        IOError: if the download fails, or if either COSMIC file is missing.
        subprocess.CalledProcessError: if the integration command fails.

    Background, from https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep

    Step 1 (after CTAT Genome Resource Library is built)
        download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018
        or
        download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018
        or
        download mc-7.tar.gz into Mouse_M16_CTAT_lib_Feb202018
        (Need to ask about support for mouse, since there is no info about
        Cosmic mouse genome files in the instructions.)

    Step 2: Cosmic files download - User must perform this step prior to
    running this code. We check if files are present.

        Next download COSMIC resources required in this directory. Depending
        on the version of genome you need you can install either COSMIC's hg38
        or COSMIC's hg19. You will need to download 2 sets of files: COSMIC
        Mutation Data (CosmicMutantExport.tsv.gz) and COSMIC Coding Mutation
        VCF File (CosmicCodingMuts.vcf.gz). Please note, for download to
        succeed you will need to register and login to their service.

        Open question: is there a way the user can give their credentials
        through the Data Manager interface as a part of specifying Mutation
        parameters, so that the files can be downloaded programmatically? Or
        should the interface instead carry the instructions for the user to
        download the files, and then the user specifies the absolute path to
        where those files are?

    Step 3: Mutation lib integration

        Once you have downloaded CosmicMutantExport.tsv.gz AND
        CosmicCodingMuts.vcf.gz (hg38 or hg19), proceed with the mutation lib
        integration step, which will integrate the mutation resource with
        CTAT_GENOME_LIB (this corresponds to "GRCh37_v19_CTAT_lib_Feb092018"
        or "GRCh38_v27_CTAT_lib_Feb092018" downloaded in Step 1). You will
        find this script in the ctat-mutations repo in the 'src' directory.

        #Keep Picard in PICARD_HOME environmental variable like so
        export PICARD_HOME=/path/to/picard

        #Integrate CTAT mutations lib with CTAT genome library
        python ctat-mutations/mutation_lib_prep/ctat-mutation-lib-integration.py \
            --CosmicMutantExport CosmicMutantExport.tsv.gz \
            --CosmicCodingMuts CosmicCodingMuts.vcf.gz \
            --genome_lib_dir GRCh37_v19_CTAT_lib_Feb092018/ # OR GRCh38_v27_CTAT_lib_Feb092018/

        Now you are all set to run the ctat-mutations pipeline
    """
    print("\n***********************************")
    print("* Integrating Mutation Resources. *")
    print("***********************************\n")
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_Mutation_URL.
        source_url = urlparse.urljoin(_CTAT_Mutation_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.
    # presumably this returns the canonical path of genome_build_directory
    # after checking for free space - verify against the helper.
    canonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory,
                                                            _NumBytesNeededForMutationResources)
    print("Download and Integrate a Mutation Resource Archive.")
    print("The source URL is:\n\t{:s}".format(str(source_url)))
    print("The destination is:\n\t{:s}".format(str(canonical_destination)))
    # Snapshot the directory contents before doing anything.
    # Both stages check this snapshot for their success file.
    orig_files_in_destdir = set(os.listdir(canonical_destination))

    # DOWNLOAD SECTION
    download_success_file = "{:s}.{:s}".format(source_filename, _MutationDownloadSuccessFile)
    download_success_file_path = os.path.join(canonical_destination, download_success_file)
    previously_downloaded = download_success_file in orig_files_in_destdir
    if (not previously_downloaded) or force_new_download:
        if previously_downloaded:
            # Since we are redoing the download, the success file needs
            # to be removed until the download has succeeded.
            os.remove(download_success_file_path)
        # The following raises an IOError if the download fails for some reason.
        download_file_from_url(source_url, canonical_destination,
                               resume_download=(not force_new_download))
        create_success_file(download_success_file_path,
                            "Download of the mutation resource archive:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(canonical_destination))
    else:
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_file_path))
        print("Remove the file or set <new_mutation_download> if you want a new download to occur.")

    # INTEGRATION SECTION
    integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile)
    integration_success_file_path = os.path.join(canonical_destination, integration_success_file)
    previously_integrated = integration_success_file in orig_files_in_destdir
    if (not previously_integrated) or force_new_integration:
        if previously_integrated:
            # Since we are redoing the integration, the success file needs
            # to be removed until the integration has succeeded.
            os.remove(integration_success_file_path)
        mutation_lib_dirpath = os.path.join(canonical_destination, _CTAT_MutationLibDirname)
        # If we do not remove the directory, then the old files will exist and a new integration does not occur.
        # Also, with the Cosmic files, when the integrated file is created, if there is a previous one, gzip
        # asks a question of the user, and this program is not prepared to respond to a question from a subprocess:
        # [bgzip] /path/to/ctat_mutation_lib/cosmic.vcf.gz already exists; do you wish to overwrite (y or n)?
        if os.path.exists(mutation_lib_dirpath):
            shutil.rmtree(mutation_lib_dirpath)
        # Check for Cosmic resources. User has to place these files into the correct location.
        if (cosmic_resources_location is None) or (cosmic_resources_location == ""):
            cosmic_resources_loc_full_path = canonical_destination
            end_err_msg = "These files must be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
        else:
            cosmic_resources_loc_full_path = os.path.realpath(cosmic_resources_location)
            end_err_msg = "This function was told they would be placed into:\n\t{:s}".format(
                cosmic_resources_loc_full_path)
        cosmic_mutant_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Mutant_Filename)
        cosmic_coding_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Coding_Filename)
        if not (os.path.exists(cosmic_mutant_full_path) and os.path.exists(cosmic_coding_full_path)):
            raise IOError("Either one or both of Cosmic Resources are missing:\n\t" +
                          "{:s}\nand/or\n\t{:s}\n".format(cosmic_mutant_full_path, cosmic_coding_full_path) +
                          "Unable to integrate mutation resources.\n{:s}".format(end_err_msg))
        # Create the integration command. We also must define PICARD_HOME for the command to work.
        picard_home = find_path_to_picard_home()
        integration_command = find_path_to_mutation_lib_integration()
        # NOTE(review): the command is run through the shell and the paths are
        # not quoted, so a path containing whitespace or shell metacharacters
        # will break it.
        command = "export PICARD_HOME={:s} && python {:s} ".format(picard_home, integration_command) + \
                  "--CosmicMutantExport {:s} ".format(cosmic_mutant_full_path) + \
                  "--CosmicCodingMuts {:s} ".format(cosmic_coding_full_path) + \
                  "--genome_lib_dir {:s}".format(canonical_destination)
        try:  # to send the ctat-mutation-lib-integration command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command))
            raise
        finally:
            # Show the resulting directory contents whether or not the
            # command succeeded, to help diagnose errors.
            print("\n*********************************************************")
            print("* After download and integration of Mutation Resources. *")
            print_directory_contents(canonical_destination, 2)
            print("*********************************************************\n")
        # Only reached when the integration command succeeded.
        create_success_file(integration_success_file_path,
                            "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    else:
        print("The mutation resources integration success file exists, so no integration is being attempted:")
        print("\t{:s}".format(integration_success_file_path))
        print("Remove the file or set <new_mutation_integration> if you want a new integration to occur.")
    return
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1014
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1015 def search_for_genome_build_dir(top_dir_path):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1016 # If we do not download the directory, the topdir_path could be the
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1017 # location of the genome resource library, but we also want to allow the
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1018 # user to give the same value for top_dir_path that they do when a
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1019 # build happens, so we need to handle all three cases:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1020 # 1) Is the top_dir_path the build directory,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1021 # 2) or is it inside of the given directory,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1022 # 3) or is it inside a subdirectory of the given directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1023 # The source_data downloads are built to a directory named _CTAT_Build_dirname,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1024 # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1025 # We also look for the genome name and return that, if we find it in the
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1026 # directory name of the directory holding the build directory.
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1027 top_dir_full_path = os.path.realpath(top_dir_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1028 genome_build_directory = None
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1029 genome_name_from_dirname = None
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1030 print_warning = False
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1031
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1032 if not os.path.exists(top_dir_full_path):
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1033 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1034 "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1035 elif not os.path.isdir(top_dir_full_path):
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1036 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1037 "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1038 if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1039 print "Build directory is: {:s}".format(top_dir_full_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1040 # The top_dir_path is the path to the genome_build_directory.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1041 genome_build_directory = top_dir_full_path
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1042 else:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1043 # Look for it inside of the top_dir_path directory.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1044 print "Looking inside of: {:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1045 top_dir_contents = os.listdir(top_dir_full_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1046 if (_CTAT_Build_dirname in top_dir_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1047 # The genome_build_directory is inside of the top_dir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1048 print "1. Found it."
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1049 genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1050 else:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1051 # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1052 # Look down the directory tree two levels.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1053 build_dirs_in_subdirs = list()
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1054 subdirs_with_genome_files = list()
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1055 build_dirs_in_sub_subdirs = list()
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1056 sub_subdirs_with_genome_files = list()
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1057 subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1058 for subdir in subdirs:
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1059 subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1060 subdir_path_contents = os.listdir(subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1061 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1062 if (_CTAT_Build_dirname in subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1063 # The genome_build_directory is inside of the subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1064 print "2a, Found one."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1065 build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1066 if (_CTAT_RefGenome_Filename in subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1067 subdirs_with_genome_files.append(subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1068 # Since we are already looping, loop through all dirs one level deeper as well.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1069 sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1070 for sub_subdir in sub_subdirs:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1071 sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1072 sub_subdir_path_contents = os.listdir(sub_subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1073 # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1074 if (_CTAT_Build_dirname in sub_subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1075 # The genome_build_directory is inside of the sub_subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1076 print "3a. Found one."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1077 build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1078 if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1079 sub_subdirs_with_genome_files.append(sub_subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1080 # Hopefully there is one and only one found build directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1081 # If none are found we check for a directory containing the genome reference file,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1082 # but the build process sometimes causes more than one directory to have a copy,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1083 # so finding that file is not a sure thing.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1084 if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1085 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1086 print "Found multiple CTAT Genome Resource Libraries " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1087 "in the given directory:\n\t{:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1088 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1089 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1090 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1091 "in the given directory:\n\t{:s}".format(top_dir_full_path))
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1092 elif len(build_dirs_in_subdirs) == 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1093 # The genome_build_directory is inside of the subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1094 print "2b, Found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1095 genome_build_directory = build_dirs_in_subdirs[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1096 elif len(build_dirs_in_sub_subdirs) == 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1097 # The genome_build_directory is inside of the subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1098 print "3b, Found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1099 genome_build_directory = build_dirs_in_sub_subdirs[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1100 elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1101 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1102 print "Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1103 "in the given directory:\n\t{:s}".format(top_dir_full_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1104 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1105 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1106 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1107 raise ValueError("Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1108 "in the given directory:\n\t{:s}".format(top_dir_full_path))
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1109 elif (len(sub_subdirs_with_genome_files) == 1):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1110 print "3c, Maybe found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1111 genome_build_directory = sub_subdirs_with_genome_files[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1112 print_warning = True
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1113 elif (len(subdirs_with_genome_files) == 1):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1114 print "2c, Maybe found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1115 genome_build_directory = subdirs_with_genome_files[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1116 print_warning = True
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1117 elif (_CTAT_RefGenome_Filename in top_dir_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1118 print "1c. Maybe found it."
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1119 genome_build_directory = top_dir_full_path
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1120 print_warning = True
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1121 else:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1122 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1123 print "Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1124 "in the given directory:\n\t{:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1125 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1126 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1127 raise ValueError("Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1128 "in the given directory:\n\t{:s}".format(top_dir_full_path))
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1129 # end else
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1130 # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1131 if (genome_build_directory is None):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1132 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1133 print "Cannot find the CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1134 "in the given directory:\n\t{:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1135 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1136 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1137 raise ValueError("Cannot find the CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1138 "in the given directory:\n\t{:s}".format(top_dir_full_path))
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1139 else:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1140 if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1141 print "\n***************************************"
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1142 print "\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1143 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1144 print_directory_contents(genome_build_directory, 2)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1145 print "***************************************\n"
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1146 if print_warning and genome_build_directory:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1147 print "\n***************************************"
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1148 print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1149 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1150 print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1151 print_directory_contents(genome_build_directory, 2)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1152 print "***************************************\n"
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1153 return genome_build_directory
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1154
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
def build_directory_from_build_location(src_filename, build_location):
    """Return the path of the CTAT build directory for a build location.

    The genome directory name is obtained by passing src_filename to
    find_genome_name_in_path(). The last component of build_location
    then decides the result:

    - it equals the genome directory name:
        build_location/_CTAT_Build_dirname
    - it equals _CTAT_Build_dirname:
        build_location itself, unchanged
    - anything else:
        build_location/<genome directory name>/_CTAT_Build_dirname

    Trailing path separators on build_location are ignored when its last
    component is examined, so "/data/GRCh38/" is treated the same as
    "/data/GRCh38".
    """
    genome_dir_name = find_genome_name_in_path(src_filename)
    # os.path.basename() returns '' for a path that ends in a separator,
    # which would make both comparisons below fail and cause the genome
    # directory name to be appended a second time. Strip trailing
    # separators before taking the basename.
    separators = os.sep + (os.altsep or "")
    location_name = os.path.basename(build_location.rstrip(separators))
    if location_name == genome_dir_name:
        # build_location is the genome directory; the build dir goes inside it.
        return os.path.join(build_location, _CTAT_Build_dirname)
    if location_name == _CTAT_Build_dirname:
        # build_location already is the build directory.
        return build_location
    # build_location is a parent directory; add the genome and build levels.
    return os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1165
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1166 def main():
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1167 #Parse Command Line. There are three basic ways to use this tool.
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1168 # 1) Download and Build the CTAT Genome Resource Library from an archive.
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1169 # 2) Build the library from source data files that are already downloaded.
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1170 # 3) Specify the location of an already built library.
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1171 # Any of these methods can incorporate or be followed by a gmap build.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1172 # Any of these methods can be followed by a mutation resources download and/or integration.
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1173 # Choose arguments for only one method.
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1174 # Do not use arguments in a mixed manner. I am not writing code to handle that at this time.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1175 parser = argparse.ArgumentParser()
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1176 # Arguments for all methods:
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1177 parser.add_argument('-o', '--output_filename', \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1178 help='Name of the output file, where the json dictionary will be written.')
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1179 parser.add_argument('-y', '--display_name',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1180 default='', \
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1181 help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1182 parser.add_argument('-g', '--gmap_build', \
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1183 help='Will do a gmap_build on the Genome Resource Library, if it has not previously been gmapped.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1184 action='store_true')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1185 parser.add_argument('-f', '--force_gmap_build', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1186 help='Will force gmap_build of the Genome Resource Library, even if previously gmapped.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1187 action='store_true')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1188 parser.add_argument('-m', '--download_mutation_resources_url',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1189 default='', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1190 help='Value should be the url of the zipped up mutation resources. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1191 'These are located at: https://data.broadinstitute.org/Trinity/CTAT/mutation/.' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1192 'Will download mutation resources and integrate them into the Genome Resource Library.' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1193 'Cosmic resources must previously have beeen downloaded (https://cancer.sanger.ac.uk/cosmic/download).' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1194 'Cosmic resources can be placed directly into the Genome Resource Library ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1195 'or you can set the --cosmic_resources_location argument.' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1196 'See https://github.com/NCIP/ctat-mutations/tree/no_sciedpiper/mutation_lib_prep for more info. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1197 'If a previous download and integration was not completed, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1198 'calling with this option set will attempt to finish the integration.')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1199 parser.add_argument('-l', '--new_mutation_download', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1200 help='Forces the mutation resources to be downloaded, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1201 'even if previously downloaded into this Genome Resource Library.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1202 action='store_true')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1203 parser.add_argument('-i', '--new_mutation_integration', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1204 help='Forces the mutation resources to be integrated, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1205 'even if previously integrated into this Genome Resource Library.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1206 action='store_true')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1207 parser.add_argument('-c', '--cosmic_resources_location',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1208 default='', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1209 help='Specify a non-default location where the Cosmic files reside. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1210 'Normally they are assumed to reside in the build directory, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1211 'but if that directory has not been created yet when this program ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1212 'is called, you can specify the full path to the directory where they reside.')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1213 # Method 1) arguments - Download and Build.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1214 # - One can optionally utilize --build_location argument with this group of arguments.
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1215 download_and_build_args = parser.add_argument_group('Download and Build arguments')
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1216 download_and_build_args.add_argument('-u', '--download_url',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1217 default='', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1218 help='This is the url of an archive file containing the library files. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1219 'These are located at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1220 'Works with both source-data and plug-n-play archives.')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1221 download_and_build_args.add_argument('-d', '--download_location',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1222 default='', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1223 help='Full path of the CTAT Resource Library download location, where the download will be placed. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1224 'If the archive file has already had been successfully downloaded, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1225 'it will only be downloaded again if --new_archive_download is selected. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1226 'If --build_location is not set, then the archive will be built in place at the download_location. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1227 'If a previous download and build was started but not completed at this or a specified build_location, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1228 'calling with this and the previous option set, but not --new_archive_download, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1229 'will attempt to finish the download and build.')
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1230 download_and_build_args.add_argument('-a', '--new_archive_download', \
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1231 help='Forces a new download (and build if needed) of the Genome Resource Library, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1232 'even if previously downloaded and built.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1233 action='store_true')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1234 download_and_build_args.add_argument('-k', '--keep_archive', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1235 help='The archive will not be deleted after it is extracted.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1236 action='store_true')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1237 # Method 2) arguments - Specify source and build locations.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1238 specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1239 specify_source_and_build_args.add_argument('-s', '--source_location',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1240 default='', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1241 help='Full path to the directory containing CTAT Resource Library source-data files ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1242 'or the full path to a CTAT Resource Library archive file (.tar.gz). ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1243 'If the --build_location option is not set, the reference library will be built in the source_location directory.' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1244 'If a previous download and build was started but not completed at this location, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1245 'calling with this option set, but not --new_library_build, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1246 'will attempt to finish the build.')
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1247 specify_source_and_build_args.add_argument('-r', '--new_library_build', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1248 help='Forces build of the CTAT Genome Resource Library, even if previously built. ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1249 'The --source_location must be a source-data archive or directory, or this is a no-op.',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1250 action='store_true')
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1251 # Method 3) arguments - Specify the location of a built library.
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1252 built_lib_location_arg = parser.add_argument_group('Specify location of built library arguments')
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1253 built_lib_location_arg.add_argument('-b', '--build_location',
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1254 default='', \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1255 help='Full path to the location of a built CTAT Genome Resource Library, ' + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1256 'either where it is, or where it will be placed.')
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1257
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1258 args = parser.parse_args()
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1259
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1260 # All of the input parameters are written by default to the output file prior to
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1261 # this program being called.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1262 # But I do not get input values from the json file, but rather from command line.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1263 # Just leaving the following code as a comment, in case it might be useful to someone later.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1264 # params = from_json_string(open(filename).read())
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1265 # target_directory = params['output_data'][0]['extra_files_path']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1266 # os.mkdir(target_directory)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1267
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1268 print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url))
4
c372930aaba1 Uploaded
trinity_ctat
parents: 0
diff changeset
1269
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1270 lib_was_built = False
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1271 extracted_directory = None
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1272 source_data_directory = None
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1273 genome_build_directory = None
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1274 download_url_is_set = (args.download_url is not None) and (args.download_url != "")
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1275 download_location_is_set = (args.download_location is not None) and (args.download_location != "")
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1276 source_location_is_set = (args.source_location is not None) and (args.source_location != "")
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1277 build_location_is_set = (args.build_location is not None) and (args.build_location != "")
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1278 # FIX - need to make sure we are handling all "possible" combinations of arguments.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1279 # Probably would be good if we could simplify/remove some of them.
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1280 # But I think the current interface is using them all.
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1281
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1282 if download_url_is_set:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1283 if source_location_is_set:
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1284 raise ValueError("Argument --source_location cannot be used in combination with --download_url.")
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1285 if not download_location_is_set:
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1286 raise ValueError("Argument --download_url requires that --download_location be specified.")
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1287 downloaded_filename_full_path = \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1288 download_genome_archive(source_url=args.download_url, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1289 destination=args.download_location, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1290 force_new_download=args.new_archive_download)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1291 print "\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path))
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1292
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1293
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1294 if ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_SOURCE_DATA:
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1295 print "It is source data."
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1296 # If it is source_data, extract to download_location (the directory where the download was placed).
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1297 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1298 destination=args.download_location, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1299 force_new_extraction=args.new_archive_download, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1300 keep_archive=args.keep_archive)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1301 source_data_directory = extracted_directory
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1302 if build_location_is_set:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1303 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1304 else:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1305 # We will build within a subdirectory of the source_data_directory.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1306 # The name of the build directory will be the default _CTAT_Build_dirname.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1307 # This _CTAT_Build_dirname directory will not exist until the library is built.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1308 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1309
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1310 elif ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_PLUG_N_PLAY:
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1311 print "It is plug-n-play data."
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1312 if build_location_is_set:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1313 # Extract to the build location. The library is already built.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1314 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1315 destination=args.build_location, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1316 force_new_extraction=args.new_archive_download, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1317 keep_archive=args.keep_archive)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1318 else:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1319 # Extract to the download location.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1320 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1321 destination=args.download_location, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1322 force_new_extraction=args.new_archive_download, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1323 keep_archive=args.keep_archive)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1324 # There is no source_data_directory, so its value stays as None.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1325
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1326 # Look for the build directory. It should be inside the extracted_directory
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1327 if len(os.listdir(extracted_directory)) == 1:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1328 # Then that one file is a subdirectory that should be the build_directory.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1329 # That is how the plug-n-play directories are structured.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1330 subdir_filename = os.listdir(extracted_directory)[0]
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1331 genome_build_directory = os.path.join(extracted_directory, subdir_filename)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1332 else:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1333 # We need to search for the build directory, since there is more than one file.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1334 genome_build_directory = search_for_genome_build_dir(extracted_directory)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1335 else:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1336 raise ValueError("Unexpected CTAT Library type. Neither plug-n-play nor source_data:\n\t" + \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1337 "{:s}".format(downloaded_filename_full_path))
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1338 elif source_location_is_set:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1339 # Then the user wants to build the directory from the source data.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1340 source_data_directory = os.path.realpath(args.source_location)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1341 print "\nThe user is saying the source data is in:\n\t{:s}.\n".format(str(source_data_directory))
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1342 if build_location_is_set:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1343 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1344 else:
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1345 # We will build within a subdirectory of the source_data_directory.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1346 # The name of the build directory will be the default _CTAT_Build_dirname.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1347 # This _CTAT_Build_dirname directory will not exist until the library is built.
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1348 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1349 elif build_location_is_set:
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1350 genome_build_directory = args.build_location
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1351
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1352 if (genome_build_directory is None) or (genome_build_directory == ""):
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1353 raise ValueError("At least one of --download_url, --source_location, or --build_location must be specified.")
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1354
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1355 print "\nThe location where the CTAT Genome Resource Library exists " + \
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1356 "or will be built is {:s}.\n".format(str(genome_build_directory))
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1357
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1358 # To take out builds for testing, comment out the lines that do the building.
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1359 # The command that builds the ctat genome library also has an option for building the gmap indexes.
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1360 # That is why the gmap_build value is sent to build_the_library(), but if we are not building the
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1361 # library, the user might still be asking for a gmap_build. That is done after rechecking for the
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1362 # genome_build_directory.
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1363 if (source_data_directory is not None):
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1364 build_the_library(source_data_directory, \
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1365 genome_build_directory, \
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1366 args.new_library_build, \
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1367 args.gmap_build)
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1368 lib_was_built = True
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1369
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1370 # The following looks to see if the library actually exists after the build,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1371 # and raises an error if it cannot find the library files.
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1372 # The reassignment of genome_build_directory is superfluous in most cases,
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1373 # since genome_build_directory should already point to the correct directory,
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1374 # except in the case where a user specifies a location that contains the
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1375 # genome_build_directory rather than being the genome_build_directory itself.
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1376 genome_build_directory = search_for_genome_build_dir(genome_build_directory)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1377
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1378 if (args.gmap_build and not lib_was_built):
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1379 # If we did not build the genome resource library
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1380 # the user might still be asking for a gmap_build.
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1381 gmap_the_library(genome_build_directory, args.force_gmap_build)
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1382
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1383 if (args.download_mutation_resources_url != ""):
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1384 download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1385 genome_build_directory=genome_build_directory, \
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1386 cosmic_resources_location=args.cosmic_resources_location, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1387 force_new_download=args.new_mutation_download, \
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1388 force_new_integration=args.new_mutation_integration)
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1389
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1390 # Need to get the genome name.
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1391 genome_name = find_genome_name_in_path(args.download_url)
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1392 if genome_name is None:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1393 genome_name = find_genome_name_in_path(genome_build_directory)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1394 if genome_name is None:
11
57428396c6e4 Adding retartable downloads, ctat_mutations library.
trinity_ctat
parents: 10
diff changeset
1395 genome_name = find_genome_name_in_path(extracted_directory)
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1396 if genome_name is None:
10
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1397 genome_name = find_genome_name_in_path(args.source_location)
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1398 if genome_name is None:
a7cd51b60f58 Uploaded
trinity_ctat
parents: 9
diff changeset
1399 genome_name = find_genome_name_in_path(args.download_location)
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1400 if genome_name is None:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1401 genome_name = find_genome_name_in_path(args.display_name)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1402 if genome_name is None:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1403 genome_name = _CTAT_ResourceLib_DefaultGenome
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1404 print "WARNING: We could not find a genome name in any of the directory paths."
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1405
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1406 # Determine the display_name for the library.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1407 if (args.display_name is None) or (args.display_name == ""):
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1408 # Create the display_name from the genome_name.
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
1409 display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1410 else:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1411 display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1412 display_name = display_name.replace(" ","_")
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1413
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1414 # Create a unique_id for the library.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1415 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
1416 unique_id = genome_name + "." + datetime_stamp
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1417
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
1418 print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1419 print "Its unique_id will be set to: {:s}\n".format(unique_id)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1420 print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1421
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1422 data_manager_dict = {}
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1423 data_manager_dict['data_tables'] = {}
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1424 data_manager_dict['data_tables']['ctat_genome_resource_libs'] = []
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1425 data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1426 data_manager_dict['data_tables']['ctat_genome_resource_libs'].append(data_table_entry)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1427
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1428 # Temporarily the output file's dictionary is written for debugging:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1429 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1430 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1431 # which then puts it into the correct .loc file (I think).
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1432 # Comment out the following line when testing without galaxy package.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1433 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1434
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
# Script entry point: call main() only when this file is run directly, not when it is imported.
1435 if __name__ == "__main__":
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1436 main()