comparison util/subtools.py @ 39:4a69515eed63 draft

planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 91271a6c0d39c923f0d460b2979247baa297286b-dirty
author yating-l
date Fri, 13 Apr 2018 18:21:35 -0400
parents d17f629f5486
children 061da5d3a219
comparing 38:d17f629f5486 with 39:4a69515eed63
@@ -235,20 +235,42 @@
     if os.path.exists(filename):
         return filename
     else:
         raise ValueError('Did not find fai file')

+def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder):
+    faiFile = createFastaIndex(fastaFile)
+    refSeqFile = os.path.join(outputFolder, referenceName)
+    refSeqIndexFile = os.path.join(outputFolder, referenceName+'.fai')
+    shutil.copy(fastaFile, refSeqFile)
+    shutil.copy(faiFile, refSeqIndexFile)
+
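A minimal usage sketch of the new helper (file names and output folder are hypothetical): it relies on createFastaIndex(), defined earlier in subtools.py, to produce the .fai index, then stages both files under the JBrowse data folder.

    # Illustrative only: paths are made up; createFastaIndex() (defined earlier
    # in this file) is expected to return the path of the generated .fai.
    generate_indexed_refseq_track('genome.fa', 'genome.fa', 'myHub/data')
    # expected result: myHub/data/genome.fa and myHub/data/genome.fa.fai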
+def remove_gene_lines(gff3_file, gff3_filtered):
+    with open(gff3_file, 'r') as f:
+        with open(gff3_filtered, 'w') as out:
+            for line in f:
+                if not line.startswith('#'):
+                    feature_type = line.split('\t')[2].rstrip()
+                    if feature_type == 'transcript' or feature_type == 'mRNA':
+                        arr = line.split('\t')
+                        # as we remove the gene features, we should also remove the Parent attribute (gene id) from the transcript
+                        arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip()
+                        line = '\t'.join(arr) + '\n'
+                    if feature_type == 'gene':
+                        continue
+                out.write(line)
+
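A small worked example of the attribute filtering above (the record is made up): since the gene rows are dropped, a transcript's Parent attribute would point at a feature that no longer exists, so it is stripped from column 9.

    # Hypothetical mRNA record before filtering:
    line = 'chr1\thavana\tmRNA\t1000\t9000\t.\t+\t.\tID=tr1;Parent=gene1;Name=foo\n'
    arr = line.split('\t')
    arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip()
    print('\t'.join(arr) + '\n')
    # prints the same record with column 9 reduced to: ID=tr1;Name=foo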
 def gff3sort(inputFile, outputFile, precise=False):
-    array_call = ['gff3sort.pl', inputFile, '>', outputFile]
+    array_call = ['gff3sort.pl', inputFile]
     if precise:
         array_call.append('--precise')
-    p = _handleExceptionAndCheckCall(array_call)
+    p = _handleExceptionAndCheckCall(array_call, stdout=outputFile)
     return p

 def bedSort(inputFile, outputFile):
-    array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile, '>', outputFile]
+    array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile]
-    p = _handleExceptionAndCheckCall(array_call)
+    p = _handleExceptionAndCheckCall(array_call, stdout=outputFile)
     return p

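In both helpers the old argument lists contained a literal '>' and the output path; without a shell, subprocess passes those straight to the program as extra arguments instead of redirecting output, so the sorted output would not land in outputFile. Routing the output through a stdout keyword fixes that. A rough sketch of the assumed plumbing (the real _handleExceptionAndCheckCall lives elsewhere in subtools.py and has its own logging and error handling):

    # Sketch only, assuming the helper accepts either an open file object or a path.
    import subprocess

    def _run_checked(array_call, stdout=None):
        out = open(stdout, 'w') if isinstance(stdout, str) else stdout
        try:
            return subprocess.check_call(array_call, stdout=out)
        finally:
            if isinstance(stdout, str) and out is not None:
                out.close()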
 def bgzip(inputFile):
     subprocess.call(['bgzip', inputFile])
     filename = inputFile + '.gz'

@@ -263,23 +285,27 @@
     if os.path.exists(filename):
         return filename
     else:
         raise ValueError('Did not find tbi file')

-def generate_tabix_indexed_track(inputFile, dataType, outputFolder):
+def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder):
     if "bed" in dataType:
         fileType = 'bed'
         sortedFile = tempfile.NamedTemporaryFile(bufsize=0)
-        bedSort(inputFile, sortedFile.name)
+        bedSort(inputFile, sortedFile)
     elif "gff" in dataType:
         fileType = 'gff'
+        filteredFile = tempfile.NamedTemporaryFile(bufsize=0)
+        remove_gene_lines(inputFile, filteredFile.name)
         sortedFile = tempfile.NamedTemporaryFile(bufsize=0)
-        gff3sort(inputFile, sortedFile.name)
+        gff3sort(filteredFile.name, sortedFile)
+        # add the .gff3.gz extension to tabix GFF3 tracks so that generate-names.pl can build the name index
+        trackName = trackName + '.gff3.gz'
-    compressedFile = bgzip(sortedFile)
+    compressedFile = bgzip(sortedFile.name)
     tabixFile = createTabix(compressedFile, fileType)
-    trackPath = os.path.join(outputFolder, inputFile)
-    trackIndexPath = os.path.join(outputFolder, inputFile+'.tbi')
+    trackPath = os.path.join(outputFolder, trackName)
+    trackIndexPath = os.path.join(outputFolder, trackName+'.tbi')
     shutil.copy(compressedFile, trackPath)
     shutil.copy(tabixFile, trackIndexPath)

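A hypothetical call of the updated signature (paths and track name are illustrative): for a GFF3 input the helper now filters out gene rows, sorts, bgzips and tabix-indexes the result, and publishes it under the new trackName with a .gff3.gz suffix.

    # Illustrative only: expects gff3sort.pl, bgzip and tabix on PATH.
    generate_tabix_indexed_track('annotation.gff3', 'gff3', 'genes', 'myHub/data')
    # expected outputs: myHub/data/genes.gff3.gz and myHub/data/genes.gff3.gz.tbi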
 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True):
     if "bed" in dataType:

@@ -347,17 +373,19 @@
     track_json = json.dumps(track_json)
     new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE)
     p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout)
     return p

-def prepare_refseqs(fasta_file_name, outputFolder):
+def prepare_refseqs(fastaFile, outputFolder):
-    array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+    #array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+    createFastaIndex(fastaFile)
+    array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile, '--out', outputFolder]
     p = _handleExceptionAndCheckCall(array_call)
     return p

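With --indexed_fasta, prepare-refseqs.pl references the FASTA together with its .fai index rather than generating per-chunk sequence JSON, which is why the helper now builds the index before calling the script. A hypothetical call (paths are illustrative):

    # Illustrative only: requires prepare-refseqs.pl (JBrowse) on PATH and
    # assumes createFastaIndex() can produce genome.fa.fai.
    prepare_refseqs('genome.fa', 'myHub/data')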
-def generate_names(outputFolder):
+def generate_names(outputFolder, hashBits=4):
-    array_call = ['generate-names.pl', '-v', '--out', outputFolder]
+    array_call = ['generate-names.pl', '--hashBits', str(hashBits), '-v', '--out', outputFolder]
     p = _handleExceptionAndCheckCall(array_call)
     return p

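generate-names.pl's --hashBits option sets the number of bits used for the hashed file layout of the names index, instead of letting the script guess it from the number of names. A quick usage sketch (output folder is illustrative):

    # Illustrative only: requires generate-names.pl (JBrowse) on PATH.
    generate_names('myHub/data', hashBits=4)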
 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None):
     """