Mercurial > repos > yating-l > jbrowse_hub
comparison utils.py @ 0:e4f3f2ed4fa5 draft
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f18ea51d27ec7addfa6413716391cfefebc8acbc-dirty
| author | yating-l |
|---|---|
| date | Fri, 10 Mar 2017 13:48:19 -0500 |
| parents | |
| children | e7c80e9b70ae |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e4f3f2ed4fa5 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ''' | |
| 4 This file include common used functions for converting file format to gff3 | |
| 5 ''' | |
| 6 from collections import OrderedDict | |
| 7 import json | |
| 8 import subprocess | |
| 9 import os | |
| 10 import tempfile | |
| 11 | |
| 12 | |
| 13 def write_features(field, attribute, gff3): | |
| 14 ''' | |
| 15 The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) | |
| 16 field, attribute are ordered dictionary | |
| 17 gff3 is the file handler | |
| 18 ''' | |
| 19 attr = [] | |
| 20 for v in field.values(): | |
| 21 gff3.write(str(v) + '\t') | |
| 22 for k, v in attribute.items(): | |
| 23 s = str(k) + '=' + str(v) | |
| 24 attr.append(s) | |
| 25 gff3.write(';'.join(attr)) | |
| 26 gff3.write('\n') | |
| 27 | |
| 28 def getChromSizes(reference, tool_dir): | |
| 29 faToTwoBit = os.path.join(tool_dir, 'faToTwoBit') | |
| 30 twoBitInfo = os.path.join(tool_dir, 'twoBitInfo') | |
| 31 try: | |
| 32 twoBitFile = tempfile.NamedTemporaryFile(bufsize=0) | |
| 33 chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False) | |
| 34 except IOError as err: | |
| 35 print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror) | |
| 36 try: | |
| 37 p = subprocess.Popen([faToTwoBit, reference, twoBitFile.name]) | |
| 38 p.communicate() | |
| 39 except OSError as err: | |
| 40 print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror) | |
| 41 try: | |
| 42 p = subprocess.Popen([twoBitInfo, twoBitFile.name, chrom_sizes.name]) | |
| 43 p.communicate() | |
| 44 except OSError as err: | |
| 45 print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror) | |
| 46 return chrom_sizes | |
| 47 | |
| 48 def sequence_region(chrom_sizes): | |
| 49 ''' | |
| 50 This function read from a chromatin size file generated by twoBitInfo and write the information to dict | |
| 51 return a dict | |
| 52 ''' | |
| 53 f = open(chrom_sizes, 'r') | |
| 54 sizes = f.readlines() | |
| 55 sizes_dict = {} | |
| 56 for line in sizes: | |
| 57 chrom_info = line.rstrip().split('\t') | |
| 58 sizes_dict[chrom_info[0]] = chrom_info[1] | |
| 59 return sizes_dict | |
| 60 | |
| 61 def child_blocks(parent_field, parent_attr, gff3): | |
| 62 num = 0 | |
| 63 blockcount = int(parent_attr['blockcount']) | |
| 64 chromstart = parent_attr['chromstarts'].split(',') | |
| 65 blocksize = parent_attr['blocksizes'].split(',') | |
| 66 while num < blockcount: | |
| 67 child_attr = OrderedDict() | |
| 68 child_field = parent_field | |
| 69 child_field['type'] = 'exon_junction' | |
| 70 child_field['start'] = int(chromstart[num]) + int(parent_field['start']) | |
| 71 child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1 | |
| 72 child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1) | |
| 73 child_attr['Parent'] = parent_attr['ID'] | |
| 74 write_features(child_field, child_attr, gff3) | |
| 75 num = num + 1 | |
| 76 | |
| 77 def add_tracks_to_json(trackList_json, new_tracks, modify_type): | |
| 78 ''' | |
| 79 Add to track configuration (trackList.json) | |
| 80 # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict() | |
| 81 # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict()) | |
| 82 ''' | |
| 83 with open(trackList_json, 'r+') as f: | |
| 84 data = json.load(f) | |
| 85 if modify_type == 'add_tracks': | |
| 86 data['tracks'].append(new_tracks) | |
| 87 elif modify_type == 'add_attr': | |
| 88 for k in new_tracks: | |
| 89 for track in data['tracks']: | |
| 90 if k.lower() in track['urlTemplate'].lower(): | |
| 91 attr = new_tracks[k] | |
| 92 for k, v in attr.items(): | |
| 93 track[k] = v | |
| 94 f.seek(0, 0) | |
| 95 f.write(json.dumps(data, separators=(',' , ':'), indent=4)) | |
| 96 f.truncate() | |
| 97 f.close() | |
| 98 | |
| 99 def gtfToGff3(gtf_file, gff3_file, chrom_sizes): | |
| 100 ''' | |
| 101 Covert gtf file output from StringTie to gff3 format | |
| 102 ''' | |
| 103 gff3 = open(gff3_file, 'w') | |
| 104 gff3.write("##gff-version 3\n") | |
| 105 sizes_dict = sequence_region(chrom_sizes) | |
| 106 seq_regions = dict() | |
| 107 parents = dict() | |
| 108 with open(gtf_file, 'r') as gtf: | |
| 109 for line in gtf: | |
| 110 if line.startswith('#'): | |
| 111 continue | |
| 112 field = OrderedDict() | |
| 113 attribute = OrderedDict() | |
| 114 li = line.rstrip().split("\t") | |
| 115 #print li | |
| 116 field['seqid'] = li[0] | |
| 117 #print field['seqid'] | |
| 118 if field['seqid'] not in seq_regions: | |
| 119 end_region = sizes_dict[field['seqid']] | |
| 120 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') | |
| 121 seq_regions[field['seqid']] = end_region | |
| 122 field['source'] = li[1] | |
| 123 field['type'] = li[2] | |
| 124 # The first base in a chromosome is numbered 0 in BED format | |
| 125 field['start'] = li[3] | |
| 126 field['end'] = li[4] | |
| 127 field['score'] = li[5] | |
| 128 field['strand'] = li[6] | |
| 129 field['phase'] = li[7] | |
| 130 attr_li = li[8].split(';') | |
| 131 gene_id = attr_li[0].split()[1].strip('"') | |
| 132 attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end']) | |
| 133 if field['type'] == 'transcript': | |
| 134 parents[gene_id] = attribute['ID'] | |
| 135 attribute['transcript_id'] = attr_li[1].split()[1].strip('"') | |
| 136 attribute['coverage'] = attr_li[2].split()[1].strip('"') | |
| 137 attribute['fpkm'] = attr_li[3].split()[1].strip('"') | |
| 138 attribute['tpm'] = attr_li[4].split()[1].strip('"') | |
| 139 elif field['type'] == 'exon': | |
| 140 attribute['Parent'] = parents[gene_id] | |
| 141 attribute['transcript_id'] = attr_li[1].split()[1].strip('"') | |
| 142 attribute['coverage'] = attr_li[3].split()[1].strip('"') | |
| 143 write_features(field, attribute, gff3) | |
| 144 gff3.close() |
