comparison bedToGff3.py @ 15:671231da45f9 draft

planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 594c5fa1e3cdb378ecf6b5490ec2cbac5fa8a61e-dirty
author yating-l
date Wed, 05 Jul 2017 15:47:15 -0400
parents 8d1cf7ce65cd
children 466d52f83079
comparison
equal deleted inserted replaced
14:f4a9197281d5 15:671231da45f9
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 ''' 3 '''
4 Convert BED format to gff3 4 Convert BED format to gff3
5 reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
5 ''' 6 '''
6 import os 7 import os
7 from collections import OrderedDict 8 from collections import OrderedDict
8 import utils 9 import utils
9 10
17 self.type = bed_type 18 self.type = bed_type
18 if self.type == "trfbig": 19 if self.type == "trfbig":
19 self.trfbig_to_gff3() 20 self.trfbig_to_gff3()
20 if self.type == "regtools": 21 if self.type == "regtools":
21 self.splicejunctions_to_gff3() 22 self.splicejunctions_to_gff3()
23 if self.type == "blat":
24 self.bigpsl_to_gff3()
22 25
23 def trfbig_to_gff3(self): 26 def trfbig_to_gff3(self):
24 gff3 = open(self.output, 'w') 27 gff3 = open(self.output, 'w')
25 gff3.write("##gff-version 3\n") 28 gff3.write("##gff-version 3\n")
26 sizes_dict = utils.sequence_region(self.chrom_sizes) 29 sizes_dict = utils.sequence_region(self.chrom_sizes)
79 field['start'] = int(li[1]) + 1 82 field['start'] = int(li[1]) + 1
80 field['end'] = li[2] 83 field['end'] = li[2]
81 field['score'] = li[12] 84 field['score'] = li[12]
82 field['strand'] = li[5] 85 field['strand'] = li[5]
83 field['phase'] = '.' 86 field['phase'] = '.'
84 attribute['ID'] = li[3] 87 attribute['ID'] = li[0] + '_' + li[3]
85 attribute['Name'] = li[3] 88 attribute['Name'] = li[3]
86 attribute['blockcount'] = li[9] 89 attribute['blockcount'] = li[9]
87 attribute['blocksizes'] = li[10] 90 attribute['blocksizes'] = li[10]
88 attribute['chromstarts'] = li[11] 91 attribute['chromstarts'] = li[11]
89 utils.write_features(field, attribute, gff3) 92 utils.write_features(field, attribute, gff3)
90 utils.child_blocks(field, attribute, gff3) 93 utils.child_blocks(field, attribute, gff3, 'exon_junction')
94 gff3.close()
95
def bigpsl_to_gff3(self):
    '''
    Convert the tab-separated BLAT alignment BED file at self.input into
    a GFF3 file written to self.output.

    Every non-blank input line becomes one 'match' feature. A
    "##sequence-region" pragma is written the first time each seqid is
    seen, using the size looked up in the table that
    utils.sequence_region(self.chrom_sizes) returns.

    The column positions read here (0-5, 9-11, 17-23) appear to follow
    the UCSC bigPsl layout -- NOTE(review): confirm against the producer
    of the input file.

    Raises:
        KeyError: a seqid is absent from the chromosome sizes table
            (assuming that table is a dict).
        IndexError: a non-blank line has fewer than 24 columns.
    '''
    # Both files are managed by 'with' so that they are flushed and closed
    # even when a malformed line or an unknown seqid raises part-way through.
    with open(self.output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = utils.sequence_region(self.chrom_sizes)
        # seqids whose ##sequence-region pragma has already been written
        seq_regions = set()
        with open(self.input, 'r') as bed:
            for line in bed:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
                    seq_regions.add(field['seqid'])
                field['source'] = 'UCSC BLAT alignment tool'
                field['type'] = 'match'
                # The first base in a chromosome is numbered 0 in BED format
                field['start'] = str(int(li[1]) + 1)
                field['end'] = li[2]
                field['score'] = li[4]
                field['strand'] = li[5]
                field['phase'] = '.'
                # Prefix the name with the seqid so that the same name on
                # two different sequences yields two distinct IDs
                attribute['ID'] = li[0] + '_' + li[3]
                attribute['Name'] = li[3]
                attribute['blockcount'] = li[9]
                attribute['blocksizes'] = li[10]
                attribute['chromstarts'] = li[11]
                attribute['sequence on other chromosome'] = li[17]
                attribute['cds in ncbi format'] = li[18]
                attribute['size of target chromosome'] = li[19]
                attribute['number of bases matched'] = li[20]
                attribute["number of bases that don't match"] = li[21]
                attribute['number of bases that match but are part of repeats'] = li[22]
                attribute["number of 'N' bases"] = li[23]
                # Parent feature first, then its blocks; the last argument
                # is presumably the GFF3 type given to each child block --
                # verify against utils.child_blocks
                utils.write_features(field, attribute, gff3)
                utils.child_blocks(field, attribute, gff3, 'match_part')
92 134