Mercurial > repos > yating-l > jbrowsearchivecreator
comparison bedToGff3.py @ 15:671231da45f9 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 594c5fa1e3cdb378ecf6b5490ec2cbac5fa8a61e-dirty
| author | yating-l |
|---|---|
| date | Wed, 05 Jul 2017 15:47:15 -0400 |
| parents | 8d1cf7ce65cd |
| children | 466d52f83079 |
comparison
equal
deleted
inserted
replaced
| 14:f4a9197281d5 | 15:671231da45f9 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 ''' | 3 ''' |
| 4 Convert BED format to gff3 | 4 Convert BED format to gff3 |
| 5 reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md | |
| 5 ''' | 6 ''' |
| 6 import os | 7 import os |
| 7 from collections import OrderedDict | 8 from collections import OrderedDict |
| 8 import utils | 9 import utils |
| 9 | 10 |
| 17 self.type = bed_type | 18 self.type = bed_type |
| 18 if self.type == "trfbig": | 19 if self.type == "trfbig": |
| 19 self.trfbig_to_gff3() | 20 self.trfbig_to_gff3() |
| 20 if self.type == "regtools": | 21 if self.type == "regtools": |
| 21 self.splicejunctions_to_gff3() | 22 self.splicejunctions_to_gff3() |
| 23 if self.type == "blat": | |
| 24 self.bigpsl_to_gff3() | |
| 22 | 25 |
| 23 def trfbig_to_gff3(self): | 26 def trfbig_to_gff3(self): |
| 24 gff3 = open(self.output, 'w') | 27 gff3 = open(self.output, 'w') |
| 25 gff3.write("##gff-version 3\n") | 28 gff3.write("##gff-version 3\n") |
| 26 sizes_dict = utils.sequence_region(self.chrom_sizes) | 29 sizes_dict = utils.sequence_region(self.chrom_sizes) |
| 79 field['start'] = int(li[1]) + 1 | 82 field['start'] = int(li[1]) + 1 |
| 80 field['end'] = li[2] | 83 field['end'] = li[2] |
| 81 field['score'] = li[12] | 84 field['score'] = li[12] |
| 82 field['strand'] = li[5] | 85 field['strand'] = li[5] |
| 83 field['phase'] = '.' | 86 field['phase'] = '.' |
| 84 attribute['ID'] = li[3] | 87 attribute['ID'] = li[0] + '_' + li[3] |
| 85 attribute['Name'] = li[3] | 88 attribute['Name'] = li[3] |
| 86 attribute['blockcount'] = li[9] | 89 attribute['blockcount'] = li[9] |
| 87 attribute['blocksizes'] = li[10] | 90 attribute['blocksizes'] = li[10] |
| 88 attribute['chromstarts'] = li[11] | 91 attribute['chromstarts'] = li[11] |
| 89 utils.write_features(field, attribute, gff3) | 92 utils.write_features(field, attribute, gff3) |
| 90 utils.child_blocks(field, attribute, gff3) | 93 utils.child_blocks(field, attribute, gff3, 'exon_junction') |
| 94 gff3.close() | |
| 95 | |
| 96 def bigpsl_to_gff3(self): | |
| 97 gff3 = open(self.output, 'w') | |
| 98 gff3.write("##gff-version 3\n") | |
| 99 sizes_dict = utils.sequence_region(self.chrom_sizes) | |
| 100 seq_regions = dict() | |
| 101 with open(self.input, 'r') as bed: | |
| 102 for line in bed: | |
| 103 field = OrderedDict() | |
| 104 attribute = OrderedDict() | |
| 105 li = line.rstrip().split("\t") | |
| 106 field['seqid'] = li[0] | |
| 107 if field['seqid'] not in seq_regions: | |
| 108 end_region = sizes_dict[field['seqid']] | |
| 109 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') | |
| 110 seq_regions[field['seqid']] = end_region | |
| 111 field['source'] = 'UCSC BLAT alignment tool' | |
| 112 field['type'] = 'match' | |
| 113 # The first base in a chromosome is numbered 0 in BED format | |
| 114 field['start'] = str(int(li[1]) + 1) | |
| 115 field['end'] = li[2] | |
| 116 field['score'] = li[4] | |
| 117 field['strand'] = li[5] | |
| 118 field['phase'] = '.' | |
| 119 attribute['ID'] = li[0] + '_' + li[3] | |
| 120 attribute['Name'] = li[3] | |
| 121 attribute['blockcount'] = li[9] | |
| 122 attribute['blocksizes'] = li[10] | |
| 123 attribute['chromstarts'] = li[11] | |
| 124 attribute['sequence on other chromosome'] = li[17] | |
| 125 attribute['cds in ncbi format'] = li[18] | |
| 126 attribute['size of target chromosome'] = li[19] | |
| 127 attribute['number of bases matched'] = li[20] | |
| 128 attribute['number of bases that don\'t match'] = li[21] | |
| 129 attribute['number of bases that match but are part of repeats'] = li[22] | |
| 130 attribute['number of \'N\' bases'] = li[23] | |
| 131 utils.write_features(field, attribute, gff3) | |
| 132 utils.child_blocks(field, attribute, gff3, 'match_part') | |
| 91 gff3.close() | 133 gff3.close() |
| 92 | 134 |
