Mercurial > repos > yating-l > jbrowsearchivecreator
comparison bedToGff3.py @ 0:8d1cf7ce65cd draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit d583ac16a6c6942730ea536eb59cc37941816030-dirty
| author | yating-l |
|---|---|
| date | Thu, 18 May 2017 17:25:33 -0400 |
| parents | |
| children | 671231da45f9 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8d1cf7ce65cd |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ''' | |
| 4 Convert BED format to gff3 | |
| 5 ''' | |
| 6 import os | |
| 7 from collections import OrderedDict | |
| 8 import utils | |
| 9 | |
class bedToGff3():
    """Convert a tab-separated BED file to GFF3.

    The conversion runs as a side effect of construction: when ``bed_type``
    is ``"trfbig"`` or ``"regtools"`` the matching converter is invoked from
    ``__init__`` and the result is written to ``output``. Any other
    ``bed_type`` only stores the arguments on the instance.

    Attributes:
        input: path of the BED file to read.
        output: path of the GFF3 file to write (opened in ``'w'`` mode).
        chrom_sizes: value handed to ``utils.sequence_region``.
        type: the ``bed_type`` selector given to the constructor.
    """

    def __init__(self, inputBedFile, chrom_sizes, bed_type, output):
        """Store the arguments and run the converter selected by bed_type."""
        self.input = inputBedFile
        self.output = output
        self.chrom_sizes = chrom_sizes
        self.type = bed_type
        if self.type == "trfbig":
            self.trfbig_to_gff3()
        elif self.type == "regtools":
            self.splicejunctions_to_gff3()

    def trfbig_to_gff3(self):
        """Write every row of the input as a ``tandem_repeat`` feature."""
        self._convert(self._trfbig_feature)

    def splicejunctions_to_gff3(self):
        """Write every row of the input as a ``junction`` feature.

        After each feature line, ``utils.child_blocks`` is called with the
        same field/attribute mappings and the output handle.
        """
        self._convert(self._junction_feature, with_child_blocks=True)

    def _convert(self, build_feature, with_child_blocks=False):
        """Shared driver: stream the BED file and emit the GFF3 output.

        Args:
            build_feature: callable taking the list of tab-split columns of
                one row and returning ``(field, attribute)`` ordered dicts.
            with_child_blocks: when true, ``utils.child_blocks`` is called
                after ``utils.write_features`` for every row.

        Raises:
            KeyError: if a row's first column is missing from the mapping
                returned by ``utils.sequence_region``.
            IndexError: if a row has fewer columns than the builder reads.
        """
        # 'with' guarantees the output handle is closed even when a
        # malformed row raises part-way through the conversion.
        with open(self.output, 'w') as gff3:
            gff3.write("##gff-version 3\n")
            # NOTE(review): presumably maps sequence name -> length; it is
            # only indexed by seqid and stringified here. Confirm in utils.
            sizes_dict = utils.sequence_region(self.chrom_sizes)
            announced = set()
            with open(self.input, 'r') as bed:
                for line in bed:
                    # A blank line (e.g. a trailing newline at end of file)
                    # carries no record; without this guard it would be
                    # looked up as seqid '' and abort the conversion.
                    if not line.strip():
                        continue
                    li = line.rstrip().split("\t")
                    seqid = li[0]
                    # One ##sequence-region directive per sequence, written
                    # just before that sequence's first feature.
                    if seqid not in announced:
                        gff3.write("##sequence-region " + seqid + ' 1 '
                                   + str(sizes_dict[seqid]) + '\n')
                        announced.add(seqid)
                    field, attribute = build_feature(li)
                    utils.write_features(field, attribute, gff3)
                    if with_child_blocks:
                        utils.child_blocks(field, attribute, gff3)

    @staticmethod
    def _trfbig_feature(li):
        """Map one trfbig row (16 columns are read) to (field, attribute)."""
        field = OrderedDict([
            ('seqid', li[0]),
            ('source', li[3]),
            ('type', 'tandem_repeat'),
            # The first base in a chromosome is numbered 0 in BED format,
            # so the start is shifted by one.
            ('start', str(int(li[1]) + 1)),
            ('end', li[2]),
            ('score', li[9]),
            ('strand', '+'),
            ('phase', '.'),
        ])
        attribute = OrderedDict([
            ('length of repeat unit', li[4]),
            ('mean number of copies of repeat', li[5]),
            ('length of consensus sequence', li[6]),
            ('percentage match', li[7]),
            ('percentage indel', li[8]),
            ("percent of a's in repeat unit", li[10]),
            ("percent of c's in repeat unit", li[11]),
            ("percent of g's in repeat unit", li[12]),
            ("percent of t's in repeat unit", li[13]),
            ('entropy', li[14]),
            ('sequence of repeat unit element', li[15]),
        ])
        return field, attribute

    @staticmethod
    def _junction_feature(li):
        """Map one splice-junction row (13 columns are read) to (field, attribute)."""
        field = OrderedDict([
            ('seqid', li[0]),
            ('source', li[3]),
            ('type', 'junction'),
            # The first base in a chromosome is numbered 0 in BED format.
            # Deliberately left as an int (unlike the trfbig converter):
            # utils.child_blocks receives this mapping and may rely on the
            # type -- TODO confirm before unifying.
            ('start', int(li[1]) + 1),
            ('end', li[2]),
            ('score', li[12]),
            ('strand', li[5]),
            ('phase', '.'),
        ])
        attribute = OrderedDict([
            ('ID', li[3]),
            ('Name', li[3]),
            ('blockcount', li[9]),
            ('blocksizes', li[10]),
            ('chromstarts', li[11]),
        ])
        return field, attribute
| 92 |
