Mercurial > repos > yating-l > jbrowsearchivecreator

diff bedToGff3.py @ 15:671231da45f9 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 594c5fa1e3cdb378ecf6b5490ec2cbac5fa8a61e-dirty
author: yating-l
date: Wed, 05 Jul 2017 15:47:15 -0400
parents: 8d1cf7ce65cd
children: 466d52f83079
--- a/bedToGff3.py	Mon Jul 03 16:05:49 2017 -0400
+++ b/bedToGff3.py	Wed Jul 05 15:47:15 2017 -0400
@@ -2,6 +2,7 @@
 
 '''
 Convert BED format to gff3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
 '''
 import os
 from collections import OrderedDict
@@ -19,6 +20,8 @@
             self.trfbig_to_gff3()
         if self.type == "regtools":
             self.splicejunctions_to_gff3()
+        if self.type == "blat":
+            self.bigpsl_to_gff3()
 
     def trfbig_to_gff3(self):
         gff3 = open(self.output, 'w')
@@ -81,12 +84,51 @@
                 field['score'] = li[12]
                 field['strand'] = li[5]
                 field['phase'] = '.'
-                attribute['ID'] = li[3]
+                attribute['ID'] = li[0] + '_' + li[3]
                 attribute['Name'] = li[3]
                 attribute['blockcount'] = li[9]
                 attribute['blocksizes'] = li[10]
                 attribute['chromstarts'] = li[11]
                 utils.write_features(field, attribute, gff3)
-                utils.child_blocks(field, attribute, gff3)
+                utils.child_blocks(field, attribute, gff3, 'exon_junction')
+        gff3.close()
+
+    def bigpsl_to_gff3(self):  # Convert the BED rows in self.input into GFF3 'match' features written to self.output
+        gff3 = open(self.output, 'w')  # not a context manager: the handle is only released by the gff3.close() after the loop
+        gff3.write("##gff-version 3\n")
+        sizes_dict = utils.sequence_region(self.chrom_sizes)  # presumably maps seqid -> sequence length; verify against utils.sequence_region
+        seq_regions = dict()  # seqids whose sequence-region line has already been written
+        with open(self.input, 'r') as bed:
+            for line in bed:
+                field = OrderedDict()  # the fixed GFF3 columns for this row, in insertion order
+                attribute = OrderedDict()  # the key/value pairs destined for the attributes column
+                li = line.rstrip().split("\t")  # tab-separated columns; indices up to 23 are read below, so shorter rows raise IndexError
+                field['seqid'] = li[0]
+                if field['seqid'] not in seq_regions:  # first row seen for this seqid: emit its sequence-region line exactly once
+                    end_region = sizes_dict[field['seqid']]  # NOTE(review): no fallback when the seqid is missing from sizes_dict
+                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                    seq_regions[field['seqid']] = end_region
+                field['source'] = 'UCSC BLAT alignment tool'
+                field['type'] = 'match'
+                # The first base in a chromosome is numbered 0 in BED format
+                field['start'] = str(int(li[1]) + 1)
+                field['end'] = li[2]  # copied unchanged, unlike start; NOTE(review): presumably because BED end coordinates are exclusive, confirm
+                field['score'] = li[4]  # passed through as text, no numeric conversion
+                field['strand'] = li[5]
+                field['phase'] = '.'
+                attribute['ID'] = li[0] + '_' + li[3]  # the row name prefixed with its seqid
+                attribute['Name'] = li[3]
+                attribute['blockcount'] = li[9]
+                attribute['blocksizes'] = li[10]
+                attribute['chromstarts'] = li[11]
+                attribute['sequence on other chromosome'] = li[17]  # from here on, columns 18-24 of the row are copied verbatim under descriptive keys
+                attribute['cds in ncbi format'] = li[18]
+                attribute['size of target chromosome'] = li[19]
+                attribute['number of bases matched'] = li[20]
+                attribute['number of bases that don\'t match'] = li[21]
+                attribute['number of bases that match but are part of repeats'] = li[22]
+                attribute['number of \'N\' bases'] = li[23]
+                utils.write_features(field, attribute, gff3)  # presumably writes the parent feature line; see utils.write_features
+                utils.child_blocks(field, attribute, gff3, 'match_part')  # presumably emits the per-block children typed 'match_part'; see utils.child_blocks
         gff3.close()
         
\ No newline at end of file
author	yating-l
date	Wed, 05 Jul 2017 15:47:15 -0400
parents	8d1cf7ce65cd
children	466d52f83079