Mercurial > repos > jjohnson > bed_to_protein_map

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_protein_map.py	Mon Nov 20 14:58:18 2017 -0500
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+"""
+Convert BED12 records into a tab-separated map from genomic coding segments
+to cumulative coding-sequence (CDS) offsets.
+
+For every input line that has at least 12 tab-separated fields, one output
+row is written per block that reaches into the thickStart..thickEnd range,
+with the block clipped to that range:
+
+    name, chrom, start, end, strand, cds_start, cds_end
+
+Blocks are walked in ascending order for '+' strand records and in descending
+order for '-' strand records; cds_start and cds_end accumulate the clipped
+block lengths in that order, starting from 0 for each record.
+"""
+import sys
+import os.path
+import optparse
+
+"""
+X	276352	291629	ENST00000430923	20	+	284187	291629	80,80,80	5	42,148,137,129,131	0,7814,12380,14295,15146
+X	304749	318819	ENST00000326153	20	-	305073	318787	80,80,80	10	448,153,149,209,159,68,131,71,138,381	0,2610,2982,6669,8016,9400,10140,10479,12164,13689
+grep ENST bed_to_protein_map.py | grep -v grep | ./bed_to_protein_map.py
+"""
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    #I/O
+    parser.add_option( '-i', '--input', dest='input', default=None, help='Tabular file with peptide_sequence column' )
+    parser.add_option( '-c', '--column', type='int', dest='column', default=1, help='column ordinal with Ensembl transcript ID' )
+    parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='GTF gene model file. Used to annotate NSJ peptide entries.')
+    parser.add_option( '-2', '--twobit', dest='twobit', default=None, help='Reference genome in UCSC twobit format')
+    parser.add_option( '-C', '--coding', dest='coding', action='store_true', default=False, help='Output coding BED line')
+    parser.add_option( '-o', '--output', dest='output', default=None, help='The output bed (else write to stdout)' )
+    parser.add_option( '-s', '--sqlitedb', dest='sqlitedb', default=None, help='The sqlitedb' )
+    parser.add_option( '--debug', dest='debug', action='store_true', default=False, help='Print debugging messages')
+    (options, args) = parser.parse_args()
+    ##INPUTS##
+    if options.input != None:
+        try:
+            inputPath = os.path.abspath(options.input)
+            inputFile = open(inputPath, 'r')
+        except Exception, e:
+            print >> sys.stderr, "failed: %s" % e
+            exit(2)
+    else:
+        inputFile = sys.stdin
+
+    if options.output != None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception, e:
+            print >> sys.stderr, "failed: %s" % e
+            exit(3)
+    else:
+        outputFile = sys.stdout
+
+    dbFile = None
+    if options.sqlitedb != None:
+        try:
+            dbPath = os.path.abspath(options.sqlitedb)
+            dbFile = open(dbPath, 'w')
+        except Exception, e:
+            print >> sys.stderr, "failed: %s" % e
+            exit(3)
+
+
+    try:
+        for linenum,line in enumerate(inputFile):
+            if options.debug:
+                print >> sys.stderr, "%d: %s\n" % (linenum,line)
+            if line.startswith('#'):
+                continue
+            if line.strip() == '':
+                continue
+            fields = line.rstrip('\r\n').split('\t')
+            if len(fields) < 12:
+                print >> sys.stderr, "%d: %s\n" % (linenum,line)
+                continue
+            (chrom,_chromStart,_chromEnd,name,score,strand,_thickStart,_thickEnd,itemRgb,_blockCount,blockSizes,blockStarts) = fields[0:12]
+            chromStart = int(_chromStart)
+            chromEnd = int(_chromEnd)
+            thickStart = int(_thickStart)
+            thickEnd = int(_thickEnd)
+            blockCount = int(_blockCount)
+            blockSizes = [int(x) for x in blockSizes.split(',')]
+            blockStarts = [int(x) for x in blockStarts.split(',')]
+            if strand == '+':
+                cds_start = 0
+                cds_end = 0
+                for i in range(blockCount):
+                    start = chromStart + blockStarts[i]
+                    end = start + blockSizes[i]
+                    if end < thickStart:
+                        continue
+                    if start > thickEnd:
+                        break
+                    if start < thickStart:
+                        start = thickStart
+                    if end > thickEnd:
+                        end = thickEnd
+                    cds_end = cds_start + (end - start)
+                    ##dbFile.write('%s\t%s\t%d\t%d\t%s\t%d\t%d\n' % (name,chrom,start,end,strand,cds_start,cds_end))
+                    outputFile.write('%s\t%s\t%d\t%d\t%s\t%d\t%d\n' % (name,chrom,start,end,strand,cds_start,cds_end))
+                    cds_start = cds_end
+            elif strand == '-':
+                cds_start = 0
+                cds_end = 0
+                for i in reversed(range(blockCount)):
+                    start = chromStart + blockStarts[i]
+                    end = start + blockSizes[i]
+                    if end < thickStart:
+                        break
+                    if start > thickEnd:
+                        continue
+                    if start < thickStart:
+                        start = thickStart
+                    if end > thickEnd:
+                        end = thickEnd
+                    cds_end = cds_start + (end - start)
+                    outputFile.write('%s\t%s\t%d\t%d\t%s\t%d\t%d\n' % (name,chrom,start,end,strand,cds_start,cds_end))
+                    cds_start = cds_end
+                pass
+    except Exception, e:
+        print >> sys.stderr, "failed: %s" % e
+        exit(1)
+
+
+if __name__ == "__main__" : __main__()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_protein_map.xml	Mon Nov 20 14:58:18 2017 -0500
@@ -0,0 +1,70 @@
+<tool id="bed_to_protein_map" name="bed to protein map" version="0.1.0">
+    <description>genomic location of proteins for MVP</description>
+    <requirements>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+        python '$__tool_directory__/bed_to_protein_map.py' -i '$input' -o '$output'
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="bed" label="A BED file with 12 columns, thickStart and thickEnd define protein coding region"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular">
+            <actions>
+                <action name="column_names" type="metadata" default="name,chrom,start,end,strand,cds_start,cds_end"/>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" ftype="bed" value="input.bed"/>
+            <output name="output" file="output.tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Convert a BED format file of the proteins from a proteomics search database into a tabular format for the Multiomics Visualization Platform (MVP).
+
+Example input BED dataset::
+
+	X	276352	291629	ENST00000430923	20	+	284187	291629	80,80,80	5	42,148,137,129,131	0,7814,12380,14295,15146
+	X	304749	318819	ENST00000326153	20	-	305073	318787	80,80,80	10	448,153,149,209,159,68,131,71,138,381	0,2610,2982,6669,8016,9400,10140,10479,12164,13689
+
+
+Output::
+
+    name               chrom   start     end       strand  cds_start  cds_end
+    ENST00000430923    X       284187    284314    +          0        127
+    ENST00000430923    X       288732    288869    +        127        264
+    ENST00000430923    X       290647    290776    +        264        393
+    ENST00000430923    X       291498    291629    +        393        524
+    ENST00000326153    X       318438    318787    -          0        349
+    ENST00000326153    X       316913    317051    -        349        487
+    ENST00000326153    X       315228    315299    -        487        558
+    ENST00000326153    X       314889    315020    -        558        689
+    ENST00000326153    X       314149    314217    -        689        757
+    ENST00000326153    X       312765    312924    -        757        916
+    ENST00000326153    X       311418    311627    -        916       1125
+    ENST00000326153    X       307731    307880    -       1125       1274
+    ENST00000326153    X       307359    307512    -       1274       1427
+    ENST00000326153    X       305073    305197    -       1427       1551
+
+
+The tabular output can be converted to a sqlite database using the Query_Tabular_ tool.
+
+The sqlite table should be named:  feature_cds_map
+The names for the columns should be: name,chrom,start,end,strand,cds_start,cds_end
+
+This SQL query will return the genomic location for a peptide sequence in a protein. Substitute the protein accession for acc_name and the cds location for cds_offset (multiply the amino acid position by 3 for the cds location)::
+
+    SELECT distinct chrom, CASE WHEN strand = '+' THEN start + cds_offset - cds_start ELSE end - (cds_offset - cds_start) END as "pos"
+    FROM feature_cds_map
+    WHERE name = acc_name AND cds_offset >= cds_start AND cds_offset < cds_end
+
+
+.. _Query_Tabular: https://toolshed.g2.bx.psu.edu/view/iuc/query_tabular/1ea4e668bf73
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.bed	Mon Nov 20 14:58:18 2017 -0500
@@ -0,0 +1,2 @@
+X	276352	291629	ENST00000430923	20	+	284187	291629	80,80,80	5	42,148,137,129,131	0,7814,12380,14295,15146
+X	304749	318819	ENST00000326153	20	-	305073	318787	80,80,80	10	448,153,149,209,159,68,131,71,138,381	0,2610,2982,6669,8016,9400,10140,10479,12164,13689
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.tabular	Mon Nov 20 14:58:18 2017 -0500
@@ -0,0 +1,14 @@
+ENST00000430923	X	284187	284314	+	0	127
+ENST00000430923	X	288732	288869	+	127	264
+ENST00000430923	X	290647	290776	+	264	393
+ENST00000430923	X	291498	291629	+	393	524
+ENST00000326153	X	318438	318787	-	0	349
+ENST00000326153	X	316913	317051	-	349	487
+ENST00000326153	X	315228	315299	-	487	558
+ENST00000326153	X	314889	315020	-	558	689
+ENST00000326153	X	314149	314217	-	689	757
+ENST00000326153	X	312765	312924	-	757	916
+ENST00000326153	X	311418	311627	-	916	1125
+ENST00000326153	X	307731	307880	-	1125	1274
+ENST00000326153	X	307359	307512	-	1274	1427
+ENST00000326153	X	305073	305197	-	1427	1551