Mercurial > repos > kellrott > segment_map

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/segment_map/segment_map.py	Thu Jun 13 16:06:42 2013 -0400
@@ -0,0 +1,471 @@
+#!/usr/bin/env python
+
+
+import csv
+import re
+import array
+import sys
+import math
+from optparse import OptionParser
+import re
+
+reAttr = re.compile(r'([^ ]+) \"(.*)\"')
+
+def block_both_strand(start, end, strand, gene):
+    """
+    Check is segment is between gene start and end, either strand
+
+    **Code 'b'**
+    """
+    if gene.end >= start and gene.start <= end:
+        return True
+    return False
+
+class ProbeMapper(object):
+    """
+    Class to map the probes.
+    """
+
+    def __init__(self):
+        self.cmp_func = block_both_strand
+
+    def find_overlap(self, segment, ref_gene, cmp_func=None):
+        """
+        Function to find overlaps for a given probe description.
+        the cmp_func arg is a function that returns a 'True' or 'False' for
+        a given probe description and a gene, examples include 'gene_overlap'
+        and 'gene_simple_meth_overlap'
+        """
+        if cmp_func is None:
+            cmp_func = self.cmp_func
+        if not ref_gene.has_chrom(segment.chrom):
+            return []
+        chromList = ref_gene.get_chrom(segment.chrom)
+
+        out = []
+        for gene in chromList:
+            if cmp_func(segment.start,
+                        segment.end, segment.strand, gene):
+                out.append(gene)
+        return out
+
+
+class Segment:
+    def __init__(self, id, chrom, start, end, value, strand='.'):
+        self.id = id
+        self.chrom = chrom
+        self.start = start
+        self.end = end
+        self.value = value
+        self.strand = strand
+
+class GenomicSegment:
+    def __init__(self, ref_col, strand_col, start_col, stop_col, score_col, name_col, head_skip, ref_search, ref_replace, base_zero):
+        self.samples = None
+        self.ref_col = ref_col
+        self.strand_col = strand_col
+        self.start_col = start_col
+        self.stop_col = stop_col
+        self.score_col = score_col
+        self.name_col = name_col
+        self.head_skip = head_skip
+        self.ref_search = ref_search
+        self.ref_replace = ref_replace
+        self.base_zero = base_zero
+
+    def read(self, handle):
+        self.samples = {}
+        self.chrom_map = {}
+
+        re_chrom = re.compile(self.ref_search)
+        head_skip = self.head_skip
+        for line in handle:
+            if head_skip > 0:
+                head_skip -= 1
+            else:
+                tmp = line.rstrip().split("\t")
+                sample = tmp[self.name_col]
+                if sample not in self.samples:
+                    self.samples[sample] = []
+                seg = None
+                chrom = re_chrom.sub(self.ref_replace, tmp[self.ref_col])
+                try:
+                    start = int(tmp[self.start_col])
+                    end = int(tmp[self.stop_col])
+                    if self.base_zero:
+                        start += 1
+                    seg = Segment(
+                        chrom=chrom,
+                        start=start, end=end,
+                        id=sample, value=float(tmp[self.score_col])
+                    )
+                except ValueError:
+                    #I've seen cases of locations being printed out in float style notation, ie '6.0e+6'
+                    start = int(float(tmp[self.start_col]))
+                    end = int(float(tmp[self.stop_col]))
+                    if self.base_zero:
+                        start += 1
+                    seg = Segment(
+                        chrom=chrom,
+                        start=start, end=end,
+                        id=sample, value=float(tmp[self.score_col])
+                    )
+                self.samples[sample].append( seg )
+                if seg.chrom not in self.chrom_map:
+                    self.chrom_map[ seg.chrom ] = []
+                self.chrom_map[seg.chrom].append(seg)
+
+    def keys(self):
+        return self.samples.keys()
+
+    def get(self, id):
+        return self.samples[id]
+
+    def has_chrom(self, chrom):
+        return chrom in self.chrom_map
+
+    def get_chrom(self, chrom):
+        return self.chrom_map[chrom]
+
+class FloatMatrix:
+    """
+    array.array based float matrix class
+    """
+    def __init__(self):
+        self.corner_name = "probe"
+        self.data = None
+        self.nrows = None
+        self.ncols = None
+        self.rowmap = None
+        self.colmap = None
+
+    def read(self, handle):
+        header = None
+        for line in handle:
+            row = line.rstrip().split("\t")
+            if header is None:
+                header = row
+                self.data = array.array("f")
+                self.colmap = {}
+                self.rowmap = {}
+                self.ncols = len(row) - 1
+                self.nrows = 0
+                for i, c in enumerate(row[1:]):
+                    self.colmap[c] = i
+            else:
+                if len(row) - 1 != self.ncols:
+                    raise DataException("Misformed matrix")
+                self.rowmap[row[0]] = len(self.rowmap)
+                a = []
+                for v in row[1:]:
+                    try:
+                        a.append(float(v))
+                    except ValueError:
+                        a.append(float('Nan'))
+                self.data.extend(a)
+                self.nrows += 1
+
+    def init_blank(self, rows, cols):
+        self.data = array.array("f")
+        self.colmap = {}
+        for i,c in enumerate(cols):
+            self.colmap[c] = i
+        self.rowmap = {}
+        for i,r in enumerate(rows):
+            self.rowmap[r] = i
+        self.ncols = len(cols)
+        self.nrows = len(rows)
+        for i in range(self.nrows):
+            self.data.extend([float('nan')] * self.ncols)
+
+    def get_value(self, row_name, col_name):
+        return self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ]
+
+    def set_value(self, row_name, col_name, value):
+        self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ] = value
+
+    def get_row(self, row_name):
+        return self.data[ self.rowmap[row_name] * self.ncols :  (self.rowmap[row_name]+1) * self.ncols ]
+
+    def get_cols(self):
+        return self.colmap.keys()
+
+    def get_rows(self):
+        return self.rowmap.keys()
+
+    def write(self, handle, missing='NA'):
+        write = csv.writer(handle, delimiter="\t", lineterminator='\n')
+        col_list = self.get_cols()
+
+        write.writerow([self.corner_name] + col_list)
+        for rowName in self.rowmap:
+            out = [rowName]
+            row = self.get_row(rowName)
+            for col in col_list:
+                val = row[self.colmap[col]]
+                if val is None or math.isnan(val):
+                    val = missing
+                else:
+                    val = "%.5f" % (val)
+                out.append(val)
+            write.writerow(out)
+
+
+def filter_longest_form(refgene):
+    """
+    take a refgene table and filter multiple gene isoforms down to the longest
+    """
+    ng = ProbeMapper()
+    for g in refgene:
+        longest = None
+        length = 0
+        for elem in refgene[g]:
+            newLength = elem.end - elem.start
+            if newLength > length:
+                length = newLength
+                longest = elem
+        ng.add(longest)
+    ng.loaded = True
+    return ng
+
+
+def genomicSegment2Matrix(genomicSegment, refGene, probeMapper, assertCoverage=True):
+    """
+    Given a genomicSegment definition and reference genome and a probemapping class,
+    produce a FloatMatrix with the segment values to the genome features
+    """
+    ng = refGene # filter_longest_form(refGene)
+    #enumerate the col order of the sample ids
+    idList = genomicSegment.keys()
+
+    geneList = ng.get_gene_list()
+
+    tmp = FloatMatrix()
+    tmp.init_blank( rows=geneList, cols=idList )
+    geneHits = {}
+    #read through the segment one sample id at a time
+    for id in idList:
+        segmentMap = {}
+        for segment in genomicSegment.get(id):
+            for hit in probeMapper.find_overlap( segment, ng ):
+                span = float(min(segment.end, hit.end) - max(segment.start, hit.start)) / float(hit.end - hit.start)
+                #if hit.name not in segmentMap:
+                #    segmentMap[hit.name] = []
+                try:
+                    segmentMap[hit.gene_id].append(
+                        ( span, segment.value )
+                    )
+                except KeyError:
+                    segmentMap[hit.gene_id] = [
+                        ( span, segment.value )
+                    ]
+
+        for gene in segmentMap:
+            geneHits[gene] = True
+            mapInfo = segmentMap[gene]
+            coverage = sum( i[0] for i in mapInfo )
+            if assertCoverage:
+                assert coverage <= 1.0
+            value = sum( i[0]*i[1] for i in mapInfo )
+            #print coverage, value, value/coverage, segmentMap[gene]
+            if coverage > 0.0:
+                tmp.set_value(row_name=gene, col_name=id, value=value/coverage)
+
+    #now remove the blanks
+    out = FloatMatrix()
+    out.init_blank( rows=geneHits, cols=idList )
+    for gene in geneHits:
+        for sample in idList:
+            out.set_value( row_name=gene, col_name=sample, value=tmp.get_value(row_name=gene, col_name=sample))
+    return out
+
+class GTFLine:
+    def __init__(self, seqname, source, feature, start, end, score, strand, frame, attr):
+        self.seqname = seqname
+        self.source = source
+        self.feature = feature
+        self.start = int(start)
+        self.end = int(end)
+        try:
+            self.score = float(score)
+        except ValueError:
+            self.score = None
+        self.strand = strand
+        try:
+            self.frame = int(frame)
+        except ValueError:
+            self.frame = None
+        self.attr = attr
+
+    def __str__(self):
+        return "\t".join( [
+            self.seqname,
+            self.source,
+            self.feature,
+            str(self.start),
+            str(self.end),
+            '.' if self.score is None else str(self.score) ,
+            self.strand,
+            '.' if self.frame is None else str(self.frame),
+            str(self.attr)
+            ])
+
+    def __repr__(self):
+        return "%s:%s:%s-%s" % (self.feature, self.seqname, self.start, self.end)
+
+    def gene_id(self):
+        return self.attr['gene_id']
+
+    def get_type(self):
+        return self.feature
+
+    def __getitem__(self, name):
+        return self.attr[name]
+
+
+
+def gtfParse(handle):
+    for line in handle:
+        row = line.split("\t")
+        attr_str = row[8]
+        attr = {}
+        for s in attr_str.split("; "):
+            res = reAttr.search(s)
+            if res:
+                attr[res.group(1)] = res.group(2)
+        yield GTFLine(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], attr)
+
+class GTFGene:
+    def __init__(self, gene_id):
+        self.elements = {}
+        self.gene_id = gene_id
+        self.start = None
+        self.end = None
+
+    def append(self, elem):
+        if self.start is None or elem.start < self.start:
+            self.start = elem.start
+        if self.end is None or elem.end > self.end:
+            self.end = elem.end
+        t = elem.get_type()
+        if t not in self.elements:
+            self.elements[t] = []
+        self.elements[t].append(elem)
+
+
+
+class GTFMap:
+    def __init__(self):
+        self.gene_map = {}
+        self.chrom_map = {}
+
+    def read(self, handle):
+        for line in gtfParse(handle):
+            g = line.gene_id()
+
+            if line.seqname not in self.chrom_map:
+                self.chrom_map[line.seqname] = []
+
+            if g not in self.gene_map:
+                gene = GTFGene(g)
+                self.gene_map[g] = gene
+                self.chrom_map[line.seqname].append(gene)
+
+            self.gene_map[g].append(line)
+
+    def __iter__(self):
+        for k in self.gene_map:
+            yield k
+
+    def __getitem__(self, n):
+        return self.gene_map[n]
+
+    def keys(self):
+        return self.gene_map.keys()
+
+    def has_item(self, gene, item):
+        if gene not in self.gene_map:
+            return False
+        if item not in self.gene_map[gene]:
+            return False
+        return True
+
+    def has_chrom(self, chrome):
+        return chrome in self.chrom_map
+
+    def get_chrom(self, chrome):
+        return self.chrom_map[chrome]
+
+    def get_gene_list(self):
+        return self.gene_map.keys()
+
+    def get_item_len(self, gene, item):
+        total = 0
+        for i in self.gene_map[gene][item]:
+            total += i.end - i.start
+        return total
+
+    def get_item_span(self, gene, item):
+        start = None
+        end = None
+        for i in self.gene_map[gene][item]:
+            if start is None:
+                start = i.start
+            else:
+                start = min(start, i.start)
+            if end is None:
+                end = i.end
+            else:
+                end = max(end, i.end)
+        return start, end
+
+
+
+if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-i", "--input", dest="bed", help="Input Segment file", default=None)
+    parser.add_option("-a", "--annotation", dest="ann", help="GTF Annotation", default=None)
+    parser.add_option("-o", "--out", dest="out", help="Output file", default=None)
+    parser.add_option("-r", "--ref-col", help="Reference Sequence Name Column", type=int, default=1)
+    parser.add_option("-s", "--strand-col", help="String Column", type=int, default=6)
+    parser.add_option("-x", "--start-col", help="Start Position Column", type=int, default=2)
+    parser.add_option("-y", "--stop-col", help="Stop Position Column", type=int, default=3)
+    parser.add_option("-d", "--score-col", help="Score Column", type=int, default=5)
+    parser.add_option("-n", "--name-col", help="Name Column", type=int, default=4)
+    parser.add_option("-m", "--head-skip", help="Header Skip", type=int, default=0)
+    parser.add_option("-z", "--base-zero", help="Base Zero", action="store_true", default=False)
+    parser.add_option("--ref-regex", help="Ref Name regex", nargs=2, default=('',''))
+
+    opts, args = parser.parse_args()
+
+    handle = open( opts.bed )
+    sg = GenomicSegment(
+        ref_col=opts.ref_col - 1,
+        strand_col=opts.strand_col - 1,
+        start_col=opts.start_col - 1,
+        stop_col=opts.stop_col - 1,
+        score_col=opts.score_col - 1,
+        name_col=opts.name_col - 1,
+        head_skip = opts.head_skip,
+        ref_search = opts.ref_regex[0],
+        ref_replace = opts.ref_regex[1],
+        base_zero = opts.base_zero )
+    sg.read( handle )
+    handle.close()
+
+
+    handle = open( opts.ann )
+    rg = GTFMap()
+    rg.read( handle )
+    handle.close()
+
+    pm = ProbeMapper()
+
+    out = genomicSegment2Matrix(sg,rg,pm)
+
+    if opts.out is None:
+        out.write( sys.stdout )
+    else:
+        handle = open(opts.out, "w")
+        out.write(handle)
+        handle.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/segment_map/segment_map.xml	Thu Jun 13 16:06:42 2013 -0400
@@ -0,0 +1,63 @@
+<tool id="segment_map" name="Genomic Segment Mapper" version="1.0.0">
+  <description>Map Genomic Segment file to probe space using genome annotation</description>
+  <command interpreter="python">segment_map.py
+-i $segfile
+-a $gtffile
+-r $ref_col
+-x $start_col
+-y $stop_col
+-d $score_col
+-n $name_col
+-m $head_skip
+#if $base_zero:
+-z
+#end if
+--ref-regex '$ref_search' '$ref_replace'
+-o $outfile
+  </command>
+  <inputs>
+	  <param name="segfile" type="data" format="tabular" label="Segment File"/>
+	  <param name="gtffile" type="data" format="gtf" label="GTF File"/>
+    <param name="ref_col" type="integer" label="Reference Sequence Column" value="1"/>
+    <param name="name_col" type="integer" label="Sample Name Column" value="4"/>
+    <param name="start_col" type="integer" label="Start Column" value="2"/>
+    <param name="stop_col" type="integer" label="Stop Column" value="3"/>
+    <param name="score_col" type="integer" label="Score Column" value="5"/>
+    <param name="head_skip" type="integer" label="Header Skip" value="1"/>
+    <param name="ref_search" type="text" label="Ref name search" value=""/>
+    <param name="ref_replace" type="text" label="Ref name replace" value=""/>
+    <param name="base_zero" type='boolean' label="Base Zero"/>
+  </inputs>
+  <outputs>
+      <data name="outfile" format="tabular"/>
+  </outputs>
+  <help>
+The Segment Map takes a file that describes values to genomic spans, and remaps
+it to values assigned to genomic features (such as genes) that this spans intersect with.
+The output is a matrix, with samples as columns and rows as genes.
+
+The segment file is a tabular file, you define which columns provide which information. In the case of a bed file the settings are
+
+ - Reference Sequence column : 1
+ - Start Column : 2
+ - Stop Column : 3
+ - Sample Name Column : 4
+ - Score Column : 5
+ - Base Zero : True
+
+In addition, if there are string corrections that have to be done to the reference sequence name,
+to get it to match the label in the GTF file, you can use the 'Ref name Search'/'Ref name Replace'
+regular expression to fix it.
+
+For example, the segment file thinks calls the sequence '1' while the gtf file calls it 'chr1', use:
+
+ - Ref name search : ^
+ - Ref name replace : chr
+
+The zero base refers to the nucleotide numbering space. 1 base (ie not zero-base) the first nucleotide is
+1, and the stop position is inclusive (1-10 includes 10 postion, the last of which is 10). In the case of
+zero base, the stop position is exclusive (0-10 includes 10 postions, the last of which is 9). BED files are
+in zero base, most everything else is not.
+
+  </help>
+</tool>