Mercurial > repos > kellrott > segment_map
changeset 0:7c9f8abc5b70 draft default tip
Uploaded
author | kellrott |
---|---|
date | Thu, 13 Jun 2013 16:06:42 -0400 |
parents | |
children | |
files | segment_map/segment_map.py segment_map/segment_map.xml |
diffstat | 2 files changed, 534 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segment_map/segment_map.py Thu Jun 13 16:06:42 2013 -0400 @@ -0,0 +1,471 @@ +#!/usr/bin/env python + + +import csv +import re +import array +import sys +import math +from optparse import OptionParser +import re + +reAttr = re.compile(r'([^ ]+) \"(.*)\"') + +def block_both_strand(start, end, strand, gene): + """ + Check is segment is between gene start and end, either strand + + **Code 'b'** + """ + if gene.end >= start and gene.start <= end: + return True + return False + +class ProbeMapper(object): + """ + Class to map the probes. + """ + + def __init__(self): + self.cmp_func = block_both_strand + + def find_overlap(self, segment, ref_gene, cmp_func=None): + """ + Function to find overlaps for a given probe description. + the cmp_func arg is a function that returns a 'True' or 'False' for + a given probe description and a gene, examples include 'gene_overlap' + and 'gene_simple_meth_overlap' + """ + if cmp_func is None: + cmp_func = self.cmp_func + if not ref_gene.has_chrom(segment.chrom): + return [] + chromList = ref_gene.get_chrom(segment.chrom) + + out = [] + for gene in chromList: + if cmp_func(segment.start, + segment.end, segment.strand, gene): + out.append(gene) + return out + + +class Segment: + def __init__(self, id, chrom, start, end, value, strand='.'): + self.id = id + self.chrom = chrom + self.start = start + self.end = end + self.value = value + self.strand = strand + +class GenomicSegment: + def __init__(self, ref_col, strand_col, start_col, stop_col, score_col, name_col, head_skip, ref_search, ref_replace, base_zero): + self.samples = None + self.ref_col = ref_col + self.strand_col = strand_col + self.start_col = start_col + self.stop_col = stop_col + self.score_col = score_col + self.name_col = name_col + self.head_skip = head_skip + self.ref_search = ref_search + self.ref_replace = ref_replace + self.base_zero = base_zero + + def read(self, handle): + self.samples = {} + self.chrom_map = {} + + re_chrom = re.compile(self.ref_search) + head_skip = self.head_skip + for line in handle: + if head_skip > 0: + head_skip -= 1 + else: + tmp = line.rstrip().split("\t") + sample = tmp[self.name_col] + if sample not in self.samples: + self.samples[sample] = [] + seg = None + chrom = re_chrom.sub(self.ref_replace, tmp[self.ref_col]) + try: + start = int(tmp[self.start_col]) + end = int(tmp[self.stop_col]) + if self.base_zero: + start += 1 + seg = Segment( + chrom=chrom, + start=start, end=end, + id=sample, value=float(tmp[self.score_col]) + ) + except ValueError: + #I've seen cases of locations being printed out in float style notation, ie '6.0e+6' + start = int(float(tmp[self.start_col])) + end = int(float(tmp[self.stop_col])) + if self.base_zero: + start += 1 + seg = Segment( + chrom=chrom, + start=start, end=end, + id=sample, value=float(tmp[self.score_col]) + ) + self.samples[sample].append( seg ) + if seg.chrom not in self.chrom_map: + self.chrom_map[ seg.chrom ] = [] + self.chrom_map[seg.chrom].append(seg) + + def keys(self): + return self.samples.keys() + + def get(self, id): + return self.samples[id] + + def has_chrom(self, chrom): + return chrom in self.chrom_map + + def get_chrom(self, chrom): + return self.chrom_map[chrom] + +class FloatMatrix: + """ + array.array based float matrix class + """ + def __init__(self): + self.corner_name = "probe" + self.data = None + self.nrows = None + self.ncols = None + self.rowmap = None + self.colmap = None + + def read(self, handle): + header = None + for line in handle: + row = line.rstrip().split("\t") + if header is None: + header = row + self.data = array.array("f") + self.colmap = {} + self.rowmap = {} + self.ncols = len(row) - 1 + self.nrows = 0 + for i, c in enumerate(row[1:]): + self.colmap[c] = i + else: + if len(row) - 1 != self.ncols: + raise DataException("Misformed matrix") + self.rowmap[row[0]] = len(self.rowmap) + a = [] + for v in row[1:]: + try: + a.append(float(v)) + except ValueError: + a.append(float('Nan')) + self.data.extend(a) + self.nrows += 1 + + def init_blank(self, rows, cols): + self.data = array.array("f") + self.colmap = {} + for i,c in enumerate(cols): + self.colmap[c] = i + self.rowmap = {} + for i,r in enumerate(rows): + self.rowmap[r] = i + self.ncols = len(cols) + self.nrows = len(rows) + for i in range(self.nrows): + self.data.extend([float('nan')] * self.ncols) + + def get_value(self, row_name, col_name): + return self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ] + + def set_value(self, row_name, col_name, value): + self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ] = value + + def get_row(self, row_name): + return self.data[ self.rowmap[row_name] * self.ncols : (self.rowmap[row_name]+1) * self.ncols ] + + def get_cols(self): + return self.colmap.keys() + + def get_rows(self): + return self.rowmap.keys() + + def write(self, handle, missing='NA'): + write = csv.writer(handle, delimiter="\t", lineterminator='\n') + col_list = self.get_cols() + + write.writerow([self.corner_name] + col_list) + for rowName in self.rowmap: + out = [rowName] + row = self.get_row(rowName) + for col in col_list: + val = row[self.colmap[col]] + if val is None or math.isnan(val): + val = missing + else: + val = "%.5f" % (val) + out.append(val) + write.writerow(out) + + +def filter_longest_form(refgene): + """ + take a refgene table and filter multiple gene isoforms down to the longest + """ + ng = ProbeMapper() + for g in refgene: + longest = None + length = 0 + for elem in refgene[g]: + newLength = elem.end - elem.start + if newLength > length: + length = newLength + longest = elem + ng.add(longest) + ng.loaded = True + return ng + + +def genomicSegment2Matrix(genomicSegment, refGene, probeMapper, assertCoverage=True): + """ + Given a genomicSegment definition and reference genome and a probemapping class, + produce a FloatMatrix with the segment values to the genome features + """ + ng = refGene # filter_longest_form(refGene) + #enumerate the col order of the sample ids + idList = genomicSegment.keys() + + geneList = ng.get_gene_list() + + tmp = FloatMatrix() + tmp.init_blank( rows=geneList, cols=idList ) + geneHits = {} + #read through the segment one sample id at a time + for id in idList: + segmentMap = {} + for segment in genomicSegment.get(id): + for hit in probeMapper.find_overlap( segment, ng ): + span = float(min(segment.end, hit.end) - max(segment.start, hit.start)) / float(hit.end - hit.start) + #if hit.name not in segmentMap: + # segmentMap[hit.name] = [] + try: + segmentMap[hit.gene_id].append( + ( span, segment.value ) + ) + except KeyError: + segmentMap[hit.gene_id] = [ + ( span, segment.value ) + ] + + for gene in segmentMap: + geneHits[gene] = True + mapInfo = segmentMap[gene] + coverage = sum( i[0] for i in mapInfo ) + if assertCoverage: + assert coverage <= 1.0 + value = sum( i[0]*i[1] for i in mapInfo ) + #print coverage, value, value/coverage, segmentMap[gene] + if coverage > 0.0: + tmp.set_value(row_name=gene, col_name=id, value=value/coverage) + + #now remove the blanks + out = FloatMatrix() + out.init_blank( rows=geneHits, cols=idList ) + for gene in geneHits: + for sample in idList: + out.set_value( row_name=gene, col_name=sample, value=tmp.get_value(row_name=gene, col_name=sample)) + return out + +class GTFLine: + def __init__(self, seqname, source, feature, start, end, score, strand, frame, attr): + self.seqname = seqname + self.source = source + self.feature = feature + self.start = int(start) + self.end = int(end) + try: + self.score = float(score) + except ValueError: + self.score = None + self.strand = strand + try: + self.frame = int(frame) + except ValueError: + self.frame = None + self.attr = attr + + def __str__(self): + return "\t".join( [ + self.seqname, + self.source, + self.feature, + str(self.start), + str(self.end), + '.' if self.score is None else str(self.score) , + self.strand, + '.' if self.frame is None else str(self.frame), + str(self.attr) + ]) + + def __repr__(self): + return "%s:%s:%s-%s" % (self.feature, self.seqname, self.start, self.end) + + def gene_id(self): + return self.attr['gene_id'] + + def get_type(self): + return self.feature + + def __getitem__(self, name): + return self.attr[name] + + + +def gtfParse(handle): + for line in handle: + row = line.split("\t") + attr_str = row[8] + attr = {} + for s in attr_str.split("; "): + res = reAttr.search(s) + if res: + attr[res.group(1)] = res.group(2) + yield GTFLine(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], attr) + +class GTFGene: + def __init__(self, gene_id): + self.elements = {} + self.gene_id = gene_id + self.start = None + self.end = None + + def append(self, elem): + if self.start is None or elem.start < self.start: + self.start = elem.start + if self.end is None or elem.end > self.end: + self.end = elem.end + t = elem.get_type() + if t not in self.elements: + self.elements[t] = [] + self.elements[t].append(elem) + + + +class GTFMap: + def __init__(self): + self.gene_map = {} + self.chrom_map = {} + + def read(self, handle): + for line in gtfParse(handle): + g = line.gene_id() + + if line.seqname not in self.chrom_map: + self.chrom_map[line.seqname] = [] + + if g not in self.gene_map: + gene = GTFGene(g) + self.gene_map[g] = gene + self.chrom_map[line.seqname].append(gene) + + self.gene_map[g].append(line) + + def __iter__(self): + for k in self.gene_map: + yield k + + def __getitem__(self, n): + return self.gene_map[n] + + def keys(self): + return self.gene_map.keys() + + def has_item(self, gene, item): + if gene not in self.gene_map: + return False + if item not in self.gene_map[gene]: + return False + return True + + def has_chrom(self, chrome): + return chrome in self.chrom_map + + def get_chrom(self, chrome): + return self.chrom_map[chrome] + + def get_gene_list(self): + return self.gene_map.keys() + + def get_item_len(self, gene, item): + total = 0 + for i in self.gene_map[gene][item]: + total += i.end - i.start + return total + + def get_item_span(self, gene, item): + start = None + end = None + for i in self.gene_map[gene][item]: + if start is None: + start = i.start + else: + start = min(start, i.start) + if end is None: + end = i.end + else: + end = max(end, i.end) + return start, end + + + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-i", "--input", dest="bed", help="Input Segment file", default=None) + parser.add_option("-a", "--annotation", dest="ann", help="GTF Annotation", default=None) + parser.add_option("-o", "--out", dest="out", help="Output file", default=None) + parser.add_option("-r", "--ref-col", help="Reference Sequence Name Column", type=int, default=1) + parser.add_option("-s", "--strand-col", help="String Column", type=int, default=6) + parser.add_option("-x", "--start-col", help="Start Position Column", type=int, default=2) + parser.add_option("-y", "--stop-col", help="Stop Position Column", type=int, default=3) + parser.add_option("-d", "--score-col", help="Score Column", type=int, default=5) + parser.add_option("-n", "--name-col", help="Name Column", type=int, default=4) + parser.add_option("-m", "--head-skip", help="Header Skip", type=int, default=0) + parser.add_option("-z", "--base-zero", help="Base Zero", action="store_true", default=False) + parser.add_option("--ref-regex", help="Ref Name regex", nargs=2, default=('','')) + + opts, args = parser.parse_args() + + handle = open( opts.bed ) + sg = GenomicSegment( + ref_col=opts.ref_col - 1, + strand_col=opts.strand_col - 1, + start_col=opts.start_col - 1, + stop_col=opts.stop_col - 1, + score_col=opts.score_col - 1, + name_col=opts.name_col - 1, + head_skip = opts.head_skip, + ref_search = opts.ref_regex[0], + ref_replace = opts.ref_regex[1], + base_zero = opts.base_zero ) + sg.read( handle ) + handle.close() + + + handle = open( opts.ann ) + rg = GTFMap() + rg.read( handle ) + handle.close() + + pm = ProbeMapper() + + out = genomicSegment2Matrix(sg,rg,pm) + + if opts.out is None: + out.write( sys.stdout ) + else: + handle = open(opts.out, "w") + out.write(handle) + handle.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segment_map/segment_map.xml Thu Jun 13 16:06:42 2013 -0400 @@ -0,0 +1,63 @@ +<tool id="segment_map" name="Genomic Segment Mapper" version="1.0.0"> + <description>Map Genomic Segment file to probe space using genome annotation</description> + <command interpreter="python">segment_map.py +-i $segfile +-a $gtffile +-r $ref_col +-x $start_col +-y $stop_col +-d $score_col +-n $name_col +-m $head_skip +#if $base_zero: +-z +#end if +--ref-regex '$ref_search' '$ref_replace' +-o $outfile + </command> + <inputs> + <param name="segfile" type="data" format="tabular" label="Segment File"/> + <param name="gtffile" type="data" format="gtf" label="GTF File"/> + <param name="ref_col" type="integer" label="Reference Sequence Column" value="1"/> + <param name="name_col" type="integer" label="Sample Name Column" value="4"/> + <param name="start_col" type="integer" label="Start Column" value="2"/> + <param name="stop_col" type="integer" label="Stop Column" value="3"/> + <param name="score_col" type="integer" label="Score Column" value="5"/> + <param name="head_skip" type="integer" label="Header Skip" value="1"/> + <param name="ref_search" type="text" label="Ref name search" value=""/> + <param name="ref_replace" type="text" label="Ref name replace" value=""/> + <param name="base_zero" type='boolean' label="Base Zero"/> + </inputs> + <outputs> + <data name="outfile" format="tabular"/> + </outputs> + <help> +The Segment Map takes a file that describes values to genomic spans, and remaps +it to values assigned to genomic features (such as genes) that this spans intersect with. +The output is a matrix, with samples as columns and rows as genes. + +The segment file is a tabular file, you define which columns provide which information. In the case of a bed file the settings are + + - Reference Sequence column : 1 + - Start Column : 2 + - Stop Column : 3 + - Sample Name Column : 4 + - Score Column : 5 + - Base Zero : True + +In addition, if there are string corrections that have to be done to the reference sequence name, +to get it to match the label in the GTF file, you can use the 'Ref name Search'/'Ref name Replace' +regular expression to fix it. + +For example, the segment file thinks calls the sequence '1' while the gtf file calls it 'chr1', use: + + - Ref name search : ^ + - Ref name replace : chr + +The zero base refers to the nucleotide numbering space. 1 base (ie not zero-base) the first nucleotide is +1, and the stop position is inclusive (1-10 includes 10 postion, the last of which is 10). In the case of +zero base, the stop position is exclusive (0-10 includes 10 postions, the last of which is 9). BED files are +in zero base, most everything else is not. + + </help> +</tool>