# HG changeset patch # User galaxyp # Date 1516647542 18000 # Node ID 817e54905a9f8e7e48a3e20d35b0721dc18b9756 # Parent d615767650a0545af37f69db45e0d5fa21590b74 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219 diff -r d615767650a0 -r 817e54905a9f bedutil.py --- a/bedutil.py Sun Jan 14 14:06:15 2018 -0500 +++ b/bedutil.py Mon Jan 22 13:59:02 2018 -0500 @@ -12,6 +12,8 @@ #------------------------------------------------------------------------------ """ +from __future__ import print_function + import sys from Bio.Seq import reverse_complement, translate @@ -45,6 +47,17 @@ return bed_entry +def as_int_list(obj): + if obj is None: + return None + if isinstance(obj, list): + return [int(x) for x in obj] + elif isinstance(obj, str): + return [int(x) for x in obj.split(',')] + else: # python2 unicode? + return [int(x) for x in str(obj).split(',')] + + class BedEntry(object): def __init__(self, chrom=None, chromStart=None, chromEnd=None, name=None, score=None, strand=None, @@ -60,18 +73,8 @@ self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' self.blockCount = int(blockCount) - if isinstance(blockSizes, str) or isinstance(blockSizes, unicode): - self.blockSizes = [int(x) for x in blockSizes.split(',')] - elif isinstance(blockSizes, list): - self.blockSizes = [int(x) for x in blockSizes] - else: - self.blockSizes = blockSizes - if isinstance(blockStarts, str) or isinstance(blockSizes, unicode): - self.blockStarts = [int(x) for x in blockStarts.split(',')] - elif isinstance(blockStarts, list): - self.blockStarts = [int(x) for x in blockStarts] - else: - self.blockStarts = blockStarts + self.blockSizes = as_int_list(blockSizes) + self.blockStarts = as_int_list(blockStarts) self.second_name = None self.cds_start_status = None self.cds_end_status = None @@ -165,6 +168,11 @@ return self.set_cds(min(cds_pos), max(cds_pos) + basepairs) return None + def get_cds_bed(self): + cds_pos = [self.cdna_offset_of_pos(self.thickStart), + self.cdna_offset_of_pos(self.thickEnd)] + return self.trim(min(cds_pos), max(cds_pos)) + def get_cigar(self): cigar = '' r = range(self.blockCount) @@ -214,7 +222,7 @@ def pos_of_cdna_offet(self, offset): if offset is not None and 0 <= offset < sum(self.blockSizes): - r = range(self.blockCount) + r = list(range(self.blockCount)) rev = self.strand == '-' if rev: r.reverse() @@ -233,7 +241,7 @@ def cdna_offset_of_pos(self, pos): if not self.chromStart <= pos < self.chromEnd: return -1 - r = range(self.blockCount) + r = list(range(self.blockCount)) rev = self.strand == '-' if rev: r.reverse() @@ -248,19 +256,21 @@ def apply_variant(self, pos, ref, alt): pos = int(pos) if not ref or not alt: - print >> sys.stderr, "variant requires ref and alt sequences" + print("variant requires ref and alt sequences", file=sys.stderr) return if not self.chromStart <= pos <= self.chromEnd: - print >> sys.stderr, "variant not in entry %s: %s %d < %d < %d"\ - % (self.name, self.strand, self.chromStart, pos, self.chromEnd) - print >> sys.stderr, "%s" % str(self) + print("variant not in entry %s: %s %d < %d < %d" % + (self.name, self.strand, + self.chromStart, pos, self.chromEnd), + file=sys.stderr) + print("%s" % str(self), file=sys.stderr) return if len(ref) != len(alt): - print >> sys.stderr, "variant only works for snp: %s %s"\ - % (ref, alt) + print("variant only works for snp: %s %s" % (ref, alt), + file=sys.stderr) return if not self.seq: - print >> sys.stderr, "variant entry %s has no seq" % self.name + print("variant entry %s has no seq" % self.name, file=sys.stderr) return """ if self.strand == '-': @@ -274,27 +284,27 @@ if offset is not None: bases[offset+i] = alt[i] else: - print >> sys.stderr,\ - "variant offset %s: %s %d < %d < %d"\ - % (self.name, self.strand, self.chromStart, - pos+1, self.chromEnd) - print >> sys.stderr, "%s" % str(self) + print("variant offset %s: %s %d < %d < %d" % + (self.name, self.strand, self.chromStart, + pos+1, self.chromEnd), file=sys.stderr) + print("%s" % str(self), file=sys.stderr) self.seq = ''.join(bases) self.variants.append("g.%d%s>%s" % (pos+1, ref, alt)) def get_variant_bed(self, pos, ref, alt): pos = int(pos) if not ref or not alt: - print >> sys.stderr, "variant requires ref and alt sequences" + print("variant requires ref and alt sequences", file=sys.stderr) return None if not self.chromStart <= pos <= self.chromEnd: - print >> sys.stderr,\ - "variant not in entry %s: %s %d < %d < %d"\ - % (self.name, self.strand, self.chromStart, pos, self.chromEnd) - print >> sys.stderr, "%s" % str(self) + print("variant not in entry %s: %s %d < %d < %d" % + (self.name, self.strand, + self.chromStart, pos, self.chromEnd), + file=sys.stderr) + print("%s" % str(self), file=sys.stderr) return None if not self.seq: - print >> sys.stderr, "variant entry %s has no seq" % self.name + print("variant entry %s has no seq" % self.name, file=sys.stderr) return None tbed = BedEntry(chrom=self.chrom, chromStart=self.chromStart, chromEnd=self.chromEnd, @@ -330,8 +340,8 @@ chromStart = self.chromStart chromEnd = self.chromEnd if debug: - print >> sys.stderr, "%s" % (str(self)) - r = range(self.blockCount) + print("%s" % (str(self)), file=sys.stderr) + r = list(range(self.blockCount)) if self.strand == '-': r.reverse() bStart = 0 @@ -353,10 +363,9 @@ chromStart = self.chromStart + self.blockStarts[x] +\ self.blockSizes[x] - (tstop - bStart) if debug: - print >> sys.stderr,\ - "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\ - % (x, self.strand, bStart, bEnd, - tstart, tstop, chromStart, chromEnd) + print("%3d %s\t%d\t%d\t%d\t%d\t%d\t%d" % + (x, self.strand, bStart, bEnd, + tstart, tstop, chromStart, chromEnd), file=sys.stderr) bStart += self.blockSizes[x] return(chromStart, chromEnd) @@ -409,7 +418,7 @@ splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1, len(exon_sizes))] if debug: - print >> sys.stderr, "splice_sites: %s" % splice_sites + print("splice_sites: %s" % splice_sites, file=sys.stderr) junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] if seq: for i in range(3): @@ -419,9 +428,10 @@ tstop = len(translation) offset = (block_sum - i) % 3 if debug: - print >> sys.stderr,\ - "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ - % (i, tstart, tstop, offset, translation) + print("frame: %d\ttstart: %d tstop: %d " + + "offset: %d\t%s" % + (i, tstart, tstop, offset, translation), + file=sys.stderr) if not untrimmed: tstart = translation.rfind('*', 0, junc) + 1 stop = translation.find('*', junc) @@ -429,9 +439,10 @@ offset = (block_sum - i) % 3 trimmed = translation[tstart:tstop] if debug: - print >> sys.stderr,\ - "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ - % (i, tstart, tstop, offset, trimmed) + print("frame: %d\ttstart: %d tstop: %d " + + "offset: %d\t%s" % + (i, tstart, tstop, offset, trimmed), + file=sys.stderr) if filtering and tstart > ignore: continue # get genomic locations for start and end @@ -449,9 +460,10 @@ translations[i] = (chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts) if debug: - print >> sys.stderr,\ - "tblockCount: %d tblockStarts: %s tblockSizes: %s"\ - % (tblockCount, tblockStarts, tblockSizes) + print("tblockCount: %d tblockStarts: %s " + + "tblockSizes: %s" % + (tblockCount, tblockStarts, tblockSizes), + file=sys.stderr) return translations def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): diff -r d615767650a0 -r 817e54905a9f ensembl_rest.py --- a/ensembl_rest.py Sun Jan 14 14:06:15 2018 -0500 +++ b/ensembl_rest.py Mon Jan 22 13:59:02 2018 -0500 @@ -12,6 +12,8 @@ #------------------------------------------------------------------------------ """ +from __future__ import print_function +from __future__ import unicode_literals import sys @@ -28,10 +30,10 @@ def ensembl_rest(ext, headers): if debug: - print >> sys.stderr, "%s" % ext + print("%s" % ext, file=sys.stderr) r = requests.get(server+ext, headers=headers) if r.status_code == 429: - print >> sys.stderr, "response headers: %s\n" % r.headers + print("response headers: %s\n" % r.headers, file=sys.stderr) if 'Retry-After' in r.headers: sleep(r.headers['Retry-After']) r = requests.get(server+ext, headers=headers) @@ -47,12 +49,11 @@ r = ensembl_rest(ext, req_header) for species in r.json()['species']: results[species['name']] = species - print >> sys.stdout,\ - "%s\t%s\t%s\t%s\t%s"\ - % (species['name'], species['common_name'], + print("%s\t%s\t%s\t%s\t%s" % + (species['name'], species['common_name'], species['display_name'], species['strain'], - species['taxon_id']) + species['taxon_id']), file=sys.stdout) return results @@ -86,7 +87,7 @@ bed = [] param = params if params else '' req_header = {"Content-Type": "text/x-bed"} - regions = range(start, length, max_region) + regions = list(range(start, length, max_region)) if not regions or regions[-1] < length: regions.append(length) for end in regions[1:]: diff -r d615767650a0 -r 817e54905a9f translate_bed.py --- a/translate_bed.py Sun Jan 14 14:06:15 2018 -0500 +++ b/translate_bed.py Mon Jan 22 13:59:02 2018 -0500 @@ -12,6 +12,8 @@ #------------------------------------------------------------------------------ """ +from __future__ import print_function + import argparse import re import sys @@ -122,7 +124,7 @@ selected_regions[chrom] = [] selected_regions[chrom].append([start, end, strand]) if args.debug: - print >> sys.stderr, "selected_regions: %s" % selected_regions + print("selected_regions: %s" % selected_regions, file=sys.stderr) def filter_by_regions(bed): if not selected_regions: @@ -158,18 +160,21 @@ def get_sequence(chrom, start, end): if twobit: - if chrom in twobit: + if chrom in twobit and 0 <= start < end < len(twobit[chrom]): return twobit[chrom][start:end] contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom - if contig in twobit: + if contig in twobit and 0 <= start < end < len(twobit[contig]): return twobit[contig][start:end] return None - def write_translation(tbed, prot): + def write_translation(tbed, accession, peptide): if args.id_prefix: tbed.name = "%s%s" % (args.id_prefix, tbed.name) + probed = "%s\t%s\t%s\t%s%s" % (accession, peptide, + 'unique', args.reference, + '\t.' * 9) if bed_wtr: - bed_wtr.write("%s\t%s\n" % (str(tbed), prot)) + bed_wtr.write("%s\t%s\n" % (str(tbed), probed)) bed_wtr.flush() location = "chromosome:%s:%s:%s:%s:%s"\ % (args.reference, tbed.chrom, @@ -178,7 +183,7 @@ fa_db = '%s%s' % (args.fa_db, args.fa_sep) if args.fa_db else '' fa_id = ">%s%s%s\n" % (fa_db, tbed.name, fa_desc) fa_wtr.write(fa_id) - fa_wtr.write(prot) + fa_wtr.write(peptide) fa_wtr.write("\n") fa_wtr.flush() @@ -199,8 +204,8 @@ cds = bed.get_cds() if cds: if args.debug: - print >> sys.stderr, "cdna:%s" % str(cdna) - print >> sys.stderr, "cds: %s" % str(cds) + print("cdna:%s" % str(cdna), file=sys.stderr) + print("cds: %s" % str(cds), file=sys.stderr) if len(cds) % 3 != 0: cds = cds[:-(len(cds) % 3)] refprot = translate(cds) if cds else None @@ -208,6 +213,7 @@ refprot = None if args.cds: if refprot: + tbed = bed.get_cds_bed() if args.start_codon: m = refprot.find('M') if m < 0: @@ -220,15 +226,16 @@ bed.trim_cds((stop - len(refprot)) * 3) refprot = refprot[:stop] if len(refprot) >= args.min_length: - write_translation(bed, refprot) + write_translation(tbed, bed.name, refprot) return 1 return 0 if args.debug: - print >> sys.stderr, "%s\n" % (str(bed)) - print >> sys.stderr, "CDS: %s %d %d"\ - % (bed.strand, bed.cdna_offset_of_pos(bed.thickStart), - bed.cdna_offset_of_pos(bed.thickEnd)) - print >> sys.stderr, "refprot: %s" % str(refprot) + print("%s\n" % (str(bed)), file=sys.stderr) + print("CDS: %s %d %d" % + (bed.strand, bed.cdna_offset_of_pos(bed.thickStart), + bed.cdna_offset_of_pos(bed.thickEnd)), + file=sys.stderr) + print("refprot: %s" % str(refprot), file=sys.stderr) for offset in range(3): seqend = cdna_len - (cdna_len - offset) % 3 aaseq = translate(cdna[offset:seqend]) @@ -251,8 +258,8 @@ break is_cds = refprot and prot in refprot if args.debug: - print >> sys.stderr, "is_cds: %s %s"\ - % (str(is_cds), str(prot)) + print("is_cds: %s %s" % (str(is_cds), str(prot)), + file=sys.stderr) if len(prot) < args.min_length: pass elif not args.all and is_cds: @@ -265,7 +272,7 @@ if args.all or unique_prot(tbed, prot): translate_count += 1 tbed.name = prot_acc - write_translation(tbed, prot) + write_translation(tbed, bed.name, prot) aa_start = aa_end + 1 return translate_count @@ -284,13 +291,12 @@ if filter_by_regions(bed): translation_count += translate_bed(bed) except Exception as e: - print >> sys.stderr, "BED format Error: line %d: %s\n%s"\ - % (bedline, e) + print("BED format Error: line %d: %s\n%s" + % (i, bedline, e), file=sys.stderr) break if args.debug or args.verbose: - print >> sys.stderr,\ - "transcripts: %d\ttranslations: %d"\ - % (transcript_count, translation_count) + print("transcripts: %d\ttranslations: %d" + % (transcript_count, translation_count), file=sys.stderr) if __name__ == "__main__": diff -r d615767650a0 -r 817e54905a9f translate_bed.xml --- a/translate_bed.xml Sun Jan 14 14:06:15 2018 -0500 +++ b/translate_bed.xml Mon Jan 22 13:59:02 2018 -0500 @@ -126,13 +126,14 @@ - + + + default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,proteinAccession,peptideSequence,uniqueness,genomeReferenceVersion,psmScore,fdr,modifications,charge,expMassToCharge,calcMassToCharge,psmRank,datasetID,uri"/> - + @@ -194,13 +195,13 @@ - + - +