# HG changeset patch
# User galaxyp
# Date 1516647542 18000
# Node ID 817e54905a9f8e7e48a3e20d35b0721dc18b9756
# Parent d615767650a0545af37f69db45e0d5fa21590b74
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
diff -r d615767650a0 -r 817e54905a9f bedutil.py
--- a/bedutil.py Sun Jan 14 14:06:15 2018 -0500
+++ b/bedutil.py Mon Jan 22 13:59:02 2018 -0500
@@ -12,6 +12,8 @@
#------------------------------------------------------------------------------
"""
+from __future__ import print_function
+
import sys
from Bio.Seq import reverse_complement, translate
@@ -45,6 +47,17 @@
return bed_entry
+def as_int_list(obj):
+ if obj is None:
+ return None
+ if isinstance(obj, list):
+ return [int(x) for x in obj]
+ elif isinstance(obj, str):
+ return [int(x) for x in obj.split(',')]
+ else: # python2 unicode?
+ return [int(x) for x in str(obj).split(',')]
+
+
class BedEntry(object):
def __init__(self, chrom=None, chromStart=None, chromEnd=None,
name=None, score=None, strand=None,
@@ -60,18 +73,8 @@
self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd
self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100'
self.blockCount = int(blockCount)
- if isinstance(blockSizes, str) or isinstance(blockSizes, unicode):
- self.blockSizes = [int(x) for x in blockSizes.split(',')]
- elif isinstance(blockSizes, list):
- self.blockSizes = [int(x) for x in blockSizes]
- else:
- self.blockSizes = blockSizes
- if isinstance(blockStarts, str) or isinstance(blockSizes, unicode):
- self.blockStarts = [int(x) for x in blockStarts.split(',')]
- elif isinstance(blockStarts, list):
- self.blockStarts = [int(x) for x in blockStarts]
- else:
- self.blockStarts = blockStarts
+ self.blockSizes = as_int_list(blockSizes)
+ self.blockStarts = as_int_list(blockStarts)
self.second_name = None
self.cds_start_status = None
self.cds_end_status = None
@@ -165,6 +168,11 @@
return self.set_cds(min(cds_pos), max(cds_pos) + basepairs)
return None
+ def get_cds_bed(self):
+ cds_pos = [self.cdna_offset_of_pos(self.thickStart),
+ self.cdna_offset_of_pos(self.thickEnd)]
+ return self.trim(min(cds_pos), max(cds_pos))
+
def get_cigar(self):
cigar = ''
r = range(self.blockCount)
@@ -214,7 +222,7 @@
def pos_of_cdna_offet(self, offset):
if offset is not None and 0 <= offset < sum(self.blockSizes):
- r = range(self.blockCount)
+ r = list(range(self.blockCount))
rev = self.strand == '-'
if rev:
r.reverse()
@@ -233,7 +241,7 @@
def cdna_offset_of_pos(self, pos):
if not self.chromStart <= pos < self.chromEnd:
return -1
- r = range(self.blockCount)
+ r = list(range(self.blockCount))
rev = self.strand == '-'
if rev:
r.reverse()
@@ -248,19 +256,21 @@
def apply_variant(self, pos, ref, alt):
pos = int(pos)
if not ref or not alt:
- print >> sys.stderr, "variant requires ref and alt sequences"
+ print("variant requires ref and alt sequences", file=sys.stderr)
return
if not self.chromStart <= pos <= self.chromEnd:
- print >> sys.stderr, "variant not in entry %s: %s %d < %d < %d"\
- % (self.name, self.strand, self.chromStart, pos, self.chromEnd)
- print >> sys.stderr, "%s" % str(self)
+ print("variant not in entry %s: %s %d < %d < %d" %
+ (self.name, self.strand,
+ self.chromStart, pos, self.chromEnd),
+ file=sys.stderr)
+ print("%s" % str(self), file=sys.stderr)
return
if len(ref) != len(alt):
- print >> sys.stderr, "variant only works for snp: %s %s"\
- % (ref, alt)
+ print("variant only works for snp: %s %s" % (ref, alt),
+ file=sys.stderr)
return
if not self.seq:
- print >> sys.stderr, "variant entry %s has no seq" % self.name
+ print("variant entry %s has no seq" % self.name, file=sys.stderr)
return
"""
if self.strand == '-':
@@ -274,27 +284,27 @@
if offset is not None:
bases[offset+i] = alt[i]
else:
- print >> sys.stderr,\
- "variant offset %s: %s %d < %d < %d"\
- % (self.name, self.strand, self.chromStart,
- pos+1, self.chromEnd)
- print >> sys.stderr, "%s" % str(self)
+ print("variant offset %s: %s %d < %d < %d" %
+ (self.name, self.strand, self.chromStart,
+ pos+1, self.chromEnd), file=sys.stderr)
+ print("%s" % str(self), file=sys.stderr)
self.seq = ''.join(bases)
self.variants.append("g.%d%s>%s" % (pos+1, ref, alt))
def get_variant_bed(self, pos, ref, alt):
pos = int(pos)
if not ref or not alt:
- print >> sys.stderr, "variant requires ref and alt sequences"
+ print("variant requires ref and alt sequences", file=sys.stderr)
return None
if not self.chromStart <= pos <= self.chromEnd:
- print >> sys.stderr,\
- "variant not in entry %s: %s %d < %d < %d"\
- % (self.name, self.strand, self.chromStart, pos, self.chromEnd)
- print >> sys.stderr, "%s" % str(self)
+ print("variant not in entry %s: %s %d < %d < %d" %
+ (self.name, self.strand,
+ self.chromStart, pos, self.chromEnd),
+ file=sys.stderr)
+ print("%s" % str(self), file=sys.stderr)
return None
if not self.seq:
- print >> sys.stderr, "variant entry %s has no seq" % self.name
+ print("variant entry %s has no seq" % self.name, file=sys.stderr)
return None
tbed = BedEntry(chrom=self.chrom,
chromStart=self.chromStart, chromEnd=self.chromEnd,
@@ -330,8 +340,8 @@
chromStart = self.chromStart
chromEnd = self.chromEnd
if debug:
- print >> sys.stderr, "%s" % (str(self))
- r = range(self.blockCount)
+ print("%s" % (str(self)), file=sys.stderr)
+ r = list(range(self.blockCount))
if self.strand == '-':
r.reverse()
bStart = 0
@@ -353,10 +363,9 @@
chromStart = self.chromStart + self.blockStarts[x] +\
self.blockSizes[x] - (tstop - bStart)
if debug:
- print >> sys.stderr,\
- "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\
- % (x, self.strand, bStart, bEnd,
- tstart, tstop, chromStart, chromEnd)
+ print("%3d %s\t%d\t%d\t%d\t%d\t%d\t%d" %
+ (x, self.strand, bStart, bEnd,
+ tstart, tstop, chromStart, chromEnd), file=sys.stderr)
bStart += self.blockSizes[x]
return(chromStart, chromEnd)
@@ -409,7 +418,7 @@
splice_sites = [sum(exon_sizes[:x]) / 3
for x in range(1, len(exon_sizes))]
if debug:
- print >> sys.stderr, "splice_sites: %s" % splice_sites
+ print("splice_sites: %s" % splice_sites, file=sys.stderr)
junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
if seq:
for i in range(3):
@@ -419,9 +428,10 @@
tstop = len(translation)
offset = (block_sum - i) % 3
if debug:
- print >> sys.stderr,\
- "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\
- % (i, tstart, tstop, offset, translation)
+ print("frame: %d\ttstart: %d tstop: %d " +
+ "offset: %d\t%s" %
+ (i, tstart, tstop, offset, translation),
+ file=sys.stderr)
if not untrimmed:
tstart = translation.rfind('*', 0, junc) + 1
stop = translation.find('*', junc)
@@ -429,9 +439,10 @@
offset = (block_sum - i) % 3
trimmed = translation[tstart:tstop]
if debug:
- print >> sys.stderr,\
- "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\
- % (i, tstart, tstop, offset, trimmed)
+ print("frame: %d\ttstart: %d tstop: %d " +
+ "offset: %d\t%s" %
+ (i, tstart, tstop, offset, trimmed),
+ file=sys.stderr)
if filtering and tstart > ignore:
continue
# get genomic locations for start and end
@@ -449,9 +460,10 @@
translations[i] = (chromStart, chromEnd, trimmed,
tblockCount, tblockSizes, tblockStarts)
if debug:
- print >> sys.stderr,\
- "tblockCount: %d tblockStarts: %s tblockSizes: %s"\
- % (tblockCount, tblockStarts, tblockSizes)
+ print("tblockCount: %d tblockStarts: %s " +
+ "tblockSizes: %s" %
+ (tblockCount, tblockStarts, tblockSizes),
+ file=sys.stderr)
return translations
def get_seq_id(self, seqtype='unk:unk', reference='', frame=None):
diff -r d615767650a0 -r 817e54905a9f ensembl_rest.py
--- a/ensembl_rest.py Sun Jan 14 14:06:15 2018 -0500
+++ b/ensembl_rest.py Mon Jan 22 13:59:02 2018 -0500
@@ -12,6 +12,8 @@
#------------------------------------------------------------------------------
"""
+from __future__ import print_function
+from __future__ import unicode_literals
import sys
@@ -28,10 +30,10 @@
def ensembl_rest(ext, headers):
if debug:
- print >> sys.stderr, "%s" % ext
+ print("%s" % ext, file=sys.stderr)
r = requests.get(server+ext, headers=headers)
if r.status_code == 429:
- print >> sys.stderr, "response headers: %s\n" % r.headers
+ print("response headers: %s\n" % r.headers, file=sys.stderr)
if 'Retry-After' in r.headers:
sleep(r.headers['Retry-After'])
r = requests.get(server+ext, headers=headers)
@@ -47,12 +49,11 @@
r = ensembl_rest(ext, req_header)
for species in r.json()['species']:
results[species['name']] = species
- print >> sys.stdout,\
- "%s\t%s\t%s\t%s\t%s"\
- % (species['name'], species['common_name'],
+ print("%s\t%s\t%s\t%s\t%s" %
+ (species['name'], species['common_name'],
species['display_name'],
species['strain'],
- species['taxon_id'])
+ species['taxon_id']), file=sys.stdout)
return results
@@ -86,7 +87,7 @@
bed = []
param = params if params else ''
req_header = {"Content-Type": "text/x-bed"}
- regions = range(start, length, max_region)
+ regions = list(range(start, length, max_region))
if not regions or regions[-1] < length:
regions.append(length)
for end in regions[1:]:
diff -r d615767650a0 -r 817e54905a9f translate_bed.py
--- a/translate_bed.py Sun Jan 14 14:06:15 2018 -0500
+++ b/translate_bed.py Mon Jan 22 13:59:02 2018 -0500
@@ -12,6 +12,8 @@
#------------------------------------------------------------------------------
"""
+from __future__ import print_function
+
import argparse
import re
import sys
@@ -122,7 +124,7 @@
selected_regions[chrom] = []
selected_regions[chrom].append([start, end, strand])
if args.debug:
- print >> sys.stderr, "selected_regions: %s" % selected_regions
+ print("selected_regions: %s" % selected_regions, file=sys.stderr)
def filter_by_regions(bed):
if not selected_regions:
@@ -158,18 +160,21 @@
def get_sequence(chrom, start, end):
if twobit:
- if chrom in twobit:
+ if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
return twobit[chrom][start:end]
contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
- if contig in twobit:
+ if contig in twobit and 0 <= start < end < len(twobit[contig]):
return twobit[contig][start:end]
return None
- def write_translation(tbed, prot):
+ def write_translation(tbed, accession, peptide):
if args.id_prefix:
tbed.name = "%s%s" % (args.id_prefix, tbed.name)
+ probed = "%s\t%s\t%s\t%s%s" % (accession, peptide,
+ 'unique', args.reference,
+ '\t.' * 9)
if bed_wtr:
- bed_wtr.write("%s\t%s\n" % (str(tbed), prot))
+ bed_wtr.write("%s\t%s\n" % (str(tbed), probed))
bed_wtr.flush()
location = "chromosome:%s:%s:%s:%s:%s"\
% (args.reference, tbed.chrom,
@@ -178,7 +183,7 @@
fa_db = '%s%s' % (args.fa_db, args.fa_sep) if args.fa_db else ''
fa_id = ">%s%s%s\n" % (fa_db, tbed.name, fa_desc)
fa_wtr.write(fa_id)
- fa_wtr.write(prot)
+ fa_wtr.write(peptide)
fa_wtr.write("\n")
fa_wtr.flush()
@@ -199,8 +204,8 @@
cds = bed.get_cds()
if cds:
if args.debug:
- print >> sys.stderr, "cdna:%s" % str(cdna)
- print >> sys.stderr, "cds: %s" % str(cds)
+ print("cdna:%s" % str(cdna), file=sys.stderr)
+ print("cds: %s" % str(cds), file=sys.stderr)
if len(cds) % 3 != 0:
cds = cds[:-(len(cds) % 3)]
refprot = translate(cds) if cds else None
@@ -208,6 +213,7 @@
refprot = None
if args.cds:
if refprot:
+ tbed = bed.get_cds_bed()
if args.start_codon:
m = refprot.find('M')
if m < 0:
@@ -220,15 +226,16 @@
bed.trim_cds((stop - len(refprot)) * 3)
refprot = refprot[:stop]
if len(refprot) >= args.min_length:
- write_translation(bed, refprot)
+ write_translation(tbed, bed.name, refprot)
return 1
return 0
if args.debug:
- print >> sys.stderr, "%s\n" % (str(bed))
- print >> sys.stderr, "CDS: %s %d %d"\
- % (bed.strand, bed.cdna_offset_of_pos(bed.thickStart),
- bed.cdna_offset_of_pos(bed.thickEnd))
- print >> sys.stderr, "refprot: %s" % str(refprot)
+ print("%s\n" % (str(bed)), file=sys.stderr)
+ print("CDS: %s %d %d" %
+ (bed.strand, bed.cdna_offset_of_pos(bed.thickStart),
+ bed.cdna_offset_of_pos(bed.thickEnd)),
+ file=sys.stderr)
+ print("refprot: %s" % str(refprot), file=sys.stderr)
for offset in range(3):
seqend = cdna_len - (cdna_len - offset) % 3
aaseq = translate(cdna[offset:seqend])
@@ -251,8 +258,8 @@
break
is_cds = refprot and prot in refprot
if args.debug:
- print >> sys.stderr, "is_cds: %s %s"\
- % (str(is_cds), str(prot))
+ print("is_cds: %s %s" % (str(is_cds), str(prot)),
+ file=sys.stderr)
if len(prot) < args.min_length:
pass
elif not args.all and is_cds:
@@ -265,7 +272,7 @@
if args.all or unique_prot(tbed, prot):
translate_count += 1
tbed.name = prot_acc
- write_translation(tbed, prot)
+ write_translation(tbed, bed.name, prot)
aa_start = aa_end + 1
return translate_count
@@ -284,13 +291,12 @@
if filter_by_regions(bed):
translation_count += translate_bed(bed)
except Exception as e:
- print >> sys.stderr, "BED format Error: line %d: %s\n%s"\
- % (bedline, e)
+ print("BED format Error: line %d: %s\n%s"
+ % (i, bedline, e), file=sys.stderr)
break
if args.debug or args.verbose:
- print >> sys.stderr,\
- "transcripts: %d\ttranslations: %d"\
- % (transcript_count, translation_count)
+ print("transcripts: %d\ttranslations: %d"
+ % (transcript_count, translation_count), file=sys.stderr)
if __name__ == "__main__":
diff -r d615767650a0 -r 817e54905a9f translate_bed.xml
--- a/translate_bed.xml Sun Jan 14 14:06:15 2018 -0500
+++ b/translate_bed.xml Mon Jan 22 13:59:02 2018 -0500
@@ -126,13 +126,14 @@
-
+
+
+ default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,proteinAccession,peptideSequence,uniqueness,genomeReferenceVersion,psmScore,fdr,modifications,charge,expMassToCharge,calcMassToCharge,psmRank,datasetID,uri"/>
-
+
@@ -194,13 +195,13 @@