Mercurial > repos > jjohnson > ensembl_cdna_translate
diff ensembl_cdna_translate.py @ 7:d59e3ce10e74 draft
Uploaded
author | jjohnson |
---|---|
date | Wed, 13 Dec 2017 11:15:34 -0500 |
parents | b7f2f5e3390c |
children | 5c92d0be6514 |
line wrap: on
line diff
--- a/ensembl_cdna_translate.py Wed Nov 29 17:46:00 2017 -0500 +++ b/ensembl_cdna_translate.py Wed Dec 13 11:15:34 2017 -0500 @@ -13,6 +13,7 @@ """ import argparse +import re import sys from time import sleep @@ -25,11 +26,11 @@ server = "https://rest.ensembl.org" ext = "/info/assembly/homo_sapiens?" -max_region = 5000000 +max_region = 4000000 def ensembl_rest(ext, headers): - # if True: print >> sys.stderr, "%s" % ext + if True: print >> sys.stderr, "%s" % ext r = requests.get(server+ext, headers=headers) if r.status_code == 429: print >> sys.stderr, "response headers: %s\n" % r.headers @@ -82,7 +83,7 @@ return coord_systems -def get_transcripts_bed(species, refseq, start, length,params=None): +def get_transcripts_bed(species, refseq, start, length, strand='', params=None): bed = [] param = params if params else '' req_header = {"Content-Type": "text/x-bed"} @@ -90,8 +91,8 @@ if not regions or regions[-1] < length: regions.append(length) for end in regions[1:]: - ext = "/overlap/region/%s/%s:%d-%d?feature=transcript;%s"\ - % (species, refseq, start, end, param) + ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\ + % (species, refseq, start, end, strand, param) start = end + 1 r = ensembl_rest(ext, req_header) if r.text: @@ -115,8 +116,17 @@ return get_seq(id, 'cds',params=params) +def get_transcript_haplotypes(species,transcript): + ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript) + req_header = {"Content-Type" : "application/json"} + r = ensembl_rest(ext, req_header) + decoded = r.json() + + def bed_from_line(line): fields = line.rstrip('\r\n').split('\t') + if len(fields) < 12: + return None (chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts) = fields[0:12] @@ -127,6 +137,15 @@ blockCount=blockCount, blockSizes=blockSizes.rstrip(','), blockStarts=blockStarts.rstrip(',')) + if len(fields) == 20: + bed_entry.second_name = fields[12] + bed_entry.cds_start_status = fields[13] + bed_entry.cds_end_status = fields[14] + bed_entry.exon_frames = fields[15] + bed_entry.biotype = fields[16] + bed_entry.gene_name = fields[17] + bed_entry.second_gene_name = fields[18] + bed_entry.gene_type = fields[19] return bed_entry @@ -157,6 +176,14 @@ self.blockStarts = [int(x) for x in blockStarts] else: self.blockStarts = blockStarts + self.second_name = None + self.cds_start_status = None + self.cds_end_status = None + self.exon_frames = None + self.biotype = None + self.gene_name = None + self.second_gene_name = None + self.gene_type = None self.seq = None self.pep = None @@ -246,6 +273,9 @@ '-s', '--species', default='human', help='Ensembl Species to retrieve') parser.add_argument( + '-R', '--regions', action='append', default=[], + help='Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+') + parser.add_argument( '-B', '--biotypes', action='append', default=[], help='Restrict Ensembl biotypes to retrieve') parser.add_argument( @@ -295,6 +325,22 @@ biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) # print >> sys.stderr, "biotypes: %s" % biotypes + selected_regions = dict() # chrom:(start,end) + region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' + if args.regions: + for entry in args.regions: + if not entry: + continue + regs = [x.strip() for x in entry.split(',') if x.strip()] + for reg in regs: + m = re.match(region_pat,reg) + if m: + (chrom,start,end,strand) = m.groups() + if chrom: + if chrom not in selected_regions: + selected_regions[chrom] = [] + selected_regions[chrom].append([start,end,strand]) + translations = dict() # start : end : seq def unique_prot(tbed, seq): @@ -364,11 +410,44 @@ aa_start = aa_end + 1 return translate_count + def translate_region(species,ref,start,stop,strand): + translation_count = 0 + regions = range(start, stop, max_region) + if not regions or regions[-1] < stop: + regions.append(stop) + for end in regions[1:]: + bedlines = get_transcripts_bed(species, ref, start, end, strand=strand, params=biotypes) + if args.verbose or args.debug: + print >> sys.stderr,\ + "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ + % (species, ref, start, end, len(bedlines)) + # start, end, seq + for i, bedline in enumerate(bedlines): + try: + bed = bed_from_line(bedline)\ + if any([not args.raw, fa_wtr, bed_wtr])\ + else None + if tx_wtr: + tx_wtr.write(bedline if args.raw else str(bed)) + tx_wtr.write("\n") + tx_wtr.flush() + if bed: + translation_count += translate_bed(bed) + except Exception as e: + print >> sys.stderr,\ + "BED error (%s) : %s\n" % (e, bedline) + start = end + 1 + return translation_count + if input_rdr: translation_count = 0 for i, bedline in enumerate(input_rdr): try: bed = bed_from_line(bedline) + if bed is None: + continue + if bed.biotype and biotypea and bed.biotype not in biotypea: + continue translation_count += translate_bed(bed) except: print >> sys.stderr, "BED format error: %s\n" % bedline @@ -378,48 +457,38 @@ else: coord_systems = get_toplevel(species) if 'chromosome' in coord_systems: + ref_lengths = dict() for ref in sorted(coord_systems['chromosome'].keys()): length = coord_systems['chromosome'][ref] + ref_lengths[ref] = length if not any([tx_wtr, fa_wtr, bed_wtr]): print >> sys.stderr,\ "%s\t%s\tlength: %d" % (species, ref, length) - continue - if args.debug: - print >> sys.stderr,\ - "Retrieving transcripts: %s\t%s\tlength: %d"\ - % (species, ref, length) + if selected_regions: translation_count = 0 + for ref in sorted(selected_regions.keys()): + if ref in ref_lengths: + for reg in selected_regions[ref]: + (_start,_stop,_strand) = reg + start = int(_start) if _start else 0 + stop = int(_stop) if _stop else ref_lengths[ref] + strand = '' if not _strand else ':1' if _strand == '+' else ':-1' + translation_count += translate_region(species,ref,start,stop,strand) + else: + strand = '' start = 0 - regions = range(start, length, max_region) - if not regions or regions[-1] < length: - regions.append(length) - for end in regions[1:]: - bedlines = get_transcripts_bed(species, ref, start, end, params=biotypes) - if args.verbose or args.debug: + for ref in sorted(ref_lengths.keys()): + length = ref_lengths[ref] + translation_count = 0 + if args.debug: print >> sys.stderr,\ - "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ - % (species, ref, start, end, len(bedlines)) - # start, end, seq - for i, bedline in enumerate(bedlines): - try: - bed = bed_from_line(bedline)\ - if any([not args.raw, fa_wtr, bed_wtr])\ - else None - if tx_wtr: - tx_wtr.write(bedline if args.raw else str(bed)) - tx_wtr.write("\n") - tx_wtr.flush() - if bed: - translation_count += translate_bed(bed) - except Exception as e: - print >> sys.stderr,\ - "BED error (%s) : %s\n" % (e, bedline) - start = end + 1 - - if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): - print >> sys.stderr,\ - "%s\t%s\tlength: %d\tcDNA translations:%d"\ - % (species, ref, length, translation_count) + "Retrieving transcripts: %s\t%s\tlength: %d"\ + % (species, ref, length) + translation_count += translate_region(species,ref,start,length,strand) + if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): + print >> sys.stderr,\ + "%s\t%s\tlength: %d\tcDNA translations:%d"\ + % (species, ref, length, translation_count) if __name__ == "__main__":