Mercurial > repos > artbio > mircounts
diff mature_mir_gff_translation.py @ 3:ee99c6374a3b draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af48e9f6df2717ffd3731a974be1ec36e4eff779"
author | artbio |
---|---|
date | Fri, 18 Oct 2019 19:18:17 -0400 |
parents | 10f0e4c00b13 |
children |
line wrap: on
line diff
--- a/mature_mir_gff_translation.py Tue Jul 18 06:49:21 2017 -0400 +++ b/mature_mir_gff_translation.py Fri Oct 18 19:18:17 2019 -0400 @@ -1,106 +1,109 @@ -#!/usr/bin/env python - -import sys import argparse +from datetime import datetime def Parser(): the_parser = argparse.ArgumentParser() the_parser.add_argument( - '--input', action="store", type=str, help="input miRBase GFF3 file") + '--gff_path', action="store", type=str, + help="path to miRBase GFF3 file") the_parser.add_argument( - '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates") + '--output', action="store", type=str, + help="output GFF3 file with converted mature mir coordinates") args = the_parser.parse_args() return args -GFF3_header= '''##gff-version 3 -##generated by mature_mir_gff_translation.py -# -# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** -# microRNAs: miRBase current_version -# genome-build-id: check http://mirbase.org/ -# -# Hairpin precursor sequences have type "miRNA_primary_transcript". -# Note, these sequences do not represent the full primary transcript, -# rather a predicted stem-loop portion that includes the precursor -# miRNA. Mature sequences have type "miRNA". -# -''' + +def convert_and_print_gff(gff_input_file, output): -def load_gff_in_dict(gff_input_file): - ''' - Reads the gff3 file and return a dictionary of dictionaries - with keys equal to standard gff3 fields (9) - Note that the key of the primary dictionary is the ID - ''' + def get_gff_header(gff_input_file): + string_list = [] + for line in open(gff_input_file, "r"): + if line[0] == '#': + string_list.append(line) + string_list.append('# generated by mature_mir_gff_translation.py \ + %s\n#\n' % str(datetime.now())) + return ''.join(string_list) + gff_dict = {} + # see https://github.com/ARTbio/tools-artbio/issues/246 + # currently fixed by perl pretreatment or the gff3 file for line in open(gff_input_file, "r"): - if line[0]=="#": + if line[0] == "#": continue - gff_fields=line[:-1].split("\t") - ID=gff_fields[8].split("ID=")[1].split(";")[0] - gff_dict[ID] = {} - gff_dict[ID]["seqid"]=gff_fields[0] - gff_dict[ID]["source"]=gff_fields[1] - gff_dict[ID]["type"]=gff_fields[2] - gff_dict[ID]["start"]=gff_fields[3] - gff_dict[ID]["end"]=gff_fields[4] - gff_dict[ID]["score"]=gff_fields[5] - gff_dict[ID]["strand"]=gff_fields[6] - gff_dict[ID]["phase"]=gff_fields[7] - gff_dict[ID]["attributes"]=gff_fields[8] - if "Derives_from" in gff_dict[ID]["attributes"]: - parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1] - parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1] - gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript) - return gff_dict - - -def genome_to_mir_gff(gff_dict, output): - ''' - Converts seqid field from chromosome to item Name - Then converts coordinates relative to "miRNA_primary_transcript" - Note that GFF files are 1-based coordinates - ''' - for key in gff_dict: - name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] - gff_dict[key]["seqid"]=name - if "Derives_from=" in gff_dict[key]["attributes"]: - parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0] - gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1) - gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1) - hairpins={} - matures={} - for key in gff_dict: ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion - if gff_dict[key]["type"]=="miRNA_primary_transcript": - gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1) - gff_dict[key]["start"]="1" - # now, do a dict[ID]=Name but only for miRNA_primary_transcript - hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] + gff_fields = line[:-1].split("\t") + ID = gff_fields[8].split("ID=")[1].split(";")[0] + if gff_fields[2] == "miRNA_primary_transcript": + gff_dict[ID] = {} + gff_dict[ID]["premir_name"] = gff_fields[8].split( + "Name=")[1].split(";")[0] + gff_dict[ID]["primary"] = line[:-1] + gff_dict[ID]["miRNAs"] = [] + elif gff_fields[2] == "miRNA": + if "_" in ID: + continue + parent_ID = gff_fields[8].split("erives_from=")[1] + gff_dict[parent_ID]["miRNAs"].append(line[:-1]) + # Now reorganise features and recalculate coordinates of premirs and mirs + gff_list = [] + for ID in sorted(gff_dict, key=lambda x: (gff_dict[x]['premir_name'])): + # delete premir and their mir with ID containing "_" + if "_" in ID: + del gff_dict[ID] else: - matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] + primary_fields = gff_dict[ID]["primary"].split('\t') + seqid = primary_fields[8].split("Name=")[1].split(";")[0] + source = primary_fields[1] + type = primary_fields[2] + start = primary_fields[3] + newstart = "1" + end = primary_fields[4] + newend = str(int(end)-int(start)+1) + score = primary_fields[5] + strand = primary_fields[6] + phase = primary_fields[7] + attributes = primary_fields[8] + gff_list.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (seqid, + source, type, newstart, newend, score, strand, + phase, attributes)) + # ensure their is only 2 child miRNAs at best + if len(gff_dict[ID]["miRNAs"]) > 2: + gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][:2] + # sort child miRNAs 5p first 3p second, + # if there are two miR mature at least ! + if len(gff_dict[ID]["miRNAs"]) > 1 and \ + gff_dict[ID]["miRNAs"][0].find('5p') == -1: + gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][::-1] + for mir in gff_dict[ID]["miRNAs"]: + mir_fields = mir.split('\t') + mir_seqid = mir_fields[8].split("Name=")[1].split(";")[0] + mir_source = mir_fields[1] + mir_type = mir_fields[2] + mir_start = mir_fields[3] + mir_end = mir_fields[4] + new_mir_start = str(int(mir_start)-int(start)+1) + new_mir_end = str(int(mir_end)-int(start)+1) + mir_score = mir_fields[5] + mir_strand = mir_fields[6] + mir_phase = mir_fields[7] + mir_attributes = mir_fields[8] + mir_sfx = ";Parent_mir_Name=%s" % gff_dict[ID]["premir_name"] + gff_list.append( + '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s' % ( + mir_seqid, mir_source, mir_type, + new_mir_start, new_mir_end, mir_score, + mir_strand, mir_phase, mir_attributes, + mir_sfx)) with open(output, "w") as output: - output.write(GFF3_header) - for ID in sorted(hairpins, key=hairpins.get): - output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"], - gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"], - gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"], - gff_dict[ID]["attributes"]])) - output.write("\n") - for id in sorted(matures, key=matures.get, reverse=True): - if ID in gff_dict[id]["attributes"]: - output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"], - gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"], - gff_dict[id]["score"], gff_dict[id]["strand"], - gff_dict[id]["phase"], gff_dict[id]["attributes"]])) - output.write("\n") + output.write('%s' % get_gff_header(gff_input_file)) + output.write('\n'.join(gff_list)) + output.write('\n') -def main(infile, outfile): - gff_dict = load_gff_in_dict(infile) - genome_to_mir_gff(gff_dict, outfile) +def main(gff_path, outfile): + convert_and_print_gff(gff_path, outfile) if __name__ == "__main__": args = Parser() - main(args.input, args.output) \ No newline at end of file + main(args.gff_path, args.output)