comparison mature_mir_gff_translation.py @ 3:ee99c6374a3b draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af48e9f6df2717ffd3731a974be1ec36e4eff779"
author artbio
date Fri, 18 Oct 2019 19:18:17 -0400
parents 10f0e4c00b13
children
comparison
equal deleted inserted replaced
2:fe1ed513da99 3:ee99c6374a3b
1 #!/usr/bin/env python
2
3 import sys
4 import argparse 1 import argparse
2 from datetime import datetime
5 3
6 4
7 def Parser(): 5 def Parser():
8 the_parser = argparse.ArgumentParser() 6 the_parser = argparse.ArgumentParser()
9 the_parser.add_argument( 7 the_parser.add_argument(
10 '--input', action="store", type=str, help="input miRBase GFF3 file") 8 '--gff_path', action="store", type=str,
9 help="path to miRBase GFF3 file")
11 the_parser.add_argument( 10 the_parser.add_argument(
12 '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates") 11 '--output', action="store", type=str,
12 help="output GFF3 file with converted mature mir coordinates")
13 args = the_parser.parse_args() 13 args = the_parser.parse_args()
14 return args 14 return args
15 15
16 GFF3_header= '''##gff-version 3
17 ##generated by mature_mir_gff_translation.py
18 #
19 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
20 # microRNAs: miRBase current_version
21 # genome-build-id: check http://mirbase.org/
22 #
23 # Hairpin precursor sequences have type "miRNA_primary_transcript".
24 # Note, these sequences do not represent the full primary transcript,
25 # rather a predicted stem-loop portion that includes the precursor
26 # miRNA. Mature sequences have type "miRNA".
27 #
28 '''
29 16
30 def load_gff_in_dict(gff_input_file): 17 def convert_and_print_gff(gff_input_file, output):
31 ''' 18
32 Reads the gff3 file and return a dictionary of dictionaries 19 def get_gff_header(gff_input_file):
33 with keys equal to standard gff3 fields (9) 20 string_list = []
34 Note that the key of the primary dictionary is the ID 21 for line in open(gff_input_file, "r"):
35 ''' 22 if line[0] == '#':
23 string_list.append(line)
24 string_list.append('# generated by mature_mir_gff_translation.py \
25 %s\n#\n' % str(datetime.now()))
26 return ''.join(string_list)
27
36 gff_dict = {} 28 gff_dict = {}
29 # see https://github.com/ARTbio/tools-artbio/issues/246
30 # currently fixed by perl pretreatment or the gff3 file
37 for line in open(gff_input_file, "r"): 31 for line in open(gff_input_file, "r"):
38 if line[0]=="#": 32 if line[0] == "#":
39 continue 33 continue
40 gff_fields=line[:-1].split("\t") 34 gff_fields = line[:-1].split("\t")
41 ID=gff_fields[8].split("ID=")[1].split(";")[0] 35 ID = gff_fields[8].split("ID=")[1].split(";")[0]
42 gff_dict[ID] = {} 36 if gff_fields[2] == "miRNA_primary_transcript":
43 gff_dict[ID]["seqid"]=gff_fields[0] 37 gff_dict[ID] = {}
44 gff_dict[ID]["source"]=gff_fields[1] 38 gff_dict[ID]["premir_name"] = gff_fields[8].split(
45 gff_dict[ID]["type"]=gff_fields[2] 39 "Name=")[1].split(";")[0]
46 gff_dict[ID]["start"]=gff_fields[3] 40 gff_dict[ID]["primary"] = line[:-1]
47 gff_dict[ID]["end"]=gff_fields[4] 41 gff_dict[ID]["miRNAs"] = []
48 gff_dict[ID]["score"]=gff_fields[5] 42 elif gff_fields[2] == "miRNA":
49 gff_dict[ID]["strand"]=gff_fields[6] 43 if "_" in ID:
50 gff_dict[ID]["phase"]=gff_fields[7] 44 continue
51 gff_dict[ID]["attributes"]=gff_fields[8] 45 parent_ID = gff_fields[8].split("erives_from=")[1]
52 if "Derives_from" in gff_dict[ID]["attributes"]: 46 gff_dict[parent_ID]["miRNAs"].append(line[:-1])
53 parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1] 47 # Now reorganise features and recalculate coordinates of premirs and mirs
54 parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1] 48 gff_list = []
55 gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript) 49 for ID in sorted(gff_dict, key=lambda x: (gff_dict[x]['premir_name'])):
56 return gff_dict 50 # delete premir and their mir with ID containing "_"
57 51 if "_" in ID:
58 52 del gff_dict[ID]
59 def genome_to_mir_gff(gff_dict, output):
60 '''
61 Converts seqid field from chromosome to item Name
62 Then converts coordinates relative to "miRNA_primary_transcript"
63 Note that GFF files are 1-based coordinates
64 '''
65 for key in gff_dict:
66 name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
67 gff_dict[key]["seqid"]=name
68 if "Derives_from=" in gff_dict[key]["attributes"]:
69 parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0]
70 gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1)
71 gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1)
72 hairpins={}
73 matures={}
74 for key in gff_dict: ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion
75 if gff_dict[key]["type"]=="miRNA_primary_transcript":
76 gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1)
77 gff_dict[key]["start"]="1"
78 # now, do a dict[ID]=Name but only for miRNA_primary_transcript
79 hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
80 else: 53 else:
81 matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] 54 primary_fields = gff_dict[ID]["primary"].split('\t')
55 seqid = primary_fields[8].split("Name=")[1].split(";")[0]
56 source = primary_fields[1]
57 type = primary_fields[2]
58 start = primary_fields[3]
59 newstart = "1"
60 end = primary_fields[4]
61 newend = str(int(end)-int(start)+1)
62 score = primary_fields[5]
63 strand = primary_fields[6]
64 phase = primary_fields[7]
65 attributes = primary_fields[8]
66 gff_list.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (seqid,
67 source, type, newstart, newend, score, strand,
68 phase, attributes))
69 # ensure their is only 2 child miRNAs at best
70 if len(gff_dict[ID]["miRNAs"]) > 2:
71 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][:2]
72 # sort child miRNAs 5p first 3p second,
73 # if there are two miR mature at least !
74 if len(gff_dict[ID]["miRNAs"]) > 1 and \
75 gff_dict[ID]["miRNAs"][0].find('5p') == -1:
76 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][::-1]
77 for mir in gff_dict[ID]["miRNAs"]:
78 mir_fields = mir.split('\t')
79 mir_seqid = mir_fields[8].split("Name=")[1].split(";")[0]
80 mir_source = mir_fields[1]
81 mir_type = mir_fields[2]
82 mir_start = mir_fields[3]
83 mir_end = mir_fields[4]
84 new_mir_start = str(int(mir_start)-int(start)+1)
85 new_mir_end = str(int(mir_end)-int(start)+1)
86 mir_score = mir_fields[5]
87 mir_strand = mir_fields[6]
88 mir_phase = mir_fields[7]
89 mir_attributes = mir_fields[8]
90 mir_sfx = ";Parent_mir_Name=%s" % gff_dict[ID]["premir_name"]
91 gff_list.append(
92 '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s' % (
93 mir_seqid, mir_source, mir_type,
94 new_mir_start, new_mir_end, mir_score,
95 mir_strand, mir_phase, mir_attributes,
96 mir_sfx))
82 with open(output, "w") as output: 97 with open(output, "w") as output:
83 output.write(GFF3_header) 98 output.write('%s' % get_gff_header(gff_input_file))
84 for ID in sorted(hairpins, key=hairpins.get): 99 output.write('\n'.join(gff_list))
85 output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"], 100 output.write('\n')
86 gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"],
87 gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"],
88 gff_dict[ID]["attributes"]]))
89 output.write("\n")
90 for id in sorted(matures, key=matures.get, reverse=True):
91 if ID in gff_dict[id]["attributes"]:
92 output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"],
93 gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"],
94 gff_dict[id]["score"], gff_dict[id]["strand"],
95 gff_dict[id]["phase"], gff_dict[id]["attributes"]]))
96 output.write("\n")
97 101
98 102
99 def main(infile, outfile): 103 def main(gff_path, outfile):
100 gff_dict = load_gff_in_dict(infile) 104 convert_and_print_gff(gff_path, outfile)
101 genome_to_mir_gff(gff_dict, outfile)
102 105
103 106
104 if __name__ == "__main__": 107 if __name__ == "__main__":
105 args = Parser() 108 args = Parser()
106 main(args.input, args.output) 109 main(args.gff_path, args.output)