Mercurial > repos > artbio > mircounts
comparison mature_mir_gff_translation.py @ 3:ee99c6374a3b draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af48e9f6df2717ffd3731a974be1ec36e4eff779"
author | artbio |
---|---|
date | Fri, 18 Oct 2019 19:18:17 -0400 |
parents | 10f0e4c00b13 |
children |
comparison
equal
deleted
inserted
replaced
2:fe1ed513da99 | 3:ee99c6374a3b |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import argparse | 1 import argparse |
2 from datetime import datetime | |
5 | 3 |
6 | 4 |
7 def Parser(): | 5 def Parser(): |
8 the_parser = argparse.ArgumentParser() | 6 the_parser = argparse.ArgumentParser() |
9 the_parser.add_argument( | 7 the_parser.add_argument( |
10 '--input', action="store", type=str, help="input miRBase GFF3 file") | 8 '--gff_path', action="store", type=str, |
9 help="path to miRBase GFF3 file") | |
11 the_parser.add_argument( | 10 the_parser.add_argument( |
12 '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates") | 11 '--output', action="store", type=str, |
12 help="output GFF3 file with converted mature mir coordinates") | |
13 args = the_parser.parse_args() | 13 args = the_parser.parse_args() |
14 return args | 14 return args |
15 | 15 |
16 GFF3_header= '''##gff-version 3 | |
17 ##generated by mature_mir_gff_translation.py | |
18 # | |
19 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** | |
20 # microRNAs: miRBase current_version | |
21 # genome-build-id: check http://mirbase.org/ | |
22 # | |
23 # Hairpin precursor sequences have type "miRNA_primary_transcript". | |
24 # Note, these sequences do not represent the full primary transcript, | |
25 # rather a predicted stem-loop portion that includes the precursor | |
26 # miRNA. Mature sequences have type "miRNA". | |
27 # | |
28 ''' | |
29 | 16 |
30 def load_gff_in_dict(gff_input_file): | 17 def convert_and_print_gff(gff_input_file, output): |
31 ''' | 18 |
32 Reads the gff3 file and return a dictionary of dictionaries | 19 def get_gff_header(gff_input_file): |
33 with keys equal to standard gff3 fields (9) | 20 string_list = [] |
34 Note that the key of the primary dictionary is the ID | 21 for line in open(gff_input_file, "r"): |
35 ''' | 22 if line[0] == '#': |
23 string_list.append(line) | |
24 string_list.append('# generated by mature_mir_gff_translation.py \ | |
25 %s\n#\n' % str(datetime.now())) | |
26 return ''.join(string_list) | |
27 | |
36 gff_dict = {} | 28 gff_dict = {} |
29 # see https://github.com/ARTbio/tools-artbio/issues/246 | |
30 # currently fixed by perl pretreatment or the gff3 file | |
37 for line in open(gff_input_file, "r"): | 31 for line in open(gff_input_file, "r"): |
38 if line[0]=="#": | 32 if line[0] == "#": |
39 continue | 33 continue |
40 gff_fields=line[:-1].split("\t") | 34 gff_fields = line[:-1].split("\t") |
41 ID=gff_fields[8].split("ID=")[1].split(";")[0] | 35 ID = gff_fields[8].split("ID=")[1].split(";")[0] |
42 gff_dict[ID] = {} | 36 if gff_fields[2] == "miRNA_primary_transcript": |
43 gff_dict[ID]["seqid"]=gff_fields[0] | 37 gff_dict[ID] = {} |
44 gff_dict[ID]["source"]=gff_fields[1] | 38 gff_dict[ID]["premir_name"] = gff_fields[8].split( |
45 gff_dict[ID]["type"]=gff_fields[2] | 39 "Name=")[1].split(";")[0] |
46 gff_dict[ID]["start"]=gff_fields[3] | 40 gff_dict[ID]["primary"] = line[:-1] |
47 gff_dict[ID]["end"]=gff_fields[4] | 41 gff_dict[ID]["miRNAs"] = [] |
48 gff_dict[ID]["score"]=gff_fields[5] | 42 elif gff_fields[2] == "miRNA": |
49 gff_dict[ID]["strand"]=gff_fields[6] | 43 if "_" in ID: |
50 gff_dict[ID]["phase"]=gff_fields[7] | 44 continue |
51 gff_dict[ID]["attributes"]=gff_fields[8] | 45 parent_ID = gff_fields[8].split("erives_from=")[1] |
52 if "Derives_from" in gff_dict[ID]["attributes"]: | 46 gff_dict[parent_ID]["miRNAs"].append(line[:-1]) |
53 parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1] | 47 # Now reorganise features and recalculate coordinates of premirs and mirs |
54 parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1] | 48 gff_list = [] |
55 gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript) | 49 for ID in sorted(gff_dict, key=lambda x: (gff_dict[x]['premir_name'])): |
56 return gff_dict | 50 # delete premir and their mir with ID containing "_" |
57 | 51 if "_" in ID: |
58 | 52 del gff_dict[ID] |
59 def genome_to_mir_gff(gff_dict, output): | |
60 ''' | |
61 Converts seqid field from chromosome to item Name | |
62 Then converts coordinates relative to "miRNA_primary_transcript" | |
63 Note that GFF files are 1-based coordinates | |
64 ''' | |
65 for key in gff_dict: | |
66 name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] | |
67 gff_dict[key]["seqid"]=name | |
68 if "Derives_from=" in gff_dict[key]["attributes"]: | |
69 parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0] | |
70 gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1) | |
71 gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1) | |
72 hairpins={} | |
73 matures={} | |
74 for key in gff_dict: ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion | |
75 if gff_dict[key]["type"]=="miRNA_primary_transcript": | |
76 gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1) | |
77 gff_dict[key]["start"]="1" | |
78 # now, do a dict[ID]=Name but only for miRNA_primary_transcript | |
79 hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] | |
80 else: | 53 else: |
81 matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] | 54 primary_fields = gff_dict[ID]["primary"].split('\t') |
55 seqid = primary_fields[8].split("Name=")[1].split(";")[0] | |
56 source = primary_fields[1] | |
57 type = primary_fields[2] | |
58 start = primary_fields[3] | |
59 newstart = "1" | |
60 end = primary_fields[4] | |
61 newend = str(int(end)-int(start)+1) | |
62 score = primary_fields[5] | |
63 strand = primary_fields[6] | |
64 phase = primary_fields[7] | |
65 attributes = primary_fields[8] | |
66 gff_list.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (seqid, | |
67 source, type, newstart, newend, score, strand, | |
68 phase, attributes)) | |
69 # ensure their is only 2 child miRNAs at best | |
70 if len(gff_dict[ID]["miRNAs"]) > 2: | |
71 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][:2] | |
72 # sort child miRNAs 5p first 3p second, | |
73 # if there are two miR mature at least ! | |
74 if len(gff_dict[ID]["miRNAs"]) > 1 and \ | |
75 gff_dict[ID]["miRNAs"][0].find('5p') == -1: | |
76 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][::-1] | |
77 for mir in gff_dict[ID]["miRNAs"]: | |
78 mir_fields = mir.split('\t') | |
79 mir_seqid = mir_fields[8].split("Name=")[1].split(";")[0] | |
80 mir_source = mir_fields[1] | |
81 mir_type = mir_fields[2] | |
82 mir_start = mir_fields[3] | |
83 mir_end = mir_fields[4] | |
84 new_mir_start = str(int(mir_start)-int(start)+1) | |
85 new_mir_end = str(int(mir_end)-int(start)+1) | |
86 mir_score = mir_fields[5] | |
87 mir_strand = mir_fields[6] | |
88 mir_phase = mir_fields[7] | |
89 mir_attributes = mir_fields[8] | |
90 mir_sfx = ";Parent_mir_Name=%s" % gff_dict[ID]["premir_name"] | |
91 gff_list.append( | |
92 '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s' % ( | |
93 mir_seqid, mir_source, mir_type, | |
94 new_mir_start, new_mir_end, mir_score, | |
95 mir_strand, mir_phase, mir_attributes, | |
96 mir_sfx)) | |
82 with open(output, "w") as output: | 97 with open(output, "w") as output: |
83 output.write(GFF3_header) | 98 output.write('%s' % get_gff_header(gff_input_file)) |
84 for ID in sorted(hairpins, key=hairpins.get): | 99 output.write('\n'.join(gff_list)) |
85 output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"], | 100 output.write('\n') |
86 gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"], | |
87 gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"], | |
88 gff_dict[ID]["attributes"]])) | |
89 output.write("\n") | |
90 for id in sorted(matures, key=matures.get, reverse=True): | |
91 if ID in gff_dict[id]["attributes"]: | |
92 output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"], | |
93 gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"], | |
94 gff_dict[id]["score"], gff_dict[id]["strand"], | |
95 gff_dict[id]["phase"], gff_dict[id]["attributes"]])) | |
96 output.write("\n") | |
97 | 101 |
98 | 102 |
99 def main(infile, outfile): | 103 def main(gff_path, outfile): |
100 gff_dict = load_gff_in_dict(infile) | 104 convert_and_print_gff(gff_path, outfile) |
101 genome_to_mir_gff(gff_dict, outfile) | |
102 | 105 |
103 | 106 |
104 if __name__ == "__main__": | 107 if __name__ == "__main__": |
105 args = Parser() | 108 args = Parser() |
106 main(args.input, args.output) | 109 main(args.gff_path, args.output) |