diff mature_mir_gff_translation.py @ 3:ee99c6374a3b draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af48e9f6df2717ffd3731a974be1ec36e4eff779"
author artbio
date Fri, 18 Oct 2019 19:18:17 -0400
parents 10f0e4c00b13
children
line wrap: on
line diff
--- a/mature_mir_gff_translation.py	Tue Jul 18 06:49:21 2017 -0400
+++ b/mature_mir_gff_translation.py	Fri Oct 18 19:18:17 2019 -0400
@@ -1,106 +1,109 @@
-#!/usr/bin/env python
-
-import sys
 import argparse
+from datetime import datetime
 
 
 def Parser():
     the_parser = argparse.ArgumentParser()
     the_parser.add_argument(
-        '--input', action="store", type=str, help="input miRBase GFF3 file")
+        '--gff_path', action="store", type=str,
+        help="path to miRBase GFF3 file")
     the_parser.add_argument(
-        '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates")
+        '--output', action="store", type=str,
+        help="output GFF3 file with converted mature mir coordinates")
     args = the_parser.parse_args()
     return args
 
-GFF3_header= '''##gff-version 3
-##generated by mature_mir_gff_translation.py
-#
-# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
-# microRNAs:               miRBase current_version
-# genome-build-id:         check http://mirbase.org/
-#
-# Hairpin precursor sequences have type "miRNA_primary_transcript". 
-# Note, these sequences do not represent the full primary transcript, 
-# rather a predicted stem-loop portion that includes the precursor 
-# miRNA. Mature sequences have type "miRNA".
-#
-'''
+
+def convert_and_print_gff(gff_input_file, output):
 
-def load_gff_in_dict(gff_input_file):
-    '''
-    Reads the gff3 file and return a dictionary of dictionaries
-    with keys equal to standard gff3 fields (9)
-    Note that the key of the primary dictionary is the ID
-    '''
+    def get_gff_header(gff_input_file):
+        string_list = []
+        for line in open(gff_input_file, "r"):
+            if line[0] == '#':
+                string_list.append(line)
+        string_list.append('# generated by mature_mir_gff_translation.py \
+                            %s\n#\n' % str(datetime.now()))
+        return ''.join(string_list)
+
     gff_dict = {}
+    # see https://github.com/ARTbio/tools-artbio/issues/246
+    # currently fixed by perl pretreatment or the gff3 file
     for line in open(gff_input_file, "r"):
-        if line[0]=="#":
+        if line[0] == "#":
             continue
-        gff_fields=line[:-1].split("\t")
-        ID=gff_fields[8].split("ID=")[1].split(";")[0]
-        gff_dict[ID] = {}
-        gff_dict[ID]["seqid"]=gff_fields[0]
-        gff_dict[ID]["source"]=gff_fields[1]
-        gff_dict[ID]["type"]=gff_fields[2]
-        gff_dict[ID]["start"]=gff_fields[3]
-        gff_dict[ID]["end"]=gff_fields[4]
-        gff_dict[ID]["score"]=gff_fields[5]
-        gff_dict[ID]["strand"]=gff_fields[6]
-        gff_dict[ID]["phase"]=gff_fields[7]
-        gff_dict[ID]["attributes"]=gff_fields[8]
-        if "Derives_from" in gff_dict[ID]["attributes"]:
-            parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1]
-            parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1]
-            gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript)
-    return gff_dict
-    
-
-def genome_to_mir_gff(gff_dict, output):
-    '''
-    Converts seqid field from chromosome to item Name
-    Then converts coordinates relative to "miRNA_primary_transcript"
-    Note that GFF files are 1-based coordinates
-    '''
-    for key in gff_dict:
-        name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
-        gff_dict[key]["seqid"]=name
-        if "Derives_from=" in gff_dict[key]["attributes"]:
-            parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0]
-            gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1)
-            gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1)
-    hairpins={}
-    matures={}
-    for key in gff_dict:  ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion
-        if gff_dict[key]["type"]=="miRNA_primary_transcript":
-            gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1)
-            gff_dict[key]["start"]="1"
-            # now, do a dict[ID]=Name but only for miRNA_primary_transcript
-            hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+        gff_fields = line[:-1].split("\t")
+        ID = gff_fields[8].split("ID=")[1].split(";")[0]
+        if gff_fields[2] == "miRNA_primary_transcript":
+            gff_dict[ID] = {}
+            gff_dict[ID]["premir_name"] = gff_fields[8].split(
+                "Name=")[1].split(";")[0]
+            gff_dict[ID]["primary"] = line[:-1]
+            gff_dict[ID]["miRNAs"] = []
+        elif gff_fields[2] == "miRNA":
+            if "_" in ID:
+                continue
+            parent_ID = gff_fields[8].split("erives_from=")[1]
+            gff_dict[parent_ID]["miRNAs"].append(line[:-1])
+    # Now reorganise features and recalculate coordinates of premirs and mirs
+    gff_list = []
+    for ID in sorted(gff_dict, key=lambda x: (gff_dict[x]['premir_name'])):
+        # delete premir and their mir with ID containing "_"
+        if "_" in ID:
+            del gff_dict[ID]
         else:
-            matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+            primary_fields = gff_dict[ID]["primary"].split('\t')
+            seqid = primary_fields[8].split("Name=")[1].split(";")[0]
+            source = primary_fields[1]
+            type = primary_fields[2]
+            start = primary_fields[3]
+            newstart = "1"
+            end = primary_fields[4]
+            newend = str(int(end)-int(start)+1)
+            score = primary_fields[5]
+            strand = primary_fields[6]
+            phase = primary_fields[7]
+            attributes = primary_fields[8]
+            gff_list.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (seqid,
+                            source, type, newstart, newend, score, strand,
+                            phase, attributes))
+            # ensure their is only 2 child miRNAs at best
+            if len(gff_dict[ID]["miRNAs"]) > 2:
+                gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][:2]
+            # sort child miRNAs 5p first 3p second,
+            # if there are two miR mature at least !
+            if len(gff_dict[ID]["miRNAs"]) > 1 and \
+                    gff_dict[ID]["miRNAs"][0].find('5p') == -1:
+                gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][::-1]
+            for mir in gff_dict[ID]["miRNAs"]:
+                mir_fields = mir.split('\t')
+                mir_seqid = mir_fields[8].split("Name=")[1].split(";")[0]
+                mir_source = mir_fields[1]
+                mir_type = mir_fields[2]
+                mir_start = mir_fields[3]
+                mir_end = mir_fields[4]
+                new_mir_start = str(int(mir_start)-int(start)+1)
+                new_mir_end = str(int(mir_end)-int(start)+1)
+                mir_score = mir_fields[5]
+                mir_strand = mir_fields[6]
+                mir_phase = mir_fields[7]
+                mir_attributes = mir_fields[8]
+                mir_sfx = ";Parent_mir_Name=%s" % gff_dict[ID]["premir_name"]
+                gff_list.append(
+                                '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s' % (
+                                    mir_seqid, mir_source, mir_type,
+                                    new_mir_start, new_mir_end, mir_score,
+                                    mir_strand, mir_phase, mir_attributes,
+                                    mir_sfx))
     with open(output, "w") as output:
-        output.write(GFF3_header)
-        for ID in sorted(hairpins, key=hairpins.get):
-            output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"],
-                gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"],
-                gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"],
-                gff_dict[ID]["attributes"]]))
-            output.write("\n")
-            for id in sorted(matures, key=matures.get, reverse=True):
-                if ID in gff_dict[id]["attributes"]:
-                    output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"],
-                        gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"],
-                        gff_dict[id]["score"], gff_dict[id]["strand"],
-                        gff_dict[id]["phase"], gff_dict[id]["attributes"]]))
-                    output.write("\n")
+        output.write('%s' % get_gff_header(gff_input_file))
+        output.write('\n'.join(gff_list))
+        output.write('\n')
 
 
-def main(infile, outfile):
-    gff_dict = load_gff_in_dict(infile)
-    genome_to_mir_gff(gff_dict, outfile)
+def main(gff_path, outfile):
+    convert_and_print_gff(gff_path, outfile)
 
 
 if __name__ == "__main__":
     args = Parser()
-    main(args.input, args.output)
\ No newline at end of file
+    main(args.gff_path, args.output)