diff scripts/format_transdecoder_headers.sh @ 1:3f862f346967 draft

planemo upload for repository https://github.com/abims-sbr/adaptearch commit cf1b9c905931ca2ca25faa4844d45c908756472f-dirty
author abims-sbr
date Wed, 17 Jan 2018 09:02:12 -0500
parents f3600c96e961
children
line wrap: on
line diff
--- a/scripts/format_transdecoder_headers.sh	Fri Oct 27 10:38:52 2017 -0400
+++ b/scripts/format_transdecoder_headers.sh	Wed Jan 17 09:02:12 2018 -0500
@@ -1,19 +1,27 @@
 #/bin/bash
 
+# v2 - this script modifies the 'Orthogroups.txt' file in order to make it easily readable by the following script, filter_orthofinder.py
+  #Example :
+    #OG0000001: Gene.117__As119_1/1_1.000_543__g.117__m.117 Gene.157__As170_1/1_1.000_1203__g.157__m.157
+  #Becomes :
+    #>As119_1/1_1.000_543 >As170_1/1_1.000_1203
+    
+# removes 'OGxxxxxxx: '
+sed -E 's/OG[0-9]{7,}:\s//' $1 > $2
+# replaces things like Gene.119__ with '>'
+sed -i -E 's/Gene\.[0-9]{1,}\_\_/>/g' $2
+# removes things like __g.117__m.117
+sed -i -E 's/\_\_g\.[0-9]{1,}\_\_m\.[0-9]{1,}//g' $2
 
-#This script contains regex to re-write the outputs of transdecoder to the original AdaptSearch format 
-#Example :
-#OG0007971: m.35 g.35  ORF g.35 m.35 type_internal len_307 _+_ Th132_1/1_1.000_923_1-924_+_
-#Becomes :
-#Th132_1/1_1.000_923
+# Old version
 
 # removes 'OGxxxxxxx '
-sed -i -E 's/OG[0-9]{7}:\s//' $1 
+#sed -E 's/OG[0-9]{7}:\s//' $1 > $2
 # replace _+_ by (+) because '_' causes bugs
-sed -i 's/_+_/(+)/g' $1
+#sed -i 's/_+_/(+)/g' $2
 # Replaces everything by '>'
-sed -i -E 's/m\.[0-9]{1,}[^()]+\(\+\)\s*/>/g' $1
+#sed -i -E 's/m\.[0-9]{1,}[^()]+\(\+\)\s*/>/g' $2
 # Removes terminal '(+)'
-sed -i 's/(+)//g' $1
+#sed -i 's/(+)//g' $2
 # Removes last suite of unwanted numbers, underscore and dash
-sed -i -E 's/\_[0-9]{1,}-[0-9]{1,}//g' $1
+#sed -i -E 's/\_[0-9]{1,}-[0-9]{1,}//g' $2