Mercurial > repos > stevecassidy > nltktools
diff g_tokenize.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
| author | stevecassidy |
|---|---|
| date | Wed, 01 Nov 2017 01:19:55 -0400 |
| parents | fb617586f4b2 |
| children | |
line wrap: on
line diff
--- a/g_tokenize.py	Mon Dec 05 05:22:05 2016 -0500
+++ b/g_tokenize.py	Wed Nov 01 01:19:55 2017 -0400
@@ -2,19 +2,21 @@
 import string
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def arguments():
     parser = argparse.ArgumentParser(description="tokenize a text")
     parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
-    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
     parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
     parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
-    args = parser.parse_args()
-    return args
+    return parser.parse_args()
 
 
 def strip_punct(text):
-    table = string.maketrans("","")
-    return text.translate(table, string.punctuation)
+    table = text.maketrans("", "")
+    return text.translate(table, string.punctuation)
 
 
 def tokenize(in_file, out_file, lower=False, nopunct=False):
@@ -23,14 +25,15 @@
     if lower:
         text = text.lower()
-    if nopunct:
-        text = strip_punct(text)
 
     result = []
-    #text = unicode(text, errors='ignore')
+    # text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
+        if nopunct:
+            tokens = filter(lambda w: w not in string.punctuation, tokens)
         result.append(tokens)
+
     with open(out_file, 'w') as output:
         # write one token per line
         for sentence in result: