diff g_stemmer.py @ 0:e991d4e60c17 draft

planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author stevecassidy
date Wed, 12 Oct 2016 22:17:53 -0400
parents
children fb617586f4b2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,40 @@
+import sys
+import os
+import nltk
+from nltk.stem import *
+import argparse
+
+
+def arguments():
+    parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path")
+    args = parser.parse_args()
+    return args
+
+def stem_file(in_file, out_file, stemmer_type):
+    unsegmented = unicode(open(in_file, 'r').read(), errors='ignore')
+    output = open(out_file, 'w')
+    sentences = nltk.sent_tokenize(unsegmented)
+    stemmer = get_stemmer(stemmer_type)
+    for sentence in sentences:
+        words = nltk.word_tokenize(sentence)
+        for word in words:
+            stemmed_word = stemmer.stem(word)
+            output.write(stemmed_word)
+            output.write('\n')
+    output.close()
+
+def get_stemmer(stemmer_type):
+    if stemmer_type == 'lancaster':
+        stemmer = LancasterStemmer()
+    elif stemmer_type == 'porter':
+        stemmer = PorterStemmer()
+    else:
+        stemmer = snowball.EnglishStemmer()
+    return stemmer
+
+if __name__ == '__main__':
+    args = arguments()
+    stem_file(args.input, args.output, args.stemmer)