comparison g_stemmer.py @ 1:fb617586f4b2 draft

planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author stevecassidy
date Mon, 05 Dec 2016 05:22:05 -0500
parents e991d4e60c17
children a47980ef2b96
comparison
equal deleted inserted replaced
0:e991d4e60c17 1:fb617586f4b2
12 parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path") 12 parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path")
13 args = parser.parse_args() 13 args = parser.parse_args()
14 return args 14 return args
15 15
16 def stem_file(in_file, out_file, stemmer_type): 16 def stem_file(in_file, out_file, stemmer_type):
17 unsegmented = unicode(open(in_file, 'r').read(), errors='ignore') 17 with open(in_file, 'r') as fd:
18 output = open(out_file, 'w') 18 unsegmented = fd.read()
19 sentences = nltk.sent_tokenize(unsegmented) 19
20 stemmer = get_stemmer(stemmer_type) 20 with open(out_file, 'w') as output:
21 for sentence in sentences: 21 sentences = nltk.sent_tokenize(unsegmented)
22 words = nltk.word_tokenize(sentence) 22 stemmer = get_stemmer(stemmer_type)
23 for word in words: 23 for sentence in sentences:
24 stemmed_word = stemmer.stem(word) 24 words = nltk.word_tokenize(sentence)
25 output.write(stemmed_word) 25 for word in words:
26 output.write('\n') 26 stemmed_word = stemmer.stem(word)
27 output.close() 27 output.write(stemmed_word)
28 output.write('\n')
28 29
29 def get_stemmer(stemmer_type): 30 def get_stemmer(stemmer_type):
30 if stemmer_type == 'lancaster': 31 if stemmer_type == 'lancaster':
31 stemmer = LancasterStemmer() 32 stemmer = LancasterStemmer()
32 elif stemmer_type == 'porter': 33 elif stemmer_type == 'porter':