Mercurial > repos > stevecassidy > nltktools
comparison g_tokenize.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
comparison
equal
deleted
inserted
replaced
0:e991d4e60c17 | 1:fb617586f4b2 |
---|---|
16 table = string.maketrans("","") | 16 table = string.maketrans("","") |
17 return text.translate(table, string.punctuation) | 17 return text.translate(table, string.punctuation) |
18 | 18 |
19 | 19 |
def tokenize(in_file, out_file, lower=False, nopunct=False):
    """Tokenize a text file into word tokens using NLTK.

    Reads all of ``in_file``, optionally lowercases it and strips
    punctuation, splits it into sentences and then word tokens, and
    writes one token per line to ``out_file``.

    :param in_file: path to the input text file
    :param out_file: path to the output file (one token per line)
    :param lower: if True, lowercase the text before tokenizing
    :param nopunct: if True, remove punctuation via strip_punct()
        (NOTE(review): strip_punct still uses Python-2-only
        string.maketrans/translate — confirm it was ported too)
    """
    # Decode as UTF-8 and skip undecodable bytes.  This restores the
    # intent of the removed Python 2 line
    # ``text = unicode(text, errors='ignore')`` instead of leaving it
    # as dead commented-out code.
    with open(in_file, 'r', encoding='utf-8', errors='ignore') as fd:
        text = fd.read()

    if lower:
        text = text.lower()
    if nopunct:
        text = strip_punct(text)

    # Write tokens as they are produced; no need to accumulate the
    # whole token list in memory first as the original did.
    with open(out_file, 'w', encoding='utf-8') as output:
        # write one token per line
        for sentence in nltk.sent_tokenize(text):
            for token in nltk.word_tokenize(sentence):
                output.write(token + "\n")
38 | 39 |
39 | 40 |
if __name__ == '__main__':
    # Command-line entry point: parse the options, then run the tokenizer
    # with the requested normalisation flags.
    cli = arguments()
    tokenize(cli.input, cli.output, lower=cli.lower, nopunct=cli.nopunct)