g_tokenize.py @ 0:e991d4e60c17 (draft)

planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author:   stevecassidy
date:     Wed, 12 Oct 2016 22:17:53 -0400
parents:  (none)
children: fb617586f4b2
import nltk
import string
import argparse


def arguments():
    parser = argparse.ArgumentParser(description="tokenize a text")
    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
    parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
    args = parser.parse_args()
    return args


def strip_punct(text):
    # Python 2 idiom: an identity translation table plus a delete-set
    # removes every punctuation character from the text
    table = string.maketrans("", "")
    return text.translate(table, string.punctuation)


def tokenize(in_file, out_file, lower=False, nopunct=False):
    with open(in_file, 'r') as infile:
        text = infile.read()
    if lower:
        text = text.lower()
    if nopunct:
        text = strip_punct(text)
    result = []
    # decode to unicode, silently dropping undecodable bytes (Python 2)
    text = unicode(text, errors='ignore')
    # split into sentences first so word_tokenize can use sentence context
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        result.append(tokens)
    # write one token per line
    with open(out_file, 'w') as output:
        for sentence in result:
            for token in sentence:
                output.write(token + "\n")


if __name__ == '__main__':
    args = arguments()
    tokenize(args.input, args.output, lower=args.lower, nopunct=args.nopunct)
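
Usage sketch: the script targets Python 2 (string.maketrans/str.translate
with a delete argument, and the unicode built-in) and assumes NLTK is
installed together with its 'punkt' tokenizer data, which sent_tokenize
and word_tokenize rely on. With hypothetical input and output file names,
an invocation would look like:

    python g_tokenize.py --input story.txt --output tokens.txt --lower --nopunct

This writes one token per line to tokens.txt, lowercased and with all
punctuation removed.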