comparison g_tokenize.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children
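
The comparison below shows the tool's underlying script. Assuming the file's entry point (not visible in this hunk) passes the parsed arguments to tokenize(), an invocation would look roughly like the following sketch, where the file names are only illustrative:

    python g_tokenize.py --input story.txt --output tokens.txt --lower --nopunct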
diff -r fb617586f4b2 -r a47980ef2b96 g_tokenize.py
--- a/g_tokenize.py
+++ b/g_tokenize.py
@@ -1,38 +1,41 @@
 import nltk
 import string
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def arguments():
     parser = argparse.ArgumentParser(description="tokenize a text")
     parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
     parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
     parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
     parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
-    args = parser.parse_args()
-    return args
+    return parser.parse_args()
 
 
 def strip_punct(text):
-    table = string.maketrans("","")
+    table = text.maketrans("", "")
     return text.translate(table, string.punctuation)
 
 
 def tokenize(in_file, out_file, lower=False, nopunct=False):
     with open(in_file, 'r') as fd:
         text = fd.read()
 
     if lower:
         text = text.lower()
-    if nopunct:
-        text = strip_punct(text)
     result = []
-    #text = unicode(text, errors='ignore')
+    # text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
+        if nopunct:
+            tokens = filter(lambda w: w not in string.punctuation, tokens)
         result.append(tokens)
+
     with open(out_file, 'w') as output:
         # write one token per line
         for sentence in result:
             for token in sentence:
                 output.write(token + "\n")
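
Two portability notes on the new revision: the rewritten strip_punct() combines the Python 3 str.maketrans() method with the two-argument translate() call that only Python 2's str supports, so as written it would raise on either interpreter, though it is now unused because --nopunct filters tokens instead; and on Python 3 filter() returns a lazy iterator, which still works here because each sentence is consumed only once when writing the output. As a rough illustration rather than the committed code, the same per-token punctuation handling could be written for Python 3 as:

    import string

    import nltk

    nltk.download('punkt', quiet=True)


    def strip_punct(text):
        # Python 3 form: the third maketrans argument lists characters to delete.
        return text.translate(str.maketrans("", "", string.punctuation))


    def tokenize_text(text, lower=False, nopunct=False):
        # Sketch of the revision's per-token punctuation filtering;
        # returns one list of tokens per sentence.
        if lower:
            text = text.lower()
        result = []
        for sentence in nltk.sent_tokenize(text):
            tokens = nltk.word_tokenize(sentence)
            if nopunct:
                # A list comprehension keeps the tokens reusable,
                # unlike a lazy filter object.
                tokens = [w for w in tokens if w not in string.punctuation]
            result.append(tokens)
        return result

Written this way, --nopunct drops tokens that are single punctuation marks such as ',' and '.', rather than stripping punctuation characters from inside words.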