comparison g_pos.py @ 3:0df72a8ab095 draft default tip

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author stevecassidy
date Mon, 20 Nov 2017 22:52:11 -0500
parents a47980ef2b96
children
comparison
equal deleted inserted replaced
2:a47980ef2b96 3:0df72a8ab095
1 from __future__ import print_function, unicode_literals
1 import nltk 2 import nltk
2 import argparse 3 import argparse
4 import io
3 5
4 nltk.download('averaged_perceptron_tagger', quiet=True) 6 nltk.download('averaged_perceptron_tagger', quiet=True)
7 nltk.download('punkt', quiet=True)
5 8
6 9
7 def arguments(): 10 def arguments():
8 parser = argparse.ArgumentParser(description="tokenize a text") 11 parser = argparse.ArgumentParser(description="tokenize a text")
9 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") 12 parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
13 16
14 def postag(in_file, out_file): 17 def postag(in_file, out_file):
15 """Input: a text file with one token per line 18 """Input: a text file with one token per line
16 Output: a version of the text with Part of Speech tags written as word/TAG 19 Output: a version of the text with Part of Speech tags written as word/TAG
17 """ 20 """
18 with open(in_file, 'r') as fd: 21 with open(in_file, 'rb') as fd:
19 text = fd.read() 22 text = fd.read()
23 text = text.decode('utf-8')
20 24
21 sentences = nltk.sent_tokenize(text) 25 sentences = nltk.sent_tokenize(text)
22 26
23 with open(out_file, 'w') as output: 27 with io.open(out_file, 'w') as output:
24 for sentence in sentences: 28 for sentence in sentences:
25 tokens = nltk.word_tokenize(sentence) 29 tokens = nltk.word_tokenize(sentence)
26 postags = nltk.pos_tag(tokens) 30 postags = nltk.pos_tag(tokens)
27 for postag in postags: 31 for postag in postags:
28 # print postag 32 # print postag
29 output.write("%s/%s " % postag) 33 p = "%s/%s " % postag
34 output.write(p)
30 output.write('\n') 35 output.write('\n')
31 36
32 37
33 if __name__ == '__main__': 38 if __name__ == '__main__':
34 args = arguments() 39 args = arguments()