comparison g_pos.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children 0df72a8ab095
comparison
equal deleted inserted replaced
1:fb617586f4b2 2:a47980ef2b96
1 import nltk 1 import nltk
2 import argparse 2 import argparse
3 import json 3
4 nltk.download('averaged_perceptron_tagger', quiet=True)
5
4 6
5 def arguments(): 7 def arguments():
6 parser = argparse.ArgumentParser(description="tokenize a text") 8 parser = argparse.ArgumentParser(description="tokenize a text")
7 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") 9 parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
8 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") 10 parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
9 args = parser.parse_args() 11 return parser.parse_args()
10 return args
11 12
12 13
13 def postag(in_file, out_file): 14 def postag(in_file, out_file):
14 """Input: a text file with one token per line 15 """Input: a text file with one token per line
15 Output: a version of the text with Part of Speech tags written as word/TAG 16 Output: a version of the text with Part of Speech tags written as word/TAG
16 """ 17 """
17 with open(in_file, 'r') as fd: 18 with open(in_file, 'r') as fd:
18 text = fd.read() 19 text = fd.read()
19 20
20 sentences = nltk.sent_tokenize(text) 21 sentences = nltk.sent_tokenize(text)
21 22
22 with open(out_file, 'w') as output: 23 with open(out_file, 'w') as output:
23 for sentence in sentences: 24 for sentence in sentences:
24 tokens = nltk.word_tokenize(sentence) 25 tokens = nltk.word_tokenize(sentence)
25 postags = nltk.pos_tag(tokens) 26 postags = nltk.pos_tag(tokens)
26 for postag in postags: 27 for postag in postags: