comparison g_read_sents.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children
comparison
equal deleted inserted replaced
1:fb617586f4b2 2:a47980ef2b96
1 import sys 1
2 import os 2 import os
3 import nltk 3 import nltk
4 from nltk.corpus import PlaintextCorpusReader 4 from nltk.corpus import PlaintextCorpusReader
5 import argparse 5 import argparse
6
7 nltk.download('punkt', quiet=True)
8
6 9
def Parser():
    """Parse the command line of the sentence-segmentation tool.

    Two mandatory string options are declared: ``--input`` and ``--output``.

    :return: the ``argparse.Namespace`` produced by ``parse_args()``, with
        ``input`` and ``output`` attributes.
    """
    cli = argparse.ArgumentParser(description="Segments the text input into separate sentences")

    # Both options share every setting except their flag and help text.
    option_specs = (
        ('--input', "input text file"),
        ('--output', "output file path"),
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, required=True, action="store", type=str, help=help_text)

    return cli.parse_args()
14 17
def print_out(outp, sentences):
    """Write one sentence per line to the file at ``outp``.

    Every token of a sentence is written followed by a single space, so each
    output line ends with a space before its newline. An empty ``sentences``
    iterable produces an empty file.

    :param outp: path of the output file; it is created or truncated.
    :param sentences: iterable of sentences, each an iterable of string tokens.
    """
    with open(outp, 'w') as output:
        # Build each line once and hand whole lines to the file object rather
        # than issuing two small write() calls per token. The token-plus-space
        # layout (including the trailing space) is kept unchanged.
        output.writelines(
            ''.join(tok + ' ' for tok in sent) + '\n'
            for sent in sentences
        )
24 27
def find_nth(string, sub, n, offset):
    """Return the index of the ``n``-th occurrence of ``sub`` in ``string``.

    The search starts at ``offset`` and each later search resumes just past
    the previous match, so occurrences are counted without overlap.

    :param string: text to search in.
    :param sub: substring to look for.
    :param n: which occurrence to locate; values below 2 yield the first one.
    :param offset: index at which the first search begins.
    :return: index of the requested occurrence, or -1 if there are too few.
    """
    step = len(sub)
    position = string.find(sub, offset)
    remaining = n - 1  # matches still to skip after the first one
    while remaining > 0 and position != -1:
        position = string.find(sub, position + step)
        remaining -= 1
    return position
31 34
35
def count_occurences(lst, string):
    """Count the items of ``lst`` that contain ``string``.

    Containment is tested with ``in``, so an item counts once no matter how
    many times ``string`` appears inside it.

    :param lst: iterable of items supporting the ``in`` test.
    :param string: value looked for inside each item.
    :return: number of items containing ``string``.
    """
    return sum(1 for entry in lst if string in entry)
38 42
43
def read_sents(inp, outp):
    """Segment the text file ``inp`` into sentences and write them to ``outp``.

    The file is loaded through NLTK's ``PlaintextCorpusReader`` and the
    sentences it yields are handed to ``print_out``.

    :param inp: path of the input text file.
    :param outp: path of the output file.
    """
    root, fileid = os.path.split(inp)
    # Pass the file name as a one-element list. A bare string is interpreted
    # by the corpus reader as a regular expression over file ids, so a name
    # containing metacharacters ('+', '(', '[', ...) could fail to match, or
    # '.' could match an unintended sibling file.
    corpus = PlaintextCorpusReader(root, [fileid])
    print_out(outp, corpus.sents())
49
45 50
# Script entry point: parse the command line, then run the segmentation.
if __name__ == '__main__':
    cli_args = Parser()
    read_sents(inp=cli_args.input, outp=cli_args.output)