# HG changeset patch
# User stevecassidy
# Date 1509513595 14400
# Node ID a47980ef2b96fc7a2a08b7ea14c71c10fa06985b
# Parent fb617586f4b283a987f0c33b03b43995e26c1b09
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
diff -r fb617586f4b2 -r a47980ef2b96 g_chart_parser.py
--- a/g_chart_parser.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_chart_parser.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,15 +1,14 @@
import sys
import nltk
import argparse
-from nltk.corpus import PlaintextCorpusReader
+
def arguments():
parser = argparse.ArgumentParser(description="run NER on a text")
parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
- parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
- parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
- args = parser.parse_args()
- return args
+ parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
+ parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+ return parser.parse_args()
def chart_parse(in_file, grammar_file, out_file):
@@ -32,11 +31,13 @@
output.write('\n')
except Exception as e:
- message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e) + "\n"
+ message = """Error with parsing. Check the input files are correct
+and the grammar contains every word in the input sequence. \n----\n""" + str(e) + "\n"
sys.stderr.write(message)
sys.exit()
output.close()
+
if __name__ == '__main__':
args = arguments()
chart_parse(args.input, args.grammar, args.output)
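
The hunk above only touches the error message; the parsing step itself is off-screen. For reference, a minimal sketch of what a chart parse over an input file and grammar file looks like in NLTK. The toy grammar and sentence are illustrative, not taken from the repository.

    import nltk

    # A toy CFG; in the tool this would come from the --grammar file.
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'dog' | 'cat'
    V -> 'chased'
    """)

    parser = nltk.ChartParser(grammar)
    tokens = "the dog chased the cat".split()

    # parse() raises ValueError when a token is absent from the grammar,
    # which is exactly the failure the rewritten error message describes.
    for tree in parser.parse(tokens):
        print(tree)
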
diff -r fb617586f4b2 -r a47980ef2b96 g_collocation.py
--- a/g_collocation.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_collocation.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,11 @@
-import sys
-import os
import nltk
-from nltk.collocations import *
+from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
+from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
import argparse
+nltk.download('punkt', quiet=True)
+
+
def Parser():
the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -13,8 +15,8 @@
the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
- args = the_parser.parse_args()
- return args
+ return the_parser.parse_args()
+
def collocation(inp, outp, freq_filter, results, coll_type, pos):
pos = bool(pos == 'true')
@@ -31,17 +33,19 @@
for sent in sents:
all_words += nltk.word_tokenize(sent)
if coll_type == 'bigram':
- measures = nltk.collocations.BigramAssocMeasures()
+ measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words)
else:
- measures = nltk.collocations.TrigramAssocMeasures()
+ measures = TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(all_words)
finder.apply_freq_filter(int(freq_filter))
- colls = finder.nbest(measures.pmi, int(results))
- with open(outp, 'w') as output:
+ # score the ngrams and get the first N
+ colls = finder.score_ngrams(measures.pmi)[:int(results)]
+ with open(outp, 'w') as output:
for coll in colls:
- output.write("%s\t%s" % coll)
- output.write('\n')
+            ngram, score = coll  # works for bigrams and trigrams alike
+            output.write('\t'.join(ngram) + '\n')
+
if __name__ == '__main__':
args = Parser()
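
Why the unpacking changed: nbest returns bare n-gram tuples, while score_ngrams returns (ngram, score) pairs sorted by descending score, so the loop now has to unpack each pair. A minimal sketch of the difference; the sample text is illustrative, and split() stands in for word_tokenize to keep it self-contained.

    from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

    words = "of the people by the people for the people".split()
    measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)

    # nbest: bare ngram tuples, e.g. ('the', 'people')
    print(finder.nbest(measures.pmi, 3))

    # score_ngrams: (ngram, score) pairs, sorted by score descending
    print(finder.score_ngrams(measures.pmi)[:3])
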
diff -r fb617586f4b2 -r a47980ef2b96 g_frequency.py
--- a/g_frequency.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_frequency.py Wed Nov 01 01:19:55 2017 -0400
@@ -2,12 +2,14 @@
from nltk import FreqDist
import argparse
+nltk.download('punkt', quiet=True)
+
+
def arguments():
- parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
- parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
- parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
- args = parser.parse_args()
- return args
+ parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
+ parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+ parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+ return parser.parse_args()
def frequency(in_file, out_file):
@@ -18,13 +20,13 @@
text = fd.read()
words = nltk.word_tokenize(text)
- frequency = FreqDist(words)
- total = float(frequency.N())
-
+ fdist = FreqDist(words)
+ total = float(fdist.N())
+
with open(out_file, 'w') as output:
output.write("Word\tCount\tPercent\n")
- for pair in frequency.items():
- output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))
+ for pair in fdist.items():
+ output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))
if __name__ == '__main__':
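
Renaming frequency to fdist also stops the local variable shadowing the enclosing function's own name. Roughly what the table loop computes; the sample words are illustrative.

    from nltk import FreqDist

    words = "the cat sat on the mat".split()
    fdist = FreqDist(words)
    total = float(fdist.N())  # N() is the total number of tokens

    # one row per distinct word: word, count, percent of all tokens
    for word, count in fdist.items():
        print("%s\t%d\t%.2f" % (word, count, 100 * count / total))
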
diff -r fb617586f4b2 -r a47980ef2b96 g_pos.py
--- a/g_pos.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_pos.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,13 +1,15 @@
import nltk
import argparse
-import json
+
+nltk.download('punkt', quiet=True)
+nltk.download('averaged_perceptron_tagger', quiet=True)
+
def arguments():
parser = argparse.ArgumentParser(description="tokenize a text")
parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
- parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
- args = parser.parse_args()
- return args
+ parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+ return parser.parse_args()
def postag(in_file, out_file):
@@ -18,7 +19,7 @@
text = fd.read()
sentences = nltk.sent_tokenize(text)
-
+
with open(out_file, 'w') as output:
for sentence in sentences:
tokens = nltk.word_tokenize(sentence)
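
The hunk ends before the tagging call itself, but the downloads point to nltk.pos_tag over punkt-tokenized sentences (punkt is needed because the script calls sent_tokenize and word_tokenize). A minimal end-to-end sketch under that assumption:

    import nltk

    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    text = "The quick brown fox jumps. It lands."
    for sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(sentence)
        # pos_tag returns (token, tag) pairs, e.g. ('fox', 'NN')
        print(nltk.pos_tag(tokens))
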
diff -r fb617586f4b2 -r a47980ef2b96 g_read_sents.py
--- a/g_read_sents.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_read_sents.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,12 @@
-import sys
+
import os
import nltk
from nltk.corpus import PlaintextCorpusReader
import argparse
+nltk.download('punkt', quiet=True)
+
+
def Parser():
the_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -12,15 +15,15 @@
args = the_parser.parse_args()
return args
-def print_out(outp, text, sentences):
+
+def print_out(outp, sentences):
with open(outp, 'w') as output:
- curr = 0
for sent in sentences:
- times = count_occurences(sent, sent[-1])
- curr = text.find(sent[0], curr)
- end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
- output.write(text[curr:end] + '\n')
- curr = end
+ for tok in sent:
+ output.write(tok)
+ output.write(' ')
+ output.write('\n')
+
def find_nth(string, sub, n, offset):
start = string.find(sub, offset)
@@ -29,6 +32,7 @@
n -= 1
return start
+
def count_occurences(lst, string):
count = 0
for item in lst:
@@ -36,12 +40,13 @@
count += 1
return count
+
def read_sents(inp, outp):
- with open(inp, 'r') as fd:
- i = fd.read()
+
corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
sents = corpus.sents()
- print_out(outp, i, sents)
+ print_out(outp, sents)
+
if __name__ == '__main__':
args = Parser()
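
The rewritten print_out drops the character-offset bookkeeping, leaving find_nth and count_occurences as now-unused helpers, and simply writes each sentence's tokens separated by spaces, so the original spacing of the source text is not preserved. In effect it does the following; sample.txt is a hypothetical input file.

    import os
    import nltk
    from nltk.corpus import PlaintextCorpusReader

    nltk.download('punkt', quiet=True)

    path = 'sample.txt'  # hypothetical input file
    # PlaintextCorpusReader takes a root directory and a filename;
    # "or '.'" guards against the empty dirname of a bare filename
    corpus = PlaintextCorpusReader(os.path.dirname(path) or '.', os.path.basename(path))

    # sents() yields each sentence as a list of word tokens
    for sent in corpus.sents():
        print(' '.join(sent))
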
diff -r fb617586f4b2 -r a47980ef2b96 g_read_sents.xml
--- a/g_read_sents.xml Mon Dec 05 05:22:05 2016 -0500
+++ b/g_read_sents.xml Wed Nov 01 01:19:55 2017 -0400
@@ -4,11 +4,11 @@
<requirement type="package">nltk</requirement>
-
+
g_read_sents.py --input $input1 --output $tab_file
-\
+