Mercurial > repos > stevecassidy > nltktools
changeset 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author   | stevecassidy
date     | Wed, 01 Nov 2017 01:19:55 -0400
parents  | fb617586f4b2
children | 0df72a8ab095
files    | g_chart_parser.py g_collocation.py g_frequency.py g_pos.py g_read_sents.py g_read_sents.xml g_stemmer.py g_tokenize.py tmp.dat tmp.tok
diffstat | 10 files changed, 794 insertions(+), 66 deletions(-)
line diff
--- a/g_chart_parser.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_chart_parser.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,15 +1,14 @@
 import sys
 import nltk
 import argparse
-from nltk.corpus import PlaintextCorpusReader
+

 def arguments():
     parser = argparse.ArgumentParser(description="run NER on a text")
     parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
-    parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
-    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
-    args = parser.parse_args()
-    return args
+    parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    return parser.parse_args()


 def chart_parse(in_file, grammar_file, out_file):
@@ -32,11 +31,13 @@
             output.write('\n')
     except Exception as e:
-        message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e) + "\n"
+        message = """Error with parsing. Check the input files are correct
+and the grammar contains every word in the input sequence. \n----\n""" + str(e) + "\n"
         sys.stderr.write(message)
         sys.exit()

     output.close()

+
 if __name__ == '__main__':
     args = arguments()
     chart_parse(args.input, args.grammar, args.output)
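
For context, a minimal sketch of the chart-parsing step this script wraps in the try/except above; the toy grammar and sentence are illustrative only and are not part of the tool:

    import nltk

    # Toy grammar standing in for the --grammar file (illustrative only).
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> Det N | 'I'
    VP -> V NP PP | V NP
    PP -> P NP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = nltk.ChartParser(grammar)
    tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
    for tree in parser.parse(tokens):
        print(tree)  # one bracketed tree per parse, as written to --output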
--- a/g_collocation.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_collocation.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,11 @@
-import sys
-import os
 import nltk
-from nltk.collocations import *
+from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
+from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
 import argparse

+nltk.download('punkt', quiet=True)
+
+
 def Parser():
     the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
     the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -13,8 +15,8 @@
     the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
     the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")

-    args = the_parser.parse_args()
-    return args
+    return the_parser.parse_args()
+

 def collocation(inp, outp, freq_filter, results, coll_type, pos):
     pos = bool(pos == 'true')
@@ -31,17 +33,19 @@
         for sent in sents:
             all_words += nltk.word_tokenize(sent)
     if coll_type == 'bigram':
-        measures = nltk.collocations.BigramAssocMeasures()
+        measures = BigramAssocMeasures()
         finder = BigramCollocationFinder.from_words(all_words)
     else:
-        measures = nltk.collocations.TrigramAssocMeasures()
+        measures = TrigramAssocMeasures()
         finder = TrigramCollocationFinder.from_words(all_words)
     finder.apply_freq_filter(int(freq_filter))
-    colls = finder.nbest(measures.pmi, int(results))
-    with open(outp, 'w') as output:
+    # score the ngrams and get the first N
+    colls = finder.score_ngrams(measures.pmi)[:int(results)]
+    with open(outp, 'w') as output:
         for coll in colls:
-            output.write("%s\t%s" % coll)
-            output.write('\n')
+            (a, b), score = coll
+            output.write("%s\t%s\n" % (a, b))
+

 if __name__ == '__main__':
     args = Parser()
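
For reference, a minimal sketch of the reworked scoring step: score_ngrams() returns (ngram, score) pairs sorted by the association measure, where nbest() returned only the ngrams. The example sentence is made up:

    import nltk
    from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

    nltk.download('punkt', quiet=True)

    words = nltk.word_tokenize("we sailed by the sea shore and breathed the sea air near the sea wall")
    measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(2)                 # drop bigrams seen fewer than twice
    for (a, b), score in finder.score_ngrams(measures.pmi)[:5]:
        print("%s\t%s\t%.3f" % (a, b, score))   # the tool writes only the word pair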
--- a/g_frequency.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_frequency.py Wed Nov 01 01:19:55 2017 -0400
@@ -2,12 +2,14 @@
 from nltk import FreqDist
 import argparse

+nltk.download('punkt', quiet=True)
+
+
 def arguments():
-    parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
-    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
-    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
-    args = parser.parse_args()
-    return args
+    parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    return parser.parse_args()


 def frequency(in_file, out_file):
@@ -18,13 +20,13 @@
         text = fd.read()

     words = nltk.word_tokenize(text)
-    frequency = FreqDist(words)
-    total = float(frequency.N())
-
+    fdist = FreqDist(words)
+    total = float(fdist.N())
+
     with open(out_file, 'w') as output:
         output.write("Word\tCount\tPercent\n")
-        for pair in frequency.items():
-            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))
+        for pair in fdist.items():
+            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))


 if __name__ == '__main__':
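
A small sketch of the frequency table the script now builds (renaming the FreqDist variable avoids shadowing the frequency() function); the sample text is illustrative:

    import nltk
    from nltk import FreqDist

    nltk.download('punkt', quiet=True)

    words = nltk.word_tokenize("the cat sat on the mat")
    fdist = FreqDist(words)
    total = float(fdist.N())
    print("Word\tCount\tPercent")
    for word, count in fdist.items():
        print("{}\t{}\t{:.2f}".format(word, count, 100 * count / total))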
--- a/g_pos.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_pos.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,13 +1,14 @@
 import nltk
 import argparse
-import json
+
+nltk.download('averaged_perceptron_tagger', quiet=True)
+

 def arguments():
     parser = argparse.ArgumentParser(description="tokenize a text")
     parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
-    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
-    args = parser.parse_args()
-    return args
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    return parser.parse_args()


 def postag(in_file, out_file):
@@ -18,7 +19,7 @@
         text = fd.read()

     sentences = nltk.sent_tokenize(text)
-
+
     with open(out_file, 'w') as output:
         for sentence in sentences:
             tokens = nltk.word_tokenize(sentence)
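
The added nltk.download call fetches the model that nltk.pos_tag needs. A minimal sketch of the tagging loop, with an illustrative sentence:

    import nltk

    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    text = "The quick brown fox jumps over the lazy dog."
    for sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(sentence)
        print(nltk.pos_tag(tokens))   # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]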
--- a/g_read_sents.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_read_sents.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,12 @@
-import sys
+
 import os
 import nltk
 from nltk.corpus import PlaintextCorpusReader
 import argparse

+nltk.download('punkt', quiet=True)
+
+
 def Parser():
     the_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
     the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -12,15 +15,15 @@
     args = the_parser.parse_args()
     return args

-def print_out(outp, text, sentences):
+
+def print_out(outp, sentences):
     with open(outp, 'w') as output:
-        curr = 0
         for sent in sentences:
-            times = count_occurences(sent, sent[-1])
-            curr = text.find(sent[0], curr)
-            end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
-            output.write(text[curr:end] + '\n')
-            curr = end
+            for tok in sent:
+                output.write(tok)
+                output.write(' ')
+            output.write('\n')
+

 def find_nth(string, sub, n, offset):
     start = string.find(sub, offset)
@@ -29,6 +32,7 @@
         n -= 1
     return start

+
 def count_occurences(lst, string):
     count = 0
     for item in lst:
@@ -36,12 +40,13 @@
             count += 1
     return count

+
 def read_sents(inp, outp):
-    with open(inp, 'r') as fd:
-        i = fd.read()
+
     corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
     sents = corpus.sents()
-    print_out(outp, i, sents)
+    print_out(outp, sents)
+

 if __name__ == '__main__':
     args = Parser()
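
print_out() now writes the tokens of each sentence separated by spaces rather than recovering the original character spans. A rough equivalent, assuming a hypothetical sample.txt as the input file (the `or '.'` guard is only there so the snippet runs on a bare filename):

    import os
    import nltk
    from nltk.corpus import PlaintextCorpusReader

    nltk.download('punkt', quiet=True)

    path = 'sample.txt'   # stand-in for the --input file
    corpus = PlaintextCorpusReader(os.path.dirname(path) or '.', os.path.basename(path))
    for sent in corpus.sents():
        print(' '.join(sent))   # one tokenised sentence per output line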
--- a/g_read_sents.xml Mon Dec 05 05:22:05 2016 -0500
+++ b/g_read_sents.xml Wed Nov 01 01:19:55 2017 -0400
@@ -4,11 +4,11 @@
     <requirements>
         <requirement type="package" version="3.2.1">nltk</requirement>
     </requirements>
-    
+
     <command interpreter="python">
         g_read_sents.py --input $input1 --output $tab_file
     </command>
-\
+
     <inputs>
         <param name="input1" type="data" format="txt" label="Select a suitable input file from your history"/>
         <param name="job_name" type="text" size="25"
--- a/g_stemmer.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_stemmer.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,7 +1,5 @@
-import sys
-import os
 import nltk
-from nltk.stem import *
+from nltk.stem import PorterStemmer, LancasterStemmer, snowball
 import argparse


@@ -13,6 +11,7 @@
     args = parser.parse_args()
     return args

+
 def stem_file(in_file, out_file, stemmer_type):
     with open(in_file, 'r') as fd:
         unsegmented = fd.read()
@@ -27,6 +26,7 @@
             output.write(stemmed_word)
             output.write('\n')

+
 def get_stemmer(stemmer_type):
     if stemmer_type == 'lancaster':
         stemmer = LancasterStemmer()
@@ -36,6 +36,7 @@
         stemmer = snowball.EnglishStemmer()
     return stemmer

+
 if __name__ == '__main__':
     args = arguments()
     stem_file(args.input, args.output, args.stemmer)
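
With the wildcard import replaced, the three stemmers the tool can select are named explicitly. A quick sketch of what each produces; the sample words are illustrative:

    from nltk.stem import PorterStemmer, LancasterStemmer, snowball

    # The same three choices get_stemmer() switches between.
    stemmers = {
        'porter': PorterStemmer(),
        'lancaster': LancasterStemmer(),
        'snowball': snowball.EnglishStemmer(),
    }
    for name, stemmer in stemmers.items():
        print(name, [stemmer.stem(w) for w in ['running', 'flies', 'happily']])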
--- a/g_tokenize.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_tokenize.py Wed Nov 01 01:19:55 2017 -0400
@@ -2,19 +2,21 @@
 import string
 import argparse

+nltk.download('punkt', quiet=True)
+
+
 def arguments():
     parser = argparse.ArgumentParser(description="tokenize a text")
     parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
-    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
     parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
     parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
-    args = parser.parse_args()
-    return args
+    return parser.parse_args()


 def strip_punct(text):
-    table = string.maketrans("","")
-    return text.translate(table, string.punctuation)
+    table = text.maketrans("", "")
+    return text.translate(table, string.punctuation)


 def tokenize(in_file, out_file, lower=False, nopunct=False):
@@ -23,14 +25,15 @@

     if lower:
         text = text.lower()
-    if nopunct:
-        text = strip_punct(text)
     result = []
-    #text = unicode(text, errors='ignore')
+    # text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
+        if nopunct:
+            tokens = filter(lambda w: w not in string.punctuation, tokens)
         result.append(tokens)
+
     with open(out_file, 'w') as output:
         # write one token per line
         for sentence in result:
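
Punctuation stripping now happens per token inside tokenize() rather than over the raw text, which sidesteps the old Python 2 string.maketrans call (on Python 3, str.translate takes a single table built with str.maketrans('', '', string.punctuation)). A minimal sketch with an illustrative sentence:

    import string
    import nltk

    nltk.download('punkt', quiet=True)

    text = "Hello, world! This is a test."
    for sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(sentence)
        # drop tokens that are purely punctuation, as --nopunct now does
        tokens = [w for w in tokens if w not in string.punctuation]
        print(' '.join(tokens))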
--- a/tmp.dat Mon Dec 05 05:22:05 2016 -0500
+++ b/tmp.dat Wed Nov 01 01:19:55 2017 -0400
@@ -1,10 +1,25 @@
-(S
-  (NP I)
-  (VP
-    (VP (V shot) (NP (Det an) (N elephant)))
-    (PP (P in) (NP (Det my) (N pajamas)))))
-(S
-  (NP I)
-  (VP
-    (V shot)
-    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
+Some text that is nøt øß ascii
+DADA project is developing software for managing language resources and exposing them on the web .
+Language resources are digital collections of language as audio , video and text used to study language and build technology systems .
+The project has been going for a while with some initial funding from the ARC to build the basic infrastructure and later from Macquarie University for some work on the Auslan corpus of Australian Sign Language collected by Trevor Johnston .
+Recently we have two projects which DADA will be part of , and so the pace of development has picked up a little .
+The Australian National Corpus ( AusNC ) is an effort to build a centralised collection of resources of language in Australia .
+The core idea is to take whatever existing collections we can get permission to publish and make them available under a common technical infrastructure .
+Using some funding from HCSNet we build a small demonstration site that allowed free text search on two collections : the Australian Corpus of English and the Corpus of Oz Early English .
+We now have some funding to continue this work and expand both the size of the collection and the capability of the infrastructure that will support it .
+What we ’ ve already done is to separate the text in these corpora from their meta - data ( descriptions of each text ) and the annotation ( denoting things within the texts ).
+While the pilot allows searching on the text the next steps will allow search using the meta - data ( look for this in texts written after 1900 ) and the annotation ( find this in the titles of articles ).
+This project is funded by the Australian National Data Service ( ANDS ) and is a collaboration with Michael Haugh at Griffith .
+The Big Australian Speech Corpus , more recently renamed AusTalk , is an ARC funded project to collect speech and video from 1000 Australian speakers for a new freely available corpus .
+The project involves many partners around the country each of who will have a ‘ black box ’ recording station to collect audio and stereo video of subjects reading words and sentences , being interviewed and doing the Map task – a game designed to elicit natural speech between two people .
+Our part of the project is to provide the server infrastructure that will store the audio , video and annotation data that will make up the corpus .
+DADA will be part of this solution but the main driver is to be able to provide a secure and reliable store for the primary data as it comes in from the collection sites .
+An important feature of the collection is the meta - data that will describe the subjects in the recording .
+Some annotation of the data will be done automatically , for example some forced alignment of the read words and sentences .
+Later , we will move on to support manual annotation of some of the data – for example transcripts of the interviews and map task sessions .
+All of this will be published via the DADA server infrastructure to create a large , freely available research collection for Australian English .
+Since the development of DADA now involves people outside Macquarie , we have started using a public bitbucket repository for the code .
+As of this writing the code still needs some tidying and documentation to enable third parties to be able to install and work on it , but we hope to have that done within a month .
+The public DADA demo site is down at the moment due to network upgrades at Macquarie ( it ’ s only visible inside MQ ) – I hope to have that fixed soon with some new sample data sets loaded up for testing .
+2011 looks like it will be a significant year for DADA .
+We hope to end this year with a number of significant text , audio and video corpora hosted on DADA infrastructure and providing useful services to the linguistics and language technology communities .
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tmp.tok Wed Nov 01 01:19:55 2017 -0400 @@ -0,0 +1,696 @@ +some +text +that +is +nøt +øß +ascii +dada +project +is +developing +software +for +managing +language +resources +and +exposing +them +on +the +web +. +language +resources +are +digital +collections +of +language +as +audio +, +video +and +text +used +to +study +language +and +build +technology +systems +. +the +project +has +been +going +for +a +while +with +some +initial +funding +from +the +arc +to +build +the +basic +infrastructure +and +later +from +macquarie +university +for +some +work +on +the +auslan +corpus +of +australian +sign +language +collected +by +trevor +johnston +. +recently +we +have +two +projects +which +dada +will +be +part +of +, +and +so +the +pace +of +development +has +picked +up +a +little +. +the +australian +national +corpus +( +ausnc +) +is +an +effort +to +build +a +centralised +collection +of +resources +of +language +in +australia +. +the +core +idea +is +to +take +whatever +existing +collections +we +can +get +permission +to +publish +and +make +them +available +under +a +common +technical +infrastructure +. +using +some +funding +from +hcsnet +we +build +a +small +demonstration +site +that +allowed +free +text +search +on +two +collections +: +the +australian +corpus +of +english +and +the +corpus +of +oz +early +english +. +we +now +have +some +funding +to +continue +this +work +and +expand +both +the +size +of +the +collection +and +the +capability +of +the +infrastructure +that +will +support +it +. +what +we’ve +already +done +is +to +separate +the +text +in +these +corpora +from +their +meta-data +( +descriptions +of +each +text +) +and +the +annotation +( +denoting +things +within +the +texts +) +. +while +the +pilot +allows +searching +on +the +text +the +next +steps +will +allow +search +using +the +meta-data +( +look +for +this +in +texts +written +after +1900 +) +and +the +annotation +( +find +this +in +the +titles +of +articles +) +. +this +project +is +funded +by +the +australian +national +data +service +( +ands +) +and +is +a +collaboration +with +michael +haugh +at +griffith +. +the +big +australian +speech +corpus +, +more +recently +renamed +austalk +, +is +an +arc +funded +project +to +collect +speech +and +video +from +1000 +australian +speakers +for +a +new +freely +available +corpus +. +the +project +involves +many +partners +around +the +country +each +of +who +will +have +a +‘black +box’ +recording +station +to +collect +audio +and +stereo +video +of +subjects +reading +words +and +sentences +, +being +interviewed +and +doing +the +map +task +– +a +game +designed +to +elicit +natural +speech +between +two +people +. +our +part +of +the +project +is +to +provide +the +server +infrastructure +that +will +store +the +audio +, +video +and +annotation +data +that +will +make +up +the +corpus +. +dada +will +be +part +of +this +solution +but +the +main +driver +is +to +be +able +to +provide +a +secure +and +reliable +store +for +the +primary +data +as +it +comes +in +from +the +collection +sites +. +an +important +feature +of +the +collection +is +the +meta-data +that +will +describe +the +subjects +in +the +recording +. +some +annotation +of +the +data +will +be +done +automatically +, +for +example +some +forced +alignment +of +the +read +words +and +sentences +. +later +, +we +will +move +on +to +support +manual +annotation +of +some +of +the +data +– +for +example +transcripts +of +the +interviews +and +map +task +sessions +. 
+all +of +this +will +be +published +via +the +dada +server +infrastructure +to +create +a +large +, +freely +available +research +collection +for +australian +english +. +since +the +development +of +dada +now +involves +people +outside +macquarie +, +we +have +started +using +a +public +bitbucket +repository +for +the +code +. +as +of +this +writing +the +code +still +needs +some +tidying +and +documentation +to +enable +third +parties +to +be +able +to +install +and +work +on +it +, +but +we +hope +to +have +that +done +within +a +month +. +the +public +dada +demo +site +is +down +at +the +moment +due +to +network +upgrades +at +macquarie +( +it’s +only +visible +inside +mq +) +– +i +hope +to +have +that +fixed +soon +with +some +new +sample +data +sets +loaded +up +for +testing +. +2011 +looks +like +it +will +be +a +significant +year +for +dada +. +we +hope +to +end +this +year +with +a +number +of +significant +text +, +audio +and +video +corpora +hosted +on +dada +infrastructure +and +providing +useful +services +to +the +linguistics +and +language +technology +communities +.