Mercurial > repos > stevecassidy > nltktools

--- a/g_frequency.py	Wed Nov 01 01:19:55 2017 -0400
+++ b/g_frequency.py	Mon Nov 20 22:52:11 2017 -0500
@@ -12,23 +12,28 @@
     return parser.parse_args()


-def frequency(in_file, out_file):
+def frequency(textfiles, out_file):
     """Input: a text file
     Output: a table of word frequency with three columns for Word, Count and Percent frequency
     """
-    with open(in_file, 'r') as fd:
-        text = fd.read()

-    words = nltk.word_tokenize(text)
+    words = []
+    for textfile in textfiles:
+        with open(textfile, 'r') as fd:
+            text = fd.read()
+
+        words.extend(nltk.word_tokenize(text))
+
     fdist = FreqDist(words)
     total = float(fdist.N())

     with open(out_file, 'w') as output:
         output.write("Word\tCount\tPercent\n")
-        for pair in fdist.items():
+        for pair in sorted(fdist.items()):
             output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))


 if __name__ == '__main__':
     args = arguments()
-    frequency(args.input, args.output)
+    textfiles = args.input.split(',')
+    frequency(textfiles, args.output)
--- a/g_frequency.xml	Wed Nov 01 01:19:55 2017 -0400
+++ b/g_frequency.xml	Mon Nov 20 22:52:11 2017 -0500
@@ -6,26 +6,21 @@
     </requirements>

     <command interpreter="python">
-        g_frequency.py --input $input1 --output $frequency_table
+        g_frequency.py --input "${",".join(map(str, $input))}" --output $frequency_table
     </command>

     <inputs>
-        <param name="input1" type="data" format="txt"
-               label="Select a suitable input file from your history"/>
-
-        <param name="job_name" type="text" size="25"
-               label="Supply a name for the outputs to remind you what they contain"
-               value="Frequency List"/>
+        <param name="input" type="data" format="txt" multiple="true"
+               label="Input text(s)"/>
     </inputs>
     <outputs>
-        <data format="tabular" name="frequency_table" label="${job_name}"/>
+        <data format="tabular" name="frequency_table" label="Frequency Table"/>
     </outputs>

     <tests>
         <test>
-            <param name='input1' value='sample_text.txt'/>
-            <param name='job_name' value='testfrequency'/>
-            <output name='testfrequency' file='sample_text_frequency.dat'/>
+            <param name='input' value='sample_text.txt'/>
+            <output name='frequency_table' file='sample_text_frequency.dat'/>
         </test>
     </tests>
     <help>
--- a/g_pos.py	Wed Nov 01 01:19:55 2017 -0400
+++ b/g_pos.py	Mon Nov 20 22:52:11 2017 -0500
@@ -1,7 +1,10 @@
+from __future__ import print_function, unicode_literals
 import nltk
 import argparse
+import io

 nltk.download('averaged_perceptron_tagger', quiet=True)
+nltk.download('punkt', quiet=True)


 def arguments():
@@ -15,18 +18,20 @@
     """Input: a text file with one token per line
     Output: a version of the text with Part of Speech tags written as word/TAG
     """
-    with open(in_file, 'r') as fd:
+    with open(in_file, 'rb') as fd:
         text = fd.read()
+        text = text.decode('utf-8')

     sentences = nltk.sent_tokenize(text)

-    with open(out_file, 'w') as output:
+    with io.open(out_file, 'w') as output:
         for sentence in sentences:
             tokens = nltk.word_tokenize(sentence)
             postags = nltk.pos_tag(tokens)
             for postag in postags:
                 # print postag
-                output.write("%s/%s " % postag)
+                p = "%s/%s " % postag
+                output.write(p)
         output.write('\n')
--- a/g_pos.xml	Wed Nov 01 01:19:55 2017 -0400
+++ b/g_pos.xml	Mon Nov 20 22:52:11 2017 -0500
@@ -4,7 +4,7 @@
     <requirements>
         <requirement type="package" version="3.2.1">nltk</requirement>
     </requirements>
-
+
     <command interpreter="python">
         g_pos.py --input $input1 --output $postags
     </command>
@@ -18,14 +18,14 @@
                value="POS Tags"/>
     </inputs>
     <outputs>
-        <data format="json" name="postags" label="${job_name}"/>
+        <data format="txt" name="postags" label="${job_name}"/>
     </outputs>

     <tests>
         <test>
-            <param name='input1' value='sample_text_tok.json'/>
+            <param name='input1' value='sample_text.txt'/>
             <param name='job_name' value='testpos1'/>
-            <output name='tokens' file='sample_text_pos.json'/>
+            <output name='postags' file='sample_text_pos.dat'/>
         </test>
     </tests>
--- a/test-data/dependency_resolvers_config.xml	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<dependency_resolvers>
-  <tool_shed_packages />
-  <galaxy_packages />
-  <galaxy_packages versionless="true" />
-  <unlinked_tool_shed_packages />
-</dependency_resolvers>
--- a/test-data/elephant.txt	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-I shot an elephant in my pajamas
--- a/test-data/grammar.dat	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
- S -> NP VP
- PP -> P NP
- NP -> Det N | Det N PP | 'I'
- VP -> V NP | VP PP
- Det -> 'an' | 'my'
- N -> 'elephant' | 'pajamas'
- V -> 'shot'
- P -> 'in'
--- a/test-data/sample_text.txt	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-Some text that is nøt øß ascii
-
- DADA project is developing software for managing language resources and exposing them on the web. Language resources are digital collections of language as audio, video and text used to study language and build technology systems. The project has been going for a while with some initial funding from the ARC to build the basic infrastructure and later from Macquarie University for some work on the Auslan corpus of Australian Sign Language collected by Trevor Johnston. Recently we have two projects which DADA will be part of, and so the pace of development has picked up a little.
-
-The Australian National Corpus (AusNC) is an effort to build a centralised collection of resources of language in Australia.  The core idea is to take whatever existing collections we can get permission to publish and make them available under a common technical infrastructure.  Using some funding from HCSNet we build a small demonstration site that allowed free text search on two collections: the Australian Corpus of English and the Corpus of Oz Early English. We now have some funding to continue this work and expand both the size of the collection and the capability of the infrastructure that will support it. What we’ve already done is to separate the text in these corpora from their meta-data (descriptions of each text) and the annotation (denoting things within the texts).  While the pilot allows searching on the text the next steps will allow search using the meta-data (look for this in texts written after 1900) and the annotation (find this in the titles of articles).  This project is funded by the Australian National Data Service (ANDS) and is a collaboration with Michael Haugh at Griffith.
-
-The Big Australian Speech Corpus, more recently renamed AusTalk, is an ARC funded project to collect speech and video from 1000 Australian speakers for a new freely available corpus.  The project involves many partners around the country each of who will have a ‘black box’ recording station to collect audio and stereo video of subjects reading words and sentences, being interviewed and doing the Map task – a game designed to elicit natural speech between two people.   Our part of the project is to provide the server infrastructure that will store the audio, video and annotation data that will make up the corpus.  DADA will be part of this solution but the main driver is to be able to provide a secure and reliable store for the primary data as it comes in from the collection sites.  An important feature of the collection is the meta-data that will describe the subjects in the recording.  Some annotation of the data will be done automatically, for example some forced alignment of the read words and sentences.  Later, we will move on to support manual annotation of some of the data – for example transcripts of the interviews and map task sessions.   All of this will be published via the DADA server infrastructure to create a large, freely available research collection for Australian English.
-
-Since the development of DADA now involves people outside Macquarie, we have started using a public bitbucket repository for the code.  As of this writing the code still needs some tidying and documentation to enable third parties to be able to install and work on it, but we hope to have that done within a month.   The public DADA demo site is down at the moment due to network upgrades at Macquarie (it’s only visible inside MQ) – I hope to have that fixed soon with some new sample data sets loaded up for testing. 2011 looks like it will be a significant year for DADA. We hope to end this year with a number of significant text, audio and video corpora hosted on DADA infrastructure and providing useful services to the linguistics and language technology communities.
--- a/test-data/sample_text_frequency.dat	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,294 +0,0 @@
-Word	Count	Percent
-the	44	6.32
-of	26	3.74
-and	25	3.59
-.	24	3.45
-to	23	3.30
-a	15	2.16
-,	12	1.72
-for	12	1.72
-will	12	1.72
-is	11	1.58
-DADA	9	1.29
-some	8	1.15
-(	7	1.01
-be	7	1.01
-on	7	1.01
-that	7	1.01
-this	7	1.01
-Australian	7	1.01
-)	7	1.01
-The	7	1.01
-text	6	0.86
-project	6	0.86
-we	6	0.86
-infrastructure	6	0.86
-from	6	0.86
-have	6	0.86
-in	6	0.86
-video	5	0.72
-language	5	0.72
-data	5	0.72
-it	5	0.72
-collection	5	0.72
-annotation	5	0.72
-Corpus	4	0.57
-with	4	0.57
-build	4	0.57
-audio	4	0.57
-hope	3	0.43
-collections	3	0.43
-resources	3	0.43
-funding	3	0.43
-available	3	0.43
-English	3	0.43
-meta-data	3	0.43
-Macquarie	3	0.43
-done	3	0.43
-two	3	0.43
-corpus	3	0.43
-part	3	0.43
-work	3	0.43
-up	3	0.43
-at	3	0.43
--	3	0.43
-code	2	0.29
-people	2	0.29
-We	2	0.29
-but	2	0.29
-has	2	0.29
-them	2	0.29
-example	2	0.29
-words	2	0.29
-using	2	0.29
-now	2	0.29
-collect	2	0.29
-each	2	0.29
-corpora	2	0.29
-year	2	0.29
-server	2	0.29
-new	2	0.29
-public	2	0.29
-by	2	0.29
-search	2	0.29
-store	2	0.29
-involves	2	0.29
-within	2	0.29
-texts	2	0.29
-support	2	0.29
-Language	2	0.29
-sentences	2	0.29
-freely	2	0.29
-National	2	0.29
-funded	2	0.29
-site	2	0.29
-an	2	0.29
-as	2	0.29
-able	2	0.29
-make	2	0.29
-subjects	2	0.29
-speech	2	0.29
-development	2	0.29
-recording	2	0.29
-I	2	0.29
-significant	2	0.29
-task	2	0.29
-provide	2	0.29
-ARC	2	0.29
-demo	1	0.14
-automatically	1	0.14
-What	1	0.14
-Service	1	0.14
-being	1	0.14
-both	1	0.14
-soon	1	0.14
-existing	1	0.14
-large	1	0.14
-via	1	0.14
-looks	1	0.14
-Haugh	1	0.14
-still	1	0.14
-find	1	0.14
-alignment	1	0.14
-web	1	0.14
-Recently	1	0.14
-writing	1	0.14
-linguistics	1	0.14
-only	1	0.14
-going	1	0.14
-systems	1	0.14
-under	1	0.14
-Using	1	0.14
-2011	1	0.14
-take	1	0.14
-move	1	0.14
-around	1	0.14
-get	1	0.14
-read	1	0.14
-providing	1	0.14
-Michael	1	0.14
-number	1	0.14
-Project	1	0.14
-next	1	0.14
-While	1	0.14
-Oz	1	0.14
-communities	1	0.14
-comes	1	0.14
-projects	1	0.14
-articles	1	0.14
-like	1	0.14
-visible	1	0.14
-manual	1	0.14
-solution	1	0.14
-'ve	1	0.14
-capability	1	0.14
-these	1	0.14
-continue	1	0.14
-steps	1	0.14
-common	1	0.14
-small	1	0.14
-Speech	1	0.14
-fixed	1	0.14
-Griffith	1	0.14
-searching	1	0.14
-core	1	0.14
-doing	1	0.14
-Since	1	0.14
-idea	1	0.14
-All	1	0.14
-titles	1	0.14
-are	1	0.14
-picked	1	0.14
-Some	1	0.14
-network	1	0.14
-renamed	1	0.14
-managing	1	0.14
-sites	1	0.14
-publish	1	0.14
-research	1	0.14
-Later	1	0.14
-AusNC	1	0.14
-written	1	0.14
-between	1	0.14
-technology	1	0.14
-reading	1	0.14
-can	1	0.14
-recently	1	0.14
-repository	1	0.14
-partners	1	0.14
-This	1	0.14
-University	1	0.14
-hosted	1	0.14
-free	1	0.14
-box	1	0.14
-exposing	1	0.14
-technical	1	0.14
-study	1	0.14
-allows	1	0.14
-forced	1	0.14
-Sign	1	0.14
-published	1	0.14
-map	1	0.14
-MQ	1	0.14
-month	1	0.14
-interviews	1	0.14
-software	1	0.14
-already	1	0.14
-useful	1	0.14
-secure	1	0.14
-'black	1	0.14
-primary	1	0.14
-whatever	1	0.14
-Update	1	0.14
-1000	1	0.14
-parties	1	0.14
-loaded	1	0.14
-centralised	1	0.14
-Auslan	1	0.14
-1900	1	0.14
-size	1	0.14
-little	1	0.14
-Australia	1	0.14
-initial	1	0.14
-been	1	0.14
-Early	1	0.14
-their	1	0.14
-station	1	0.14
-down	1	0.14
-basic	1	0.14
-collected	1	0.14
-:	1	0.14
-Data	1	0.14
-ANDS	1	0.14
-more	1	0.14
-describe	1	0.14
-HCSNet	1	0.14
-denoting	1	0.14
-interviewed	1	0.14
-Trevor	1	0.14
-bitbucket	1	0.14
-testing	1	0.14
-Johnston	1	0.14
-effort	1	0.14
-pilot	1	0.14
-upgrades	1	0.14
-main	1	0.14
-look	1	0.14
-developing	1	0.14
-reliable	1	0.14
-pace	1	0.14
-while	1	0.14
-technoogy	1	0.14
-install	1	0.14
-Our	1	0.14
-transcripts	1	0.14
-country	1	0.14
-descriptions	1	0.14
-due	1	0.14
-documentation	1	0.14
-allowed	1	0.14
-sample	1	0.14
-enable	1	0.14
-create	1	0.14
-demonstration	1	0.14
-Map	1	0.14
-speakers	1	0.14
-inside	1	0.14
-end	1	0.14
-sessions	1	0.14
-things	1	0.14
-permission	1	0.14
-feature	1	0.14
-who	1	0.14
-started	1	0.14
-which	1	0.14
-digital	1	0.14
-many	1	0.14
-outside	1	0.14
-used	1	0.14
-'s	1	0.14
-separate	1	0.14
-collaboration	1	0.14
-after	1	0.14
-driver	1	0.14
-needs	1	0.14
-moment	1	0.14
-important	1	0.14
-designed	1	0.14
-tidying	1	0.14
-services	1	0.14
-elicit	1	0.14
-AusTalk	1	0.14
-expand	1	0.14
-stereo	1	0.14
-natural	1	0.14
-'	1	0.14
-third	1	0.14
-later	1	0.14
-game	1	0.14
-An	1	0.14
-As	1	0.14
-so	1	0.14
-Big	1	0.14
-allow	1	0.14
-sets	1	0.14
--- a/test-data/sample_text_lower.txt	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,696 +0,0 @@
-dada
-project
-update
-the
-dada
-project
-is
-developing
-software
-for
-managing
-language
-resources
-and
-exposing
-them
-on
-the
-web
-.
-language
-resources
-are
-digital
-collections
-of
-language
-as
-audio
-,
-video
-and
-text
-used
-to
-study
-language
-and
-build
-technoogy
-systems
-.
-the
-project
-has
-been
-going
-for
-a
-while
-with
-some
-initial
-funding
-from
-the
-arc
-to
-build
-the
-basic
-infrastructure
-and
-later
-from
-macquarie
-university
-for
-some
-work
-on
-the
-auslan
-corpus
-of
-australian
-sign
-language
-collected
-by
-trevor
-johnston
-.
-recently
-we
-have
-two
-projects
-which
-dada
-will
-be
-part
-of
-,
-and
-so
-the
-pace
-of
-development
-has
-picked
-up
-a
-little
-.
-the
-australian
-national
-corpus
-(
-ausnc
-)
-is
-an
-effort
-to
-build
-a
-centralised
-collection
-of
-resources
-of
-language
-in
-australia
-.
-the
-core
-idea
-is
-to
-take
-whatever
-existing
-collections
-we
-can
-get
-permission
-to
-publish
-and
-make
-them
-available
-under
-a
-common
-technical
-infrastructure
-.
-using
-some
-funding
-from
-hcsnet
-we
-build
-a
-small
-demonstration
-site
-that
-allowed
-free
-text
-search
-on
-two
-collections
-:
-the
-australian
-corpus
-of
-english
-and
-the
-corpus
-of
-oz
-early
-english
-.
-we
-now
-have
-some
-funding
-to
-continue
-this
-work
-and
-expand
-both
-the
-size
-of
-the
-collection
-and
-the
-capability
-of
-the
-infrastructure
-that
-will
-support
-it
-.
-what
-we
-'ve
-already
-done
-is
-to
-separate
-the
-text
-in
-these
-corpora
-from
-their
-meta-data
-(
-descriptions
-of
-each
-text
-)
-and
-the
-annotation
-(
-denoting
-things
-within
-the
-texts
-)
-.
-while
-the
-pilot
-allows
-searching
-on
-the
-text
-the
-next
-steps
-will
-allow
-search
-using
-the
-meta-data
-(
-look
-for
-this
-in
-texts
-written
-after
-1900
-)
-and
-the
-annotation
-(
-find
-this
-in
-the
-titles
-of
-articles
-)
-.
-this
-project
-is
-funded
-by
-the
-australian
-national
-data
-service
-(
-ands
-)
-and
-is
-a
-collaboration
-with
-michael
-haugh
-at
-griffith
-.
-the
-big
-australian
-speech
-corpus
-,
-more
-recently
-renamed
-austalk
-,
-is
-an
-arc
-funded
-project
-to
-collect
-speech
-and
-video
-from
-1000
-australian
-speakers
-for
-a
-new
-freely
-available
-corpus
-.
-the
-project
-involves
-many
-partners
-around
-the
-country
-each
-of
-who
-will
-have
-a
-'black
-box
-'
-recording
-station
-to
-collect
-audio
-and
-stereo
-video
-of
-subjects
-reading
-words
-and
-sentences
-,
-being
-interviewed
-and
-doing
-the
-map
-task
--
-a
-game
-designed
-to
-elicit
-natural
-speech
-between
-two
-people
-.
-our
-part
-of
-the
-project
-is
-to
-provide
-the
-server
-infrastructure
-that
-will
-store
-the
-audio
-,
-video
-and
-annotation
-data
-that
-will
-make
-up
-the
-corpus
-.
-dada
-will
-be
-part
-of
-this
-solution
-but
-the
-main
-driver
-is
-to
-be
-able
-to
-provide
-a
-secure
-and
-reliable
-store
-for
-the
-primary
-data
-as
-it
-comes
-in
-from
-the
-collection
-sites
-.
-an
-important
-feature
-of
-the
-collection
-is
-the
-meta-data
-that
-will
-describe
-the
-subjects
-in
-the
-recording
-.
-some
-annotation
-of
-the
-data
-will
-be
-done
-automatically
-,
-for
-example
-some
-forced
-alignment
-of
-the
-read
-words
-and
-sentences
-.
-later
-,
-we
-will
-move
-on
-to
-support
-manual
-annotation
-of
-some
-of
-the
-data
--
-for
-example
-transcripts
-of
-the
-interviews
-and
-map
-task
-sessions
-.
-all
-of
-this
-will
-be
-published
-via
-the
-dada
-server
-infrastructure
-to
-create
-a
-large
-,
-freely
-available
-research
-collection
-for
-australian
-english
-.
-since
-the
-development
-of
-dada
-now
-involves
-people
-outside
-macquarie
-,
-i
-have
-started
-using
-a
-public
-bitbucket
-repository
-for
-the
-code
-.
-as
-of
-this
-writing
-the
-code
-still
-needs
-some
-tidying
-and
-documentation
-to
-enable
-third
-parties
-to
-be
-able
-to
-install
-and
-work
-on
-it
-,
-but
-we
-hope
-to
-have
-that
-done
-within
-a
-month
-.
-the
-public
-dada
-demo
-site
-is
-down
-at
-the
-moment
-due
-to
-network
-upgrades
-at
-macquarie
-(
-it
-'s
-only
-visible
-inside
-mq
-)
--
-i
-hope
-to
-have
-that
-fixed
-soon
-with
-some
-new
-sample
-data
-sets
-loaded
-up
-for
-testing
-.
-2011
-looks
-like
-it
-will
-be
-a
-significant
-year
-for
-dada
-.
-we
-hope
-to
-end
-this
-year
-with
-a
-number
-of
-significant
-text
-,
-audio
-and
-video
-corpora
-hosted
-on
-dada
-infrastructure
-and
-providing
-useful
-services
-to
-the
-linguistics
-and
-language
-technology
-communities
-.
--- a/test-data/sample_text_lower_nopunct.txt	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,641 +0,0 @@
-dada
-project
-update
-the
-dada
-project
-is
-developing
-software
-for
-managing
-language
-resources
-and
-exposing
-them
-on
-the
-web
-language
-resources
-are
-digital
-collections
-of
-language
-as
-audio
-video
-and
-text
-used
-to
-study
-language
-and
-build
-technoogy
-systems
-the
-project
-has
-been
-going
-for
-a
-while
-with
-some
-initial
-funding
-from
-the
-arc
-to
-build
-the
-basic
-infrastructure
-and
-later
-from
-macquarie
-university
-for
-some
-work
-on
-the
-auslan
-corpus
-of
-australian
-sign
-language
-collected
-by
-trevor
-johnston
-recently
-we
-have
-two
-projects
-which
-dada
-will
-be
-part
-of
-and
-so
-the
-pace
-of
-development
-has
-picked
-up
-a
-little
-the
-australian
-national
-corpus
-ausnc
-is
-an
-effort
-to
-build
-a
-centralised
-collection
-of
-resources
-of
-language
-in
-australia
-the
-core
-idea
-is
-to
-take
-whatever
-existing
-collections
-we
-can
-get
-permission
-to
-publish
-and
-make
-them
-available
-under
-a
-common
-technical
-infrastructure
-using
-some
-funding
-from
-hcsnet
-we
-build
-a
-small
-demonstration
-site
-that
-allowed
-free
-text
-search
-on
-two
-collections
-the
-australian
-corpus
-of
-english
-and
-the
-corpus
-of
-oz
-early
-english
-we
-now
-have
-some
-funding
-to
-continue
-this
-work
-and
-expand
-both
-the
-size
-of
-the
-collection
-and
-the
-capability
-of
-the
-infrastructure
-that
-will
-support
-it
-what
-we
-'ve
-already
-done
-is
-to
-separate
-the
-text
-in
-these
-corpora
-from
-their
-meta-data
-descriptions
-of
-each
-text
-and
-the
-annotation
-denoting
-things
-within
-the
-texts
-while
-the
-pilot
-allows
-searching
-on
-the
-text
-the
-next
-steps
-will
-allow
-search
-using
-the
-meta-data
-look
-for
-this
-in
-texts
-written
-after
-1900
-and
-the
-annotation
-find
-this
-in
-the
-titles
-of
-articles
-this
-project
-is
-funded
-by
-the
-australian
-national
-data
-service
-ands
-and
-is
-a
-collaboration
-with
-michael
-haugh
-at
-griffith
-the
-big
-australian
-speech
-corpus
-more
-recently
-renamed
-austalk
-is
-an
-arc
-funded
-project
-to
-collect
-speech
-and
-video
-from
-1000
-australian
-speakers
-for
-a
-new
-freely
-available
-corpus
-the
-project
-involves
-many
-partners
-around
-the
-country
-each
-of
-who
-will
-have
-a
-'black
-box
-recording
-station
-to
-collect
-audio
-and
-stereo
-video
-of
-subjects
-reading
-words
-and
-sentences
-being
-interviewed
-and
-doing
-the
-map
-task
-a
-game
-designed
-to
-elicit
-natural
-speech
-between
-two
-people
-our
-part
-of
-the
-project
-is
-to
-provide
-the
-server
-infrastructure
-that
-will
-store
-the
-audio
-video
-and
-annotation
-data
-that
-will
-make
-up
-the
-corpus
-dada
-will
-be
-part
-of
-this
-solution
-but
-the
-main
-driver
-is
-to
-be
-able
-to
-provide
-a
-secure
-and
-reliable
-store
-for
-the
-primary
-data
-as
-it
-comes
-in
-from
-the
-collection
-sites
-an
-important
-feature
-of
-the
-collection
-is
-the
-meta-data
-that
-will
-describe
-the
-subjects
-in
-the
-recording
-some
-annotation
-of
-the
-data
-will
-be
-done
-automatically
-for
-example
-some
-forced
-alignment
-of
-the
-read
-words
-and
-sentences
-later
-we
-will
-move
-on
-to
-support
-manual
-annotation
-of
-some
-of
-the
-data
-for
-example
-transcripts
-of
-the
-interviews
-and
-map
-task
-sessions
-all
-of
-this
-will
-be
-published
-via
-the
-dada
-server
-infrastructure
-to
-create
-a
-large
-freely
-available
-research
-collection
-for
-australian
-english
-since
-the
-development
-of
-dada
-now
-involves
-people
-outside
-macquarie
-i
-have
-started
-using
-a
-public
-bitbucket
-repository
-for
-the
-code
-as
-of
-this
-writing
-the
-code
-still
-needs
-some
-tidying
-and
-documentation
-to
-enable
-third
-parties
-to
-be
-able
-to
-install
-and
-work
-on
-it
-but
-we
-hope
-to
-have
-that
-done
-within
-a
-month
-the
-public
-dada
-demo
-site
-is
-down
-at
-the
-moment
-due
-to
-network
-upgrades
-at
-macquarie
-it
-'s
-only
-visible
-inside
-mq
-i
-hope
-to
-have
-that
-fixed
-soon
-with
-some
-new
-sample
-data
-sets
-loaded
-up
-for
-testing
-2011
-looks
-like
-it
-will
-be
-a
-significant
-year
-for
-dada
-we
-hope
-to
-end
-this
-year
-with
-a
-number
-of
-significant
-text
-audio
-and
-video
-corpora
-hosted
-on
-dada
-infrastructure
-and
-providing
-useful
-services
-to
-the
-linguistics
-and
-language
-technology
-communities
--- a/test-data/sample_text_tok.txt	Wed Nov 01 01:19:55 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,696 +0,0 @@
-DADA
-Project
-Update
-The
-DADA
-project
-is
-developing
-software
-for
-managing
-language
-resources
-and
-exposing
-them
-on
-the
-web
-.
-Language
-resources
-are
-digital
-collections
-of
-language
-as
-audio
-,
-video
-and
-text
-used
-to
-study
-language
-and
-build
-technoogy
-systems
-.
-The
-project
-has
-been
-going
-for
-a
-while
-with
-some
-initial
-funding
-from
-the
-ARC
-to
-build
-the
-basic
-infrastructure
-and
-later
-from
-Macquarie
-University
-for
-some
-work
-on
-the
-Auslan
-corpus
-of
-Australian
-Sign
-Language
-collected
-by
-Trevor
-Johnston
-.
-Recently
-we
-have
-two
-projects
-which
-DADA
-will
-be
-part
-of
-,
-and
-so
-the
-pace
-of
-development
-has
-picked
-up
-a
-little
-.
-The
-Australian
-National
-Corpus
-(
-AusNC
-)
-is
-an
-effort
-to
-build
-a
-centralised
-collection
-of
-resources
-of
-language
-in
-Australia
-.
-The
-core
-idea
-is
-to
-take
-whatever
-existing
-collections
-we
-can
-get
-permission
-to
-publish
-and
-make
-them
-available
-under
-a
-common
-technical
-infrastructure
-.
-Using
-some
-funding
-from
-HCSNet
-we
-build
-a
-small
-demonstration
-site
-that
-allowed
-free
-text
-search
-on
-two
-collections
-:
-the
-Australian
-Corpus
-of
-English
-and
-the
-Corpus
-of
-Oz
-Early
-English
-.
-We
-now
-have
-some
-funding
-to
-continue
-this
-work
-and
-expand
-both
-the
-size
-of
-the
-collection
-and
-the
-capability
-of
-the
-infrastructure
-that
-will
-support
-it
-.
-What
-we
-'ve
-already
-done
-is
-to
-separate
-the
-text
-in
-these
-corpora
-from
-their
-meta-data
-(
-descriptions
-of
-each
-text
-)
-and
-the
-annotation
-(
-denoting
-things
-within
-the
-texts
-)
-.
-While
-the
-pilot
-allows
-searching
-on
-the
-text
-the
-next
-steps
-will
-allow
-search
-using
-the
-meta-data
-(
-look
-for
-this
-in
-texts
-written
-after
-1900
-)
-and
-the
-annotation
-(
-find
-this
-in
-the
-titles
-of
-articles
-)
-.
-This
-project
-is
-funded
-by
-the
-Australian
-National
-Data
-Service
-(
-ANDS
-)
-and
-is
-a
-collaboration
-with
-Michael
-Haugh
-at
-Griffith
-.
-The
-Big
-Australian
-Speech
-Corpus
-,
-more
-recently
-renamed
-AusTalk
-,
-is
-an
-ARC
-funded
-project
-to
-collect
-speech
-and
-video
-from
-1000
-Australian
-speakers
-for
-a
-new
-freely
-available
-corpus
-.
-The
-project
-involves
-many
-partners
-around
-the
-country
-each
-of
-who
-will
-have
-a
-'black
-box
-'
-recording
-station
-to
-collect
-audio
-and
-stereo
-video
-of
-subjects
-reading
-words
-and
-sentences
-,
-being
-interviewed
-and
-doing
-the
-Map
-task
--
-a
-game
-designed
-to
-elicit
-natural
-speech
-between
-two
-people
-.
-Our
-part
-of
-the
-project
-is
-to
-provide
-the
-server
-infrastructure
-that
-will
-store
-the
-audio
-,
-video
-and
-annotation
-data
-that
-will
-make
-up
-the
-corpus
-.
-DADA
-will
-be
-part
-of
-this
-solution
-but
-the
-main
-driver
-is
-to
-be
-able
-to
-provide
-a
-secure
-and
-reliable
-store
-for
-the
-primary
-data
-as
-it
-comes
-in
-from
-the
-collection
-sites
-.
-An
-important
-feature
-of
-the
-collection
-is
-the
-meta-data
-that
-will
-describe
-the
-subjects
-in
-the
-recording
-.
-Some
-annotation
-of
-the
-data
-will
-be
-done
-automatically
-,
-for
-example
-some
-forced
-alignment
-of
-the
-read
-words
-and
-sentences
-.
-Later
-,
-we
-will
-move
-on
-to
-support
-manual
-annotation
-of
-some
-of
-the
-data
--
-for
-example
-transcripts
-of
-the
-interviews
-and
-map
-task
-sessions
-.
-All
-of
-this
-will
-be
-published
-via
-the
-DADA
-server
-infrastructure
-to
-create
-a
-large
-,
-freely
-available
-research
-collection
-for
-Australian
-English
-.
-Since
-the
-development
-of
-DADA
-now
-involves
-people
-outside
-Macquarie
-,
-I
-have
-started
-using
-a
-public
-bitbucket
-repository
-for
-the
-code
-.
-As
-of
-this
-writing
-the
-code
-still
-needs
-some
-tidying
-and
-documentation
-to
-enable
-third
-parties
-to
-be
-able
-to
-install
-and
-work
-on
-it
-,
-but
-we
-hope
-to
-have
-that
-done
-within
-a
-month
-.
-The
-public
-DADA
-demo
-site
-is
-down
-at
-the
-moment
-due
-to
-network
-upgrades
-at
-Macquarie
-(
-it
-'s
-only
-visible
-inside
-MQ
-)
--
-I
-hope
-to
-have
-that
-fixed
-soon
-with
-some
-new
-sample
-data
-sets
-loaded
-up
-for
-testing
-.
-2011
-looks
-like
-it
-will
-be
-a
-significant
-year
-for
-DADA
-.
-We
-hope
-to
-end
-this
-year
-with
-a
-number
-of
-significant
-text
-,
-audio
-and
-video
-corpora
-hosted
-on
-DADA
-infrastructure
-and
-providing
-useful
-services
-to
-the
-linguistics
-and
-language
-technology
-communities
-.