Mercurial > repos > stevecassidy > wrassp

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cut_timeseries.py	Thu Dec 08 01:45:31 2016 -0500
@@ -0,0 +1,100 @@
+import csv
+import argparse
+
+
+
+def parser():
+    """Parse the command line of the cut_timeseries script.
+
+    Every option is required.  Returns the argparse namespace with the
+    attributes segment_list, timeseries, identifier, cutat (a float)
+    and output_path.
+    """
+    cli = argparse.ArgumentParser(
+        description="Cut data for a segment from a timeseries")
+
+    # (option, value type, help text), in the order shown by --help
+    options = [
+        ('--segment_list', str, "File containing list of item URLs"),
+        ('--timeseries', str, "time series data (comma separated file names)"),
+        ('--identifier', str, "Time series dataset identifiers (comma separated)"),
+        ('--cutat', float, "cut point 0-1"),
+        ('--output_path', str, "Path to output file"),
+    ]
+    for flag, value_type, text in options:
+        cli.add_argument(flag, required=True, action="store",
+                         type=value_type, help=text)
+
+    return cli.parse_args()
+
+
+def read_segment_list(filename):
+    """Read a segment list from a tab-separated file.
+
+    The first line of the file is taken as the column names; the
+    columns expected for this format are start, end, label, duration
+    and identifier.
+
+    Return a list with one dictionary per data row, keyed by column
+    name, in file order.  Return None when the file has no
+    'identifier' column, which includes a completely empty file.
+    """
+
+    with open(filename) as fd:
+        csvreader = csv.DictReader(fd, dialect='excel-tab')
+
+        # fieldnames is None when the file has no header line at all
+        # (empty file); guard it so the membership test below cannot
+        # raise TypeError
+        fieldnames = csvreader.fieldnames or []
+        if 'identifier' not in fieldnames:
+            return None
+
+        return list(csvreader)
+
+def get_tsfile(ident, tsfiles):
+    """Look up the time series file for a segment identifier.
+
+    tsfiles is an iterable of (dataset identifier, file name) pairs.
+    The file name of the first pair whose dataset identifier contains
+    ident (tested with the `in` operator) is returned, or '' when no
+    pair matches.
+    """
+
+    matching = (dsname for tsid, dsname in tsfiles if ident in tsid)
+    return next(matching, '')
+
+
+def cut(tsfiles, segfile, cutpoint):
+    """Cut one row of time series data for every segment in segfile.
+
+    tsfiles  -- iterable of (dataset identifier, file name) pairs, used
+                by get_tsfile to find the time series of each segment
+    segfile  -- name of the tab-separated segment list; the columns
+                start, end, label and identifier are read
+    cutpoint -- relative position (0-1) inside the segment at which the
+                row is taken
+
+    Each time series file is read as tab-separated text.  A row whose
+    first cell is 'sampletime' is the column header; every other row
+    must start with a numeric sample time.  The rows whose sample time
+    lies strictly between the segment start and end are collected and
+    the one at index int(cutpoint * count) is selected.  The index is
+    clamped to the collected rows, so a cutpoint of 1.0 selects the
+    last one.
+
+    Segments without a matching time series, or without any sample
+    inside the segment, are skipped.
+
+    Return a tuple (headers, rows): headers is ['identifier', 'label']
+    followed by the time series header; each row is the identifier and
+    label followed by the selected time series row.
+
+    Raise ValueError if the segment list has no 'identifier' column.
+    """
+
+    # the pairs are searched once per segment, so a one-shot iterable
+    # (such as zip() under Python 3) has to be materialised first
+    tsfiles = list(tsfiles)
+
+    segments = read_segment_list(segfile)
+    if segments is None:
+        raise ValueError("segment list %s has no 'identifier' column" % segfile)
+
+    headers = ['identifier', 'label']
+    # stays empty when no time series file with a header row is read
+    tsheader = []
+    result = []
+
+    for seg in segments:
+
+        start = float(seg['start'])
+        end = float(seg['end'])
+        label = seg['label']
+        ident = seg['identifier']
+        tsfile = get_tsfile(ident, tsfiles)
+
+        if tsfile == '':
+            continue
+
+        collect = []
+        with open(tsfile, 'r') as fd:
+            reader = csv.reader(fd, dialect=csv.excel_tab)
+            for row in reader:
+                if not row:
+                    # blank line in the file, nothing to inspect
+                    continue
+                if row[0] == 'sampletime':
+                    tsheader = row
+                elif start < float(row[0]) < end:
+                    collect.append(row)
+
+        if not collect:
+            # no sample falls inside this segment
+            continue
+
+        # grab the row at the cut point, keeping the index inside collect
+        n = int(cutpoint * len(collect))
+        n = max(0, min(n, len(collect) - 1))
+        row = [ident, label]
+        row.extend(collect[n])
+        result.append(row)
+
+    headers.extend(tsheader)
+    return (headers, result)
+
+if __name__=='__main__':
+
+    args = parser()
+
+    # the time series files and their identifiers both arrive as comma
+    # separated strings and are paired up by position
+    tsfiles = args.timeseries.split(',')
+    tsidents = args.identifier.split(',')
+
+    # with differing counts zip() would silently drop the surplus items
+    if len(tsfiles) != len(tsidents):
+        raise SystemExit("--timeseries and --identifier must list the same number of items")
+
+    # pass a real list: cut() searches the pairs once per segment, and a
+    # zip() iterator (Python 3) would be used up by the first lookups
+    pairs = list(zip(tsidents, tsfiles))
+    headers, rows = cut(pairs, args.segment_list, args.cutat)
+
+    # NOTE(review): csv.writer's default dialect writes comma separated
+    # values while the inputs are read as tab separated - confirm that
+    # this is the intended output format
+    with open(args.output_path, 'w') as out:
+        writer = csv.writer(out)
+        writer.writerow(headers)
+        writer.writerows(rows)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cut_timeseries.xml	Thu Dec 08 01:45:31 2016 -0500
@@ -0,0 +1,33 @@
+<tool id="cut_timeseries" name="Cut data from a timeseries" version="0.1">
+    <!-- Tool definition that runs cut_timeseries.py (this looks like a
+         Galaxy tool wrapper; confirm against the hosting platform). -->
+    <description></description>
+
+    <!-- The selected time series datasets are passed twice, in the same
+         order: once as a comma separated list of their string values
+         (timeseries option) and once as a comma separated list of their
+         element identifiers (identifier option).  The script pairs the
+         two lists up by position. -->
+    <command interpreter="python">
+        cut_timeseries.py --timeseries "${",".join(map(str, $timeseries))}" --identifier "${",".join(map(str, [t.element_identifier for t in $timeseries]))}" --cutat ${cutat} --segment_list '${segment_list}' --output_path '$output'
+    </command>
+
+    <inputs>
+        <!-- one or more tabular time series datasets -->
+        <param name="timeseries" type="data" multiple="true" format="tabular" label="Time Series"/>
+        <!-- relative position inside each segment, defaults to the midpoint -->
+        <param name="cutat" type="float" label="Cut point (0-1)" value="0.5"/>
+        <!-- tabular segment list read by the script -->
+        <param name="segment_list" type="data" format="tabular" label="Segment List"/>
+    </inputs>
+    <outputs>
+        <!-- NOTE(review): declared as tabular, but the script writes this
+             file with csv.writer's default (comma separated) dialect;
+             confirm which format is intended -->
+        <data format="tabular" name="output"/>
+    </outputs>
+    <tests>
+        <!-- Runs the tool on one formants file and a segment list and
+             checks only that the output contains the three texts below. -->
+        <test>
+            <param name="cutat" value="0.5"/>
+            <param name="timeseries" value="1_1119_2_22_001-formants.dat"/>
+            <param name="segment_list" value="segmentlist.dat"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text text="sampletime"/>
+                    <has_text text="@"/>
+                    <has_text text="1_1119_2_22_001"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <!-- The help section is currently empty. -->
+    <help><![CDATA[
+
+    ]]></help>
+</tool>