comparison cut_timeseries.py @ 2:6f4db0e89117 draft

planemo upload commit 2acf1e5f5efe0ee3ef51a611a3f7c94ce73d3b89-dirty
author stevecassidy
date Thu, 08 Dec 2016 01:45:31 -0500
parents
children f188eb0b526d
comparison
equal deleted inserted replaced
1:dbcf9bc275e3 2:6f4db0e89117
1 import csv
2 import argparse
3
4
5
def parser():
    """Define the command-line options and parse the process arguments.

    Every option is required:
      --segment_list  file containing the segment list
      --timeseries    comma separated time series file names
      --identifier    comma separated time series dataset identifiers
      --cutat         cut point as a float (0-1)
      --output_path   path of the output file

    Returns the namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(description="Cut data for a segment from a timeseries")

    # (flag, value type, help text) for each required option, in display order
    options = (
        ('--segment_list', str, "File containing list of item URLs"),
        ('--timeseries', str, "time series data (comma separated file names)"),
        ('--identifier', str, "Time series dataset identifiers (comma separated)"),
        ('--cutat', float, "cut point 0-1"),
        ('--output_path', str, "Path to output file"),
    )
    for flag, value_type, help_text in options:
        cli.add_argument(flag, required=True, action="store", type=value_type, help=help_text)

    return cli.parse_args()
14
15
def read_segment_list(filename):
    """Read a segment list from a tab-separated file.

    The first line of the file is used as the column header (for example
    start, end, label, duration, identifier). Only the presence of the
    'identifier' column is checked here.

    Args:
        filename: path of the tabular segment file.

    Returns:
        A list with one dictionary per data row, in file order, mapping
        column name to the (string) cell value. ``None`` when the file is
        empty or its header has no 'identifier' column.
    """
    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')

        # fieldnames is None when the file has no header line at all; treat
        # that the same way as a header lacking the required column instead
        # of failing on "'identifier' not in None".
        fieldnames = csvreader.fieldnames
        if fieldnames is None or 'identifier' not in fieldnames:
            return None

        return list(csvreader)
34
def get_tsfile(ident, tsfiles):
    """Look up the time series file that belongs to an identifier.

    tsfiles is an iterable of (dataset identifier, file name) pairs. The
    file name of the first pair whose dataset identifier contains ident
    is returned; an empty string is returned when no pair matches.
    """
    matching_files = (filename for dataset_id, filename in tsfiles if ident in dataset_id)
    return next(matching_files, '')
43
44
def cut(tsfiles, segfile, cutpoint):
    """Pick one time series sample per segment at a relative cut point.

    For every segment listed in segfile, the matching time series file is
    scanned for samples whose time (first column) lies strictly between the
    segment's start and end. From those samples the one at position
    ``int(cutpoint * number_of_samples)`` is selected, so 0 picks the first
    collected sample and 0.5 one near the middle.

    Args:
        tsfiles: iterable of (dataset identifier, file name) pairs. It is
            copied into a list, so a one-shot iterator such as Python 3's
            ``zip`` is safe to pass.
        segfile: path of the tab-separated segment list; each row must
            provide the columns start, end, label and identifier.
        cutpoint: relative position within the segment, expected in 0-1.
            The resulting index is clamped to the collected samples, so
            1.0 selects the last sample instead of running off the end.

    Returns:
        A tuple (headers, rows). headers is ['identifier', 'label'] followed
        by the most recently read time series header row (the row whose
        first cell is 'sampletime'), if any was seen. Each row is
        [identifier, label] followed by the columns of the selected sample.
        Segments without a matching time series file, or without any
        sample inside the segment, are left out.

    Raises:
        ValueError: if read_segment_list returns None for segfile (no
            usable 'identifier' column), or a start, end or sample time
            cannot be parsed as a float.
    """
    # A lazy iterator (e.g. zip() on Python 3) would be partly consumed by
    # the first lookup; keep a list so every segment sees every pair.
    tsfiles = list(tsfiles)

    segments = read_segment_list(segfile)
    if segments is None:
        raise ValueError("segment list %s has no 'identifier' column" % segfile)

    headers = ['identifier', 'label']
    tsheader = []  # stays empty when no time series file supplies a header
    result = []

    for seg in segments:
        start = float(seg['start'])
        end = float(seg['end'])
        label = seg['label']
        ident = seg['identifier']

        tsfile = get_tsfile(ident, tsfiles)
        if tsfile == '':
            # no time series data for this segment
            continue

        collect = []
        with open(tsfile, 'r') as fd:
            reader = csv.reader(fd, dialect=csv.excel_tab)
            for row in reader:
                if not row:
                    # blank line: nothing to parse
                    continue
                if row[0] == 'sampletime':
                    tsheader = row
                    continue
                sampletime = float(row[0])
                if start < sampletime < end:
                    collect.append(row)

        if not collect:
            # no sample falls inside the segment, so there is nothing to cut
            continue

        # grab the row at the cut point, keeping the index inside collect
        n = int(cutpoint * len(collect))
        n = max(0, min(n, len(collect) - 1))
        result.append([ident, label] + collect[n])

    headers.extend(tsheader)
    return (headers, result)
84
if __name__ == '__main__':

    args = parser()

    # Both options are comma separated lists; pair each dataset identifier
    # with the file name at the same position. The pairs are stored in a
    # list because cut() looks them up once per segment and a bare zip()
    # is a one-shot iterator on Python 3.
    tsfiles = args.timeseries.split(',')
    tsidents = args.identifier.split(',')
    pairs = list(zip(tsidents, tsfiles))

    headers, rows = cut(pairs, args.segment_list, args.cutat)

    # write the header line followed by one comma separated line per row
    with open(args.output_path, 'w') as out:
        writer = csv.writer(out)
        writer.writerow(headers)
        writer.writerows(rows)