aboutsummaryrefslogtreecommitdiff
path: root/pipeline/combine_results.py
blob: 6cb01505b9c01040d95daa34f25b0ab56b2b8d92 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python
"""Combines results from multiple days of a single metric.

Feed it the STATUS.txt files on stdin.  It then finds the corresponding
results.csv, and takes the top N items.

Example:

Date,      "google.com,", yahoo.com
2015-03-01,          0.0,       0.9
2015-03-02,          0.1,       0.8

Dygraphs can load this CSV file directly.

TODO: Use different dygraph API?

Also we need error bars.

  new Dygraph(document.getElementById("graphdiv2"),
              [
                [1,10,100],
                [2,20,80],
                [3,50,60],
                [4,70,80]
              ],
              {
                labels: [ "Date", "failure", "timeout", "google.com" ]
              });
"""

import collections
import csv
import json
import os
import sys

import util


def CombineDistResults(stdin, c_out, num_top):
  """Merge per-day dist results into a single CSV time series.

  Args:
    stdin: iterable of lines, each the path to one day's STATUS.txt file
        (assumed to look like .../2015-03-01/STATUS.txt).
    c_out: csv.writer that receives the combined rows.
    num_top: maximum number of result rows to keep per day.

  Writes a header row ('date' plus one column per reported string), then one
  row per date, sorted by date.  Each cell is a dygraphs 'low;value;high'
  triple, or None when that string has no value for that date.
  """
  dates = []
  var_cols = collections.defaultdict(dict)  # {name: {date: value}}

  seen_dates = set()

  for line in stdin:
    status_path = line.strip()

    # Assume it looks like .../2015-03-01/STATUS.txt
    task_dir = os.path.dirname(status_path)
    date = os.path.basename(task_dir)

    # Get rid of duplicate dates.  These could be caused by retries.
    if date in seen_dates:
      continue

    seen_dates.add(date)

    with open(status_path) as f:
      status = f.readline().split()[0]  # OK, FAIL, TIMEOUT, SKIPPED

    dates.append(date)

    if status != 'OK':
      continue  # won't have results.csv

    results_path = os.path.join(task_dir, 'results.csv')
    with open(results_path) as f:
      c = csv.reader(f)
      # NOTE: next(c) / range(...) instead of the Python-2-only c.next() /
      # xrange so this also runs under Python 3.
      unused_header = next(c)  # header row

      # they are sorted by decreasing "estimate", which is what we want
      for i in range(num_top):
        try:
          row = next(c)
        except StopIteration:
          # It's OK if it doesn't have enough
          util.log('Stopping early. Fewer than %d results to render.', num_top)
          break

        string, _, _, proportion, _, prop_low, prop_high = row

        # dygraphs has a weird format with semicolons:
        # value;lower;upper,value;lower;upper.
        # http://dygraphs.com/data.html#csv

        # Arbitrarily use 4 digits after decimal point (for dygraphs, not
        # directly displayed)
        dygraph_triple = '%.4f;%.4f;%.4f' % (
            float(prop_low), float(proportion), float(prop_high))

        var_cols[string][date] = dygraph_triple

  # Now print CSV on stdout.
  cols = sorted(var_cols.keys())  # sort columns alphabetically
  c_out.writerow(['date'] + cols)

  dates.sort()

  for date in dates:
    row = [date]
    for col in cols:
      cell = var_cols[col].get(date)  # None means no value for this date
      row.append(cell)
    c_out.writerow(row)


def CombineAssocResults(stdin, c_out, num_top):
  """Placeholder for combining assoc results: emits a single 'dummy' row.

  Accepts the same arguments as CombineDistResults for interface parity,
  but currently ignores them.
  """
  c_out.writerow(('dummy',))


def main(argv):
  """Entry point: dispatch on argv[1] ('dist' or 'assoc').

  argv[2] is the number of top values to keep per day.  Reads STATUS.txt
  paths from stdin and writes combined CSV to stdout.  Raises RuntimeError
  on an unknown action.
  """
  action = argv[1]

  # Reject unknown actions before touching argv[2], matching the original
  # behavior (argv[2] is only read for valid actions).
  if action not in ('dist', 'assoc'):
    raise RuntimeError('Invalid action %r' % action)

  num_top = int(argv[2])  # number of values to keep
  c_out = csv.writer(sys.stdout)

  if action == 'dist':
    CombineDistResults(sys.stdin, c_out, num_top)
  else:
    CombineAssocResults(sys.stdin, c_out, num_top)


if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError as e:
    # 'except X as e' (valid since Python 2.6) and sys.stderr.write replace
    # the Python-2-only 'except X, e' and 'print >>sys.stderr' forms, which
    # are syntax errors under Python 3.  Output is byte-identical.
    sys.stderr.write('FATAL: %s\n' % e)
    sys.exit(1)