bin/decode_dist.R


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

#!/usr/bin/env Rscript
#
# Command line tool to decode a RAPPOR data set.  It is a simple wrapper for
# Decode() in decode.R.

library(optparse)

#
# Command line parsing.  Do this first before loading libraries to catch errors
# quickly.  Loading libraries in R is slow.
#

# For command line error checking.
UsageError <- function(...) {
  cat(sprintf(...))
  cat('\n')
  quit(status = 1)
}

option_list <- list(
  # Inputs
  make_option("--map", default="", help="Map file (required)"),
  make_option("--counts", default="", help="Counts file (required)"),
  make_option("--params", default="", help="Params file (required)"),
  make_option("--output-dir", dest="output_dir", default=".",
              help="Output directory (default .)"),

  make_option("--correction", default="FDR", help="Correction method"),
  make_option("--alpha", default=.05, help="Alpha level"),

  make_option("--adjust-counts-hack", dest="adjust_counts_hack",
              default=FALSE, action="store_true",
              help="Allow the counts file to have more rows than cohorts. 
                    Most users should not use this.")
)

ParseOptions <- function() {
  # NOTE: This API is bad; if you add positional_arguments, the return value
  # changes!
  parser <- OptionParser(option_list = option_list)
  opts <- parse_args(parser)

  if (opts$map == "") {
    UsageError("--map is required.")
  }
  if (opts$counts == "") {
    UsageError("--counts is required.")
  }
  if (opts$params == "") {
    UsageError("--params is required.")
  }
  return(opts)
}

if (!interactive()) {
  opts <- ParseOptions()
}

#
# Load libraries and source our own code.
#

library(RJSONIO)

# So we don't have to change pwd
source.rappor <- function(rel_path)  {
  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
  source(abs_path)
}

source.rappor("analysis/R/read_input.R")
source.rappor("analysis/R/decode.R")
source.rappor("analysis/R/util.R")

source.rappor("analysis/R/alternative.R")

options(stringsAsFactors = FALSE)


main <- function(opts) {
  Log("decode-dist")
  Log("argv:")
  print(commandArgs(TRUE))

  Log("Loading inputs")

  # Run a single model of all inputs are specified.
  params <- ReadParameterFile(opts$params)
  counts <- ReadCountsFile(opts$counts, params, adjust_counts = opts$adjust_counts_hack)
  counts <- AdjustCounts(counts, params)


  # The left-most column has totals.
  num_reports <- sum(counts[, 1])

  map <- LoadMapFile(opts$map, params)

  Log("Decoding %d reports", num_reports)
  res <- Decode(counts, map$map, params, correction = opts$correction,
                alpha = opts$alpha)
  Log("Done decoding")

  if (nrow(res$fit) == 0) {
    Log("FATAL: Analysis returned no strings.")
    quit(status = 1)
  }

  # Write analysis results as CSV.
  results_csv_path <- file.path(opts$output_dir, 'results.csv')
  write.csv(res$fit, file = results_csv_path, row.names = FALSE)

  # Write residual histograph as a png.
  results_png_path <- file.path(opts$output_dir, 'residual.png')
  png(results_png_path)
  breaks <- pretty(res$residual, n = 200)
  histogram <- hist(res$residual, breaks, plot = FALSE)
  histogram$counts <- histogram$counts / sum(histogram$counts)  # convert the histogram to frequencies
  plot(histogram, main = "Histogram of the residual",
       xlab = sprintf("Residual (observed - explained, %d x %d values)", params$m, params$k))
  dev.off()

  res$metrics$total_elapsed_time <- proc.time()[['elapsed']]

  # Write summary as JSON (scalar values).
  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
  m <- toJSON(res$metrics)
  writeLines(m, con = metrics_json_path)
  Log("Wrote %s, %s, and %s", results_csv_path, results_png_path, metrics_json_path)

  # TODO:
  # - These are in an 2 column 'parameters' and 'values' format.  Should these
  # just be a plain list?
  # - Should any of these privacy params be in metrics.json?

  Log("Privacy summary:")
  print(res$privacy)
  cat("\n")

  Log('DONE')
}

if (!interactive()) {
  main(opts)
}