aboutsummaryrefslogtreecommitdiff
path: root/analysis/R/simulation.R
diff options
context:
space:
mode:
Diffstat (limited to 'analysis/R/simulation.R')
-rwxr-xr-xanalysis/R/simulation.R268
1 files changed, 268 insertions, 0 deletions
diff --git a/analysis/R/simulation.R b/analysis/R/simulation.R
new file mode 100755
index 0000000..251c595
--- /dev/null
+++ b/analysis/R/simulation.R
@@ -0,0 +1,268 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# RAPPOR simulation library. Contains code for encoding simulated data and
+# creating the map used to encode and decode reports.
+
+library(glmnet)
+library(parallel) # mclapply
+
+SetOfStrings <- function(num_strings = 100) {
+ # Generates a set of strings for simulation purposes.
+ strs <- paste0("V_", as.character(1:num_strings))
+ strs
+}
+
+GetSampleProbs <- function(params) {
+ # Generate different underlying distributions for simulations purposes.
+ # Args:
+ # - params: a list describing the shape of the true distribution:
+ # c(num_strings, prop_nonzero_strings, decay_type,
+ # rate_exponetial).
+ nstrs <- params[[1]]
+ nonzero <- params[[2]]
+ decay <- params[[3]]
+ expo <- params[[4]]
+ background <- params[[5]]
+
+ probs <- rep(0, nstrs)
+ ind <- floor(nstrs * nonzero)
+ if (decay == "Linear") {
+ probs[1:ind] <- (ind:1) / sum(1:ind)
+ } else if (decay == "Constant") {
+ probs[1:ind] <- 1 / ind
+ } else if (decay == "Exponential") {
+ temp <- seq(0, nonzero, length.out = ind)
+ temp <- exp(-temp * expo)
+ temp <- temp + background
+ temp <- temp / sum(temp)
+ probs[1:ind] <- temp
+ } else {
+ stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")')
+ }
+ probs
+}
+
+EncodeAll <- function(x, cohorts, map, params, num_cores = 1) {
+ # Encodes the ground truth into RAPPOR reports.
+ #
+ # Args:
+ # x: Observed strings for each report, Nx1 vector
+ # cohort: Cohort assignment for each report, Nx1 vector
+ # map: list of matrices encoding locations of hashes for each
+ # string, for each cohort
+ # params: System parameters
+ #
+ # Returns:
+ # RAPPOR reports for each piece of data.
+
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ k <- params$k
+
+ qstar <- (1 - f / 2) * q + (f / 2) * p
+ pstar <- (1 - f / 2) * p + (f / 2) * q
+
+ candidates <- colnames(map[[1]])
+ if (!all(x %in% candidates)) {
+ stop("Some strings are not in the map. set(X) - set(candidates): ",
+ paste(setdiff(unique(x), candidates), collapse=" "), "\n")
+ }
+ bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE,
+ USE.NAMES = FALSE)
+ reports <- mclapply(bfs, function(x) {
+ noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar))
+ ind <- which(x)
+ noise[ind] <- sample(0:1, length(ind), replace = TRUE,
+ prob = c(1 - qstar, qstar))
+ noise
+ }, mc.cores = num_cores)
+
+ reports
+}
+
+CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) {
+ # Creates a list of 0/1 matrices corresponding to mapping between the strs and
+ # Bloom filters for each instance of the RAPPOR.
+ # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4,
+ # the result could look this:
+ # [[1]]
+ # 1 0 0 0
+ # 0 1 0 0
+ # 0 0 0 1
+ # [[2]]
+ # 0 1 0 0
+ # 0 0 0 1
+ # 0 0 1 0
+ #
+ # Args:
+ # strs: a vector of strings
+ # params: a list of parameters in the following format:
+ # (k, h, m, p, q, f).
+ # generate_pos: Tells whether to generate an object storing the
+ # positions of the nonzeros in the matrix
+ # basic: Tells whether to use basic RAPPOR (only works if h=1).
+
+ M <- length(strs)
+ map_by_cohort <- list()
+ k <- params$k
+ h <- params$h
+ m <- params$m
+
+ for (i in 1:m) {
+ if (basic && (h == 1) && (k == M)) {
+ ones <- 1:M
+ } else {
+ ones <- sample(1:k, M * h, replace = TRUE)
+ }
+ cols <- rep(1:M, each = h)
+ map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M))
+ colnames(map_by_cohort[[i]]) <- strs
+ }
+
+ all_cohorts_map <- do.call("rBind", map_by_cohort)
+ if (generate_pos) {
+ map_pos <- t(apply(all_cohorts_map, 2, function(x) {
+ ind <- which(x == 1)
+ n <- length(ind)
+ if (n < h * m) {
+ ind <- c(ind, rep(NA, h * m - n))
+ }
+ ind
+ }))
+ } else {
+ map_pos <- NULL
+ }
+
+ list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map,
+ map_pos = map_pos)
+}
+
+GetSample <- function(N, strs, probs) {
+ # Sample for the strs population with distribution probs.
+ sample(strs, N, replace = TRUE, prob = probs)
+}
+
+GetTrueBits <- function(samp, map, params) {
+ # Convert sample generated by GetSample() to Bloom filters where mapping
+ # is defined in map.
+ # Output:
+ # - reports: a matrix of size [num_instances x size] where each row
+ # represents the number of times each bit in the Bloom filter
+ # was set for a particular instance.
+ # Note: reports[, 1] contains the same size for each instance.
+
+ N <- length(samp)
+ k <- params$k
+ m <- params$m
+ strs <- colnames(map[[1]])
+ reports <- matrix(0, m, k + 1)
+ inst <- sample(1:m, N, replace = TRUE)
+ for (i in 1:m) {
+ tab <- table(samp[inst == i])
+ tab2 <- rep(0, length(strs))
+ tab2[match(names(tab), strs)] <- tab
+ counts <- apply(map[[i]], 1, function(x) x * tab2)
+ # cat(length(tab2), dim(map[[i]]), dim(counts), "\n")
+ reports[i, ] <- c(sum(tab2), apply(counts, 2, sum))
+ }
+ reports
+}
+
+GetNoisyBits <- function(truth, params) {
+ # Applies RAPPOR to the Bloom filters.
+ # Args:
+ # - truth: a matrix generated by GetTrueBits().
+
+ k <- params$k
+ p <- params$p
+ q <- params$q
+ f <- params$f
+
+ rappors <- apply(truth, 1, function(x) {
+ # The following samples considering 4 cases:
+ # 1. Signal and we lie on the bit.
+ # 2. Signal and we tell the truth.
+ # 3. Noise and we lie.
+ # 4. Noise and we tell the truth.
+
+ # Lies when signal sampled from the binomial distribution.
+ lied_signal <- rbinom(k, x[-1], f)
+
+ # Remaining must be the non-lying bits when signal. Sampled with q.
+ truth_signal <- x[-1] - lied_signal
+
+ # Lies when there is no signal which happens x[1] - x[-1] times.
+ lied_nosignal <- rbinom(k, x[1] - x[-1], f)
+
+ # Trtuh when there's no signal. These are sampled with p.
+ truth_nosignal <- x[1] - x[-1] - lied_nosignal
+
+ # Total lies and sampling lies with 50/50 for either p or q.
+ lied <- lied_signal + lied_nosignal
+ lied_p <- rbinom(k, lied, .5)
+ lied_q <- lied - lied_p
+
+ # Generating the report where sampling of either p or q occurs.
+ rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p)
+ })
+
+ cbind(truth[, 1], t(rappors))
+}
+
+GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05,
+ prop_missing = 0,
+ correction = "Bonferroni") {
+ # Simulate N reports with pop_params describing the population and
+ # params describing the RAPPOR configuration.
+ num_strings = pop_params[[1]]
+
+ strs <- SetOfStrings(num_strings)
+ probs <- GetSampleProbs(pop_params)
+ samp <- GetSample(N, strs, probs)
+ map <- CreateMap(strs, params)
+ truth <- GetTrueBits(samp, map$map_by_cohort, params)
+ rappors <- GetNoisyBits(truth, params)
+
+ strs_apprx <- strs
+ map_apprx <- map$all_cohorts_map
+ # Remove % of strings to simulate missing variables.
+ if (prop_missing > 0) {
+ ind <- which(probs > 0)
+ removed <- sample(ind, ceiling(prop_missing * length(ind)))
+ map_apprx <- map$all_cohorts_map[, -removed]
+ strs_apprx <- strs[-removed]
+ }
+
+ # Randomize the columns.
+ ind <- sample(1:length(strs_apprx), length(strs_apprx))
+ map_apprx <- map_apprx[, ind]
+ strs_apprx <- strs_apprx[ind]
+
+ fit <- Decode(rappors, map_apprx, params, alpha = alpha,
+ correction = correction)
+
+ # Add truth column.
+ fit$fit$Truth <- table(samp)[fit$fit$string]
+ fit$fit$Truth[is.na(fit$fit$Truth)] <- 0
+
+ fit$map <- map$map_by_cohort
+ fit$truth <- truth
+ fit$strs <- strs
+ fit$probs <- probs
+
+ fit
+}