diff options
Diffstat (limited to 'analysis/R/simulation.R')
-rwxr-xr-x | analysis/R/simulation.R | 268 |
1 files changed, 268 insertions, 0 deletions
diff --git a/analysis/R/simulation.R b/analysis/R/simulation.R new file mode 100755 index 0000000..251c595 --- /dev/null +++ b/analysis/R/simulation.R @@ -0,0 +1,268 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# RAPPOR simulation library. Contains code for encoding simulated data and +# creating the map used to encode and decode reports. + +library(glmnet) +library(parallel) # mclapply + +SetOfStrings <- function(num_strings = 100) { + # Generates a set of strings for simulation purposes. + strs <- paste0("V_", as.character(1:num_strings)) + strs +} + +GetSampleProbs <- function(params) { + # Generate different underlying distributions for simulations purposes. + # Args: + # - params: a list describing the shape of the true distribution: + # c(num_strings, prop_nonzero_strings, decay_type, + # rate_exponetial). + nstrs <- params[[1]] + nonzero <- params[[2]] + decay <- params[[3]] + expo <- params[[4]] + background <- params[[5]] + + probs <- rep(0, nstrs) + ind <- floor(nstrs * nonzero) + if (decay == "Linear") { + probs[1:ind] <- (ind:1) / sum(1:ind) + } else if (decay == "Constant") { + probs[1:ind] <- 1 / ind + } else if (decay == "Exponential") { + temp <- seq(0, nonzero, length.out = ind) + temp <- exp(-temp * expo) + temp <- temp + background + temp <- temp / sum(temp) + probs[1:ind] <- temp + } else { + stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")') + } + probs +} + +EncodeAll <- function(x, cohorts, map, params, num_cores = 1) { + # Encodes the ground truth into RAPPOR reports. + # + # Args: + # x: Observed strings for each report, Nx1 vector + # cohort: Cohort assignment for each report, Nx1 vector + # map: list of matrices encoding locations of hashes for each + # string, for each cohort + # params: System parameters + # + # Returns: + # RAPPOR reports for each piece of data. + + p <- params$p + q <- params$q + f <- params$f + k <- params$k + + qstar <- (1 - f / 2) * q + (f / 2) * p + pstar <- (1 - f / 2) * p + (f / 2) * q + + candidates <- colnames(map[[1]]) + if (!all(x %in% candidates)) { + stop("Some strings are not in the map. set(X) - set(candidates): ", + paste(setdiff(unique(x), candidates), collapse=" "), "\n") + } + bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE, + USE.NAMES = FALSE) + reports <- mclapply(bfs, function(x) { + noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar)) + ind <- which(x) + noise[ind] <- sample(0:1, length(ind), replace = TRUE, + prob = c(1 - qstar, qstar)) + noise + }, mc.cores = num_cores) + + reports +} + +CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) { + # Creates a list of 0/1 matrices corresponding to mapping between the strs and + # Bloom filters for each instance of the RAPPOR. + # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4, + # the result could look this: + # [[1]] + # 1 0 0 0 + # 0 1 0 0 + # 0 0 0 1 + # [[2]] + # 0 1 0 0 + # 0 0 0 1 + # 0 0 1 0 + # + # Args: + # strs: a vector of strings + # params: a list of parameters in the following format: + # (k, h, m, p, q, f). + # generate_pos: Tells whether to generate an object storing the + # positions of the nonzeros in the matrix + # basic: Tells whether to use basic RAPPOR (only works if h=1). + + M <- length(strs) + map_by_cohort <- list() + k <- params$k + h <- params$h + m <- params$m + + for (i in 1:m) { + if (basic && (h == 1) && (k == M)) { + ones <- 1:M + } else { + ones <- sample(1:k, M * h, replace = TRUE) + } + cols <- rep(1:M, each = h) + map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M)) + colnames(map_by_cohort[[i]]) <- strs + } + + all_cohorts_map <- do.call("rBind", map_by_cohort) + if (generate_pos) { + map_pos <- t(apply(all_cohorts_map, 2, function(x) { + ind <- which(x == 1) + n <- length(ind) + if (n < h * m) { + ind <- c(ind, rep(NA, h * m - n)) + } + ind + })) + } else { + map_pos <- NULL + } + + list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map, + map_pos = map_pos) +} + +GetSample <- function(N, strs, probs) { + # Sample for the strs population with distribution probs. + sample(strs, N, replace = TRUE, prob = probs) +} + +GetTrueBits <- function(samp, map, params) { + # Convert sample generated by GetSample() to Bloom filters where mapping + # is defined in map. + # Output: + # - reports: a matrix of size [num_instances x size] where each row + # represents the number of times each bit in the Bloom filter + # was set for a particular instance. + # Note: reports[, 1] contains the same size for each instance. + + N <- length(samp) + k <- params$k + m <- params$m + strs <- colnames(map[[1]]) + reports <- matrix(0, m, k + 1) + inst <- sample(1:m, N, replace = TRUE) + for (i in 1:m) { + tab <- table(samp[inst == i]) + tab2 <- rep(0, length(strs)) + tab2[match(names(tab), strs)] <- tab + counts <- apply(map[[i]], 1, function(x) x * tab2) + # cat(length(tab2), dim(map[[i]]), dim(counts), "\n") + reports[i, ] <- c(sum(tab2), apply(counts, 2, sum)) + } + reports +} + +GetNoisyBits <- function(truth, params) { + # Applies RAPPOR to the Bloom filters. + # Args: + # - truth: a matrix generated by GetTrueBits(). + + k <- params$k + p <- params$p + q <- params$q + f <- params$f + + rappors <- apply(truth, 1, function(x) { + # The following samples considering 4 cases: + # 1. Signal and we lie on the bit. + # 2. Signal and we tell the truth. + # 3. Noise and we lie. + # 4. Noise and we tell the truth. + + # Lies when signal sampled from the binomial distribution. + lied_signal <- rbinom(k, x[-1], f) + + # Remaining must be the non-lying bits when signal. Sampled with q. + truth_signal <- x[-1] - lied_signal + + # Lies when there is no signal which happens x[1] - x[-1] times. + lied_nosignal <- rbinom(k, x[1] - x[-1], f) + + # Trtuh when there's no signal. These are sampled with p. + truth_nosignal <- x[1] - x[-1] - lied_nosignal + + # Total lies and sampling lies with 50/50 for either p or q. + lied <- lied_signal + lied_nosignal + lied_p <- rbinom(k, lied, .5) + lied_q <- lied - lied_p + + # Generating the report where sampling of either p or q occurs. + rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p) + }) + + cbind(truth[, 1], t(rappors)) +} + +GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05, + prop_missing = 0, + correction = "Bonferroni") { + # Simulate N reports with pop_params describing the population and + # params describing the RAPPOR configuration. + num_strings = pop_params[[1]] + + strs <- SetOfStrings(num_strings) + probs <- GetSampleProbs(pop_params) + samp <- GetSample(N, strs, probs) + map <- CreateMap(strs, params) + truth <- GetTrueBits(samp, map$map_by_cohort, params) + rappors <- GetNoisyBits(truth, params) + + strs_apprx <- strs + map_apprx <- map$all_cohorts_map + # Remove % of strings to simulate missing variables. + if (prop_missing > 0) { + ind <- which(probs > 0) + removed <- sample(ind, ceiling(prop_missing * length(ind))) + map_apprx <- map$all_cohorts_map[, -removed] + strs_apprx <- strs[-removed] + } + + # Randomize the columns. + ind <- sample(1:length(strs_apprx), length(strs_apprx)) + map_apprx <- map_apprx[, ind] + strs_apprx <- strs_apprx[ind] + + fit <- Decode(rappors, map_apprx, params, alpha = alpha, + correction = correction) + + # Add truth column. + fit$fit$Truth <- table(samp)[fit$fit$string] + fit$fit$Truth[is.na(fit$fit$Truth)] <- 0 + + fit$map <- map$map_by_cohort + fit$truth <- truth + fit$strs <- strs + fit$probs <- probs + + fit +} |