diff options
Diffstat (limited to 'analysis/R/decode.R')
-rwxr-xr-x | analysis/R/decode.R | 513 |
1 files changed, 513 insertions, 0 deletions
diff --git a/analysis/R/decode.R b/analysis/R/decode.R new file mode 100755 index 0000000..7d83f2b --- /dev/null +++ b/analysis/R/decode.R @@ -0,0 +1,513 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This library implements the RAPPOR marginal decoding algorithms using LASSO. + +library(glmnet) + +# So we don't have to change pwd +source.rappor <- function(rel_path) { + abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) + source(abs_path) +} + +source.rappor('analysis/R/alternative.R') + +EstimateBloomCounts <- function(params, obs_counts) { + # Estimates the number of times each bit in each cohort was set in original + # Bloom filters. + # + # Input: + # params: a list of RAPPOR parameters: + # k - size of a Bloom filter + # h - number of hash functions + # m - number of cohorts + # p - P(IRR = 1 | PRR = 0) + # q - P(IRR = 1 | PRR = 1) + # f - Proportion of bits in the Bloom filter that are set randomly + # to 0 or 1 regardless of the underlying true bit value + # obs_counts: a matrix of size m by (k + 1). Column one contains sample + # sizes for each cohort. Other counts indicated how many times + # each bit was set in each cohort. + # + # Output: + # ests: a matrix of size m by k with estimated counts for the probability + # of each bit set to 1 in the true Bloom filter. + # stds: standard deviation of the estimates. + + p <- params$p + q <- params$q + f <- params$f + m <- params$m + k <- params$k + + stopifnot(m == nrow(obs_counts), k + 1 == ncol(obs_counts)) + + p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1 + p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1 + + p2 <- p11 - p01 # == (1 - f) * (q - p) + + # When m = 1, obs_counts does not have the right dimensions. Fixing this. + dim(obs_counts) <- c(m, k + 1) + + ests <- apply(obs_counts, 1, function(cohort_row) { + N <- cohort_row[1] # sample size for the cohort -- first column is total + v <- cohort_row[-1] # counts for individual bits + (v - p01 * N) / p2 # unbiased estimator for individual bits' + # true counts. It can be negative or + # exceed the total. + }) + + # NOTE: When k == 1, rows of obs_counts have 2 entries. Then cohort_row[-1] + # is a singleton vector, and apply() returns a *vector*. When rows have 3 + # entries, cohort_row[-1] is a vector of length 2 and apply() returns a + # *matrix*. + # + # Fix this by explicitly setting dimensions. NOTE: It's k x m, not m x k. + dim(ests) <- c(k, m) + + total <- sum(obs_counts[,1]) + + variances <- apply(obs_counts, 1, function(cohort_row) { + N <- cohort_row[1] + v <- cohort_row[-1] + p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1 + p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1] + r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1 + N * r * (1 - r) / p2^2 # variance of the binomial + }) + + dim(variances) <- c(k, m) + + # Transform counts from absolute values to fractional, removing bias due to + # variability of reporting between cohorts. + ests <- apply(ests, 1, function(x) x / obs_counts[,1]) + stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1]) + + # Some estimates may be set to infinity, e.g. if f=1. We want to account for + # this possibility, and set the corresponding counts to 0. + ests[abs(ests) == Inf] <- 0 + + list(estimates = ests, stds = stds) +} + +FitLasso <- function(X, Y, intercept = TRUE) { + # Fits a Lasso model to select a subset of columns of X. + # + # Input: + # X: a design matrix of size km by M (the number of candidate strings). + # Y: a vector of size km with estimated counts from EstimateBloomCounts(). + # intercept: whether to fit with intercept or not. + # + # Output: + # a vector of size ncol(X) of coefficients. + + # TODO(mironov): Test cv.glmnet instead of glmnet + mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept, + lower.limits = 0, # outputs are non-negative + # Cap the number of non-zero coefficients to 500 or + # 80% of the length of Y, whichever is less. The 500 cap + # is for performance reasons, 80% is to avoid overfitting. + pmax = min(500, length(Y) * .8)), + silent = TRUE) + + # If fitting fails, return an empty data.frame. + if (class(mod)[1] == "try-error") { + coefs <- setNames(rep(0, ncol(X)), colnames(X)) + } else { + coefs <- coef(mod) + coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept + } + coefs +} + +PerformInference <- function(X, Y, N, mod, params, alpha, correction) { + m <- params$m + p <- params$p + q <- params$q + f <- params$f + h <- params$h + + q2 <- .5 * f * (p + q) + (1 - f) * q + p2 <- .5 * f * (p + q) + (1 - f) * p + resid_var <- p2 * (1 - p2) * (N / m) / (q2 - p2)^2 + + # Total Sum of Squares (SS). + TSS <- sum((Y - mean(Y))^2) + # Error Sum of Squares (ESS). + ESS <- resid_var * nrow(X) + + betas <- matrix(mod$coefs, ncol = 1) + +# mod_var <- summary(mod$fit)$sigma^2 +# betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas)) +# +# z_values <- betas / betas_sd +# +# # 1-sided t-test. +# p_values <- pnorm(z_values, lower = FALSE) + + fit <- data.frame(string = colnames(X), Estimate = betas, + SD = mod$stds, # z_stat = z_values, pvalue = p_values, + stringsAsFactors = FALSE) + +# if (correction == "FDR") { +# fit <- fit[order(fit$pvalue, decreasing = FALSE), ] +# ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit)) +# if (length(ind) > 0) { +# fit <- fit[1:max(ind), ] +# } else { +# fit <- fit[numeric(0), ] +# } +# } else { +# fit <- fit[fit$p < alpha, ] +# } + + fit <- fit[order(fit$Estimate, decreasing = TRUE), ] + + if (nrow(fit) > 0) { + str_names <- fit$string + str_names <- str_names[!is.na(str_names)] + if (length(str_names) > 0 && length(str_names) < nrow(X)) { + this_data <- as.data.frame(as.matrix(X[, str_names])) + Y_hat <- predict(lm(Y ~ ., data = this_data)) + RSS <- sum((Y_hat - mean(Y))^2) + } else { + RSS <- NA + } + } else { + RSS <- 0 + } + + USS <- TSS - ESS - RSS + SS <- c(RSS, USS, ESS) / TSS + + list(fit = fit, SS = SS, resid_sigma = sqrt(resid_var)) +} + +ComputePrivacyGuarantees <- function(params, alpha, N) { + # Compute privacy parameters and guarantees. + p <- params$p + q <- params$q + f <- params$f + h <- params$h + + q2 <- .5 * f * (p + q) + (1 - f) * q + p2 <- .5 * f * (p + q) + (1 - f) * p + + exp_e_one <- ((q2 * (1 - p2)) / (p2 * (1 - q2)))^h + if (exp_e_one < 1) { + exp_e_one <- 1 / exp_e_one + } + e_one <- log(exp_e_one) + + exp_e_inf <- ((1 - .5 * f) / (.5 * f))^(2 * h) + e_inf <- log(exp_e_inf) + + std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2) + detection_freq <- qnorm(1 - alpha) * std_dev_counts / N + + privacy_names <- c("Effective p", "Effective q", "exp(e_1)", + "e_1", "exp(e_inf)", "e_inf", "Detection frequency") + privacy_vals <- c(p2, q2, exp_e_one, e_one, exp_e_inf, e_inf, detection_freq) + + privacy <- data.frame(parameters = privacy_names, + values = privacy_vals) + privacy +} + +FitDistribution <- function(estimates_stds, map, quiet = FALSE) { + # Find a distribution over rows of map that approximates estimates_stds best + # + # Input: + # estimates_stds: a list of two m x k matrices, one for estimates, another + # for standard errors + # map : an (m * k) x S boolean matrix + # + # Output: + # a float vector of length S, so that a distribution over map's rows sampled + # according to this vector approximates estimates + + S <- ncol(map) # total number of candidates + + support_coefs <- 1:S + + if (S > length(estimates_stds$estimates) * .8) { + # the system is close to being underdetermined + lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates))) + + # Select non-zero coefficients. + support_coefs <- which(lasso > 0) + + if(!quiet) + cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n") + } + + coefs <- setNames(rep(0, S), colnames(map)) + + if(length(support_coefs) > 0) { # LASSO may return an empty list + constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE], + estimates_stds) + + coefs[support_coefs] <- constrained_coefs + } + + coefs +} + +Resample <- function(e) { + # Simulate resampling of the Bloom filter estimates by adding Gaussian noise + # with estimated standard deviation. + estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y), + e$estimates, e$stds), + nrow = nrow(e$estimates), ncol = ncol(e$estimates)) + stds <- e$stds * 2^.5 + + list(estimates = estimates, stds = stds) +} + +# Private function +# Decode for Boolean RAPPOR inputs +# Returns a list with attribute fit only. (Inference and other aspects +# currently not incorporated because they're unnecessary for association.) +.DecodeBoolean <- function(counts, params, num_reports) { + # Boolean variables are reported without cohorts and to estimate counts, + # first sum up counts across all cohorts and then run EstimateBloomCounts + # with the number of cohorts set to 1. + params$m <- 1 # set number of cohorts to 1 + summed_counts <- colSums(counts) # sum counts across cohorts + es <- EstimateBloomCounts(params, summed_counts) # estimate boolean counts + + ests <- es$estimates[[1]] + std <- es$stds[[1]] + + fit <- data.frame( + string = c("TRUE", "FALSE"), + estimate = c(ests * num_reports, + num_reports - ests * num_reports), + std_error = c(std * num_reports, std * num_reports), + proportion = c(ests, 1 - ests), + prop_std_error = c(std, std)) + + low_95 <- fit$proportion - 1.96 * fit$prop_std_error + high_95 <- fit$proportion + 1.96 * fit$prop_std_error + + fit$prop_low_95 <- pmax(low_95, 0.0) + fit$prop_high_95 <- pmin(high_95, 1.0) + rownames(fit) <- fit$string + + return(list(fit = fit)) +} + +CheckDecodeInputs <- function(counts, map, params) { + # Returns an error message, or NULL if there is no error. + + if (nrow(map) != (params$m * params$k)) { + return(sprintf( + "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d", + params$m * params$k, nrow(map))) + } + + if ((ncol(counts) - 1) != params$k) { + return(sprintf(paste0( + "Dimensions of counts file do not match: m = %d, k = %d, ", + "nrow(counts) = %d, ncol(counts) = %d"), params$m, params$k, + nrow(counts), ncol(counts))) + + } + + # numerically correct comparison + if (isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) { + return("Information is lost. Cannot decode.") + } + + return(NULL) # no error +} + +Decode <- function(counts, map, params, alpha = 0.05, + correction = c("Bonferroni"), quiet = FALSE, ...) { + + error_msg <- CheckDecodeInputs(counts, map, params) + if (!is.null(error_msg)) { + stop(error_msg) + } + + k <- params$k + p <- params$p + q <- params$q + f <- params$f + h <- params$h + m <- params$m + + S <- ncol(map) # total number of candidates + + N <- sum(counts[, 1]) + if (k == 1) { + return(.DecodeBoolean(counts, params, N)) + } + + filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports + + # stretch cohorts to bits + filter_bits <- as.vector(matrix(1:nrow(map), ncol = m)[,filter_cohorts, drop = FALSE]) + + map_filtered <- map[filter_bits, , drop = FALSE] + + es <- EstimateBloomCounts(params, counts) + + estimates_stds_filtered <- + list(estimates = es$estimates[filter_cohorts, , drop = FALSE], + stds = es$stds[filter_cohorts, , drop = FALSE]) + + coefs_all <- vector() + + # Run the fitting procedure several times (5 seems to be sufficient and not + # too many) to estimate standard deviation of the output. + for(r in 1:5) { + if(r > 1) + e <- Resample(estimates_stds_filtered) + else + e <- estimates_stds_filtered + + coefs_all <- rbind(coefs_all, + FitDistribution(e, map_filtered, quiet)) + } + + coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations + coefs_ave <- N * apply(coefs_all, 2, mean) + + # Only select coefficients more than two standard deviations from 0. May + # inflate empirical SD of the estimates. + reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd) + + mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported]) + + coefs_ave_zeroed <- coefs_ave + coefs_ave_zeroed[-reported] <- 0 + + residual <- as.vector(t(estimates_stds_filtered$estimates)) - + map_filtered %*% coefs_ave_zeroed / N + + if (correction == "Bonferroni") { + alpha <- alpha / S + } + + inf <- PerformInference(map_filtered[, reported, drop = FALSE], + as.vector(t(estimates_stds_filtered$estimates)), + N, mod, params, alpha, + correction) + fit <- inf$fit + # If this is a basic RAPPOR instance, just use the counts for the estimate + # (Check if the map is diagonal to tell if this is basic RAPPOR.) + if (sum(map) == sum(diag(map))) { + fit$Estimate <- colSums(counts)[-1] + } + + # Estimates from the model are per instance so must be multipled by h. + # Standard errors are also adjusted. + fit$estimate <- floor(fit$Estimate) + fit$proportion <- fit$estimate / N + + fit$std_error <- floor(fit$SD) + fit$prop_std_error <- fit$std_error / N + + # 1.96 standard deviations gives 95% confidence interval. + low_95 <- fit$proportion - 1.96 * fit$prop_std_error + high_95 <- fit$proportion + 1.96 * fit$prop_std_error + # Clamp estimated proportion. pmin/max: vectorized min and max + fit$prop_low_95 <- pmax(low_95, 0.0) + fit$prop_high_95 <- pmin(high_95, 1.0) + + fit <- fit[, c("string", "estimate", "std_error", "proportion", + "prop_std_error", "prop_low_95", "prop_high_95")] + + allocated_mass <- sum(fit$proportion) + num_detected <- nrow(fit) + + ss <- round(inf$SS, digits = 3) + explained_var <- ss[[1]] + missing_var <- ss[[2]] + noise_var <- ss[[3]] + + noise_std_dev <- round(inf$resid_sigma, digits = 3) + + # Compute summary of the fit. + parameters <- + c("Candidate strings", "Detected strings", + "Sample size (N)", "Discovered Prop (out of N)", + "Explained Variance", "Missing Variance", "Noise Variance", + "Theoretical Noise Std. Dev.") + values <- c(S, num_detected, N, allocated_mass, + explained_var, missing_var, noise_var, noise_std_dev) + + res_summary <- data.frame(parameters = parameters, values = values) + + privacy <- ComputePrivacyGuarantees(params, alpha, N) + params <- data.frame(parameters = + c("k", "h", "m", "p", "q", "f", "N", "alpha"), + values = c(k, h, m, p, q, f, N, alpha)) + + # This is a list of decode stats in a better format than 'summary'. + metrics <- list(sample_size = N, + allocated_mass = allocated_mass, + num_detected = num_detected, + explained_var = explained_var, + missing_var = missing_var) + + list(fit = fit, summary = res_summary, privacy = privacy, params = params, + lasso = NULL, residual = as.vector(residual), + counts = counts[, -1], resid = NULL, metrics = metrics, + ests = es$estimates # ests needed by Shiny rappor-sim app + ) +} + +ComputeCounts <- function(reports, cohorts, params) { + # Counts the number of times each bit in the Bloom filters was set for + # each cohort. + # + # Args: + # reports: A list of N elements, each containing the + # report for a given report + # cohorts: A list of N elements, each containing the + # cohort number for a given report + # params: A list of parameters for the problem + # + # Returns: + # An mx(k+1) array containing the number of times each bit was set + # in each cohort. + + # Check that the cohorts are evenly assigned. We assume that if there + # are m cohorts, each cohort should have approximately N/m reports. + # The constraint we impose here simply says that cohort bins should + # each have within N/m reports of one another. Since the most popular + # cohort is expected to have about O(logN/loglogN) reports (which we ) + # approximate as O(logN) bins for practical values of N, a discrepancy of + # O(N) bins seems significant enough to alter expected behavior. This + # threshold can be changed to be more sensitive if desired. + N <- length(reports) + cohort_freqs <- table(factor(cohorts, levels = 1:params$m)) + imbalance_threshold <- N / params$m + if ((max(cohort_freqs) - min(cohort_freqs)) > imbalance_threshold) { + cat("\nNote: You are using unbalanced cohort assignments, which can", + "significantly degrade estimation quality!\n\n") + } + + # Count the times each bit was set, and add cohort counts to first column + counts <- lapply(1:params$m, function(i) + Reduce("+", reports[which(cohorts == i)])) + counts[which(cohort_freqs == 0)] <- data.frame(rep(0, params$k)) + cbind(cohort_freqs, do.call("rbind", counts)) +} |