1 files changed, 377 insertions, 0 deletions
diff --git a/analysis/R/decode_ngrams.R b/analysis/R/decode_ngrams.R
new file mode 100755
index 0000000..e2585cb
--- /dev/null
+++ b/analysis/R/decode_ngrams.R
@@ -0,0 +1,377 @@
+# Copyright 2014 Google Inc. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This file has functions that aid in the estimation of a distribution when the
+#     dictionary is unknown. There are functions for estimating pairwise joint
+#     ngram distributions, pruning out false positives, and combining the two
+#     steps.
+
+FindPairwiseCandidates <- function(report_data, N, ngram_params, params) {
+  # Finds the pairwise most likely ngrams.
+  #
+  # Args:
+  #   report_data: Object containing data relevant to reports:
+  #       $inds: The indices of reports collected using various pairs
+  #       $cohorts: The cohort of each report
+  #       $map: The map used for all the ngrams
+  #       $reports: The reports used for each ngram and full string
+  #   N: Number of reports collected
+  #   ngram_params: Parameters related to ngram size
+  #   params: Parameter list.
+  #
+  # Returns:
+  #   List: list of matrices, list of pairwise distributions.
+
+  inds <- report_data$inds
+  cohorts <- report_data$cohorts
+  num_ngrams_collected <- ngram_params$num_ngrams_collected
+  map <- report_data$map
+  reports <- report_data$reports
+
+  # Cycle over all the unique pairs of ngrams being collected
+  found_candidates <- list()
+
+  # Generate the map list to be used for all ngrams
+  maps <- lapply(1:num_ngrams_collected, function(x) map)
+  num_candidate_ngrams <- length(inds)
+
+  .ComputeDist <- function(i, inds, cohorts, reports, maps, params,
+                           num_ngrams_collected) {
+    library(glmnet)
+    ind <- inds[[i]]
+    cohort_subset <- lapply(1:num_ngrams_collected, function(x)
+                            cohorts[ind])
+    report_subset <- reports[[i]]
+    new_dist <- ComputeDistributionEM(report_subset,
+                                      cohort_subset,
+                                      maps, ignore_other = FALSE,
+                                      params = params, estimate_var = FALSE)
+    new_dist
+  }
+
+  # Compute the pairwise distributions (could be parallelized)
+  dists <- lapply(seq(num_candidate_ngrams), function(i)
+                  .ComputeDist(i, inds, cohorts, reports, maps,
+                               params, num_ngrams_collected))
+
+  dists_null <- sapply(dists, function(x) is.null(x))
+  if (any(dists_null)) {
+    return (list(found_candidates = list(), dists = dists))
+  }
+  cat("Found the pairwise ngram distributions.\n")
+
+  # Find the threshold for choosing "significant" ngram pairs
+  f <- params$f; q <- params$q; p <- params$p
+  q2 <- .5 * f * (p + q) + (1 - f) * q
+  p2 <- .5 * f * (p + q) + (1 - f) * p
+  std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
+  (threshold <- std_dev_counts / N)
+  threshold <- 0.04
+
+  # Filter joints to remove infrequently co-occurring ngrams.
+  candidate_strs <- lapply(1:num_candidate_ngrams, function(i) {
+    fit <- dists[[i]]$fit
+    edges <- which(fit > threshold, arr.ind = TRUE, FALSE)
+
+    # Recover the list of strings that seem significant
+    found_candidates <- sapply(1:ncol(edges), function(x) {
+      chunks <- sapply(edges[, x],
+                       function(j) dimnames(fit)[[x]][j])
+      chunks
+    })
+    # sapply returns either "character" vector (for n=1) or a matrix.  Convert
+    # it to a matrix.  This can be seen as follows:
+    #
+    # > class(sapply(1:5, function(x) "a"))
+    # [1] "character"
+    # > class(sapply(1:5, function(x) c("a", "b")))
+    # [1] "matrix"
+    found_candidates <- rbind(found_candidates)
+
+    # Remove the "others"
+    others <- which(found_candidates == "Other")
+    if (length(others) > 0) {
+      other <- which(found_candidates == "Other", arr.ind = TRUE)[, 1]
+      # drop = FALSE necessary to keep it a matrix
+      found_candidates <- found_candidates[-other, , drop = FALSE]
+    }
+
+    found_candidates
+  })
+  if (any(lapply(found_candidates, function(x) length(x)) == 0)) {
+    return (NULL)
+  }
+
+  list(candidate_strs = candidate_strs, dists = dists)
+}
+
+FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams,
+                                ngram_size) {
+  # Uses the list of strings found by the pairwise comparisons to build
+  #     a list of full feasible strings. This relies on the iterative,
+  #     graph-based approach.
+  #
+  # Args:
+  #   found_candidates: list of candidates found by each pairwise decoding
+  #   pairings: Matrix of size 2x(num_ngrams choose 2) listing all the
+  #       ngram position pairings.
+  #   num_ngrams: The total number of ngrams per word.
+  #   ngram_size: Number of characters per ngram
+  #
+  # Returns:
+  #   List of full string candidates.
+
+  # Which ngram pairs are adjacent, i.e. of the form (i,i+1)
+  adjacent <- sapply(seq(num_ngrams - 1), function(x) {
+    c(1 + (x - 1) * ngram_size, x * ngram_size + 1)
+  })
+
+  adjacent_pairs <- apply(adjacent, 2, function(x) {
+    which(apply(pairings, 1, function(y) identical(y, x)))
+  })
+
+  # The first set of candidates are ngrams found in positions 1 and 2
+  active_cands <- found_candidates[[adjacent_pairs[1]]]
+  if (class(active_cands) == "list") {
+    return (list())
+  } else {
+    active_cands <- as.data.frame(active_cands)
+  }
+
+  # Now check successive ngrams to find consistent combinations
+  #     i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc.
+  for (i in 2:length(adjacent_pairs)) {
+    if (nrow(active_cands) == 0) {
+      return (list())
+    }
+    new_cands <- found_candidates[[adjacent_pairs[i]]]
+    new_cands <- as.data.frame(new_cands)
+    # Builds the set of possible candidates based only on ascending
+    #     candidate pairs
+    active_cands <- BuildCandidates(active_cands, new_cands)
+  }
+
+  if (nrow(active_cands) == 0) {
+    return (list())
+  }
+  # Now refine these candidates using non-adjacent bigrams
+  remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)]
+  # For each non-adjacent pair, make sure that all the candidates are
+  #     consistent (in this phase, candidates can ONLY be eliminated)
+
+  for (i in remaining) {
+    new_cands <- found_candidates[[i]]
+    new_cands <- as.data.frame(new_cands)
+    # Prune out all candidates that do not agree with new_cands
+    active_cands <- PruneCandidates(active_cands, pairings[i, ],
+                                    ngram_size,
+                                    new_cands = new_cands)
+  }
+  # Consolidate the string ngrams into a full string representation
+  if (length(active_cands) > 0) {
+    active_cands <- sort(apply(active_cands, 1,
+                               function(x) paste0(x, collapse = "")))
+  }
+  unname(active_cands)
+}
+
+BuildCandidates <- function(active_cands, new_cands) {
+  # Takes in a data frame where each row is a valid sequence of ngrams
+  #     checks which of the new_cands ngram pairs are consistent with
+  #     the original active_cands ngram sequence.
+  #
+  # Args:
+  #   active_cands: data frame of ngram sequence candidates (1 candidate
+  #       sequence per row)
+  #   new_cands: An rx2 data frame with a new list of candidate ngram
+  #       pairs that might fit in with the previous list of candidates
+  #
+  # Returns:
+  #   Updated active_cands, with another column if valid extensions are
+  #       found.
+
+  # Get the trailing ngrams from the current candidates
+  to_check <- as.vector(tail(t(active_cands), n = 1))
+  # Check which of the elements in to_check are leading ngrams among the
+  #     new candidates
+  present <- sapply(to_check, function(x) any(x == new_cands))
+  # Remove the strings that are not represented among the new candidates
+  to_check <- to_check[present]
+  # Now insert the new candidates where they belong
+  active_cands <- active_cands[present, , drop = FALSE]
+  active_cands <- cbind(active_cands, col = NA)
+  num_cands <- nrow(active_cands)
+  hit_list <- c()
+  for (j in 1:num_cands) {
+    inds <- which(new_cands[, 1] == to_check[j])
+    if (length(inds) == 0) {
+      hit_list <- c(hit_list, j)
+      next
+    }
+    # If there are multiple candidates fitting with an ngram, include
+    #     each /full/ string as a candidate
+    extra <- length(inds) - 1
+    if (extra > 0) {
+      rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra))
+      to_paste <- active_cands[j, ]
+      # Add the new candidates to the bottom
+      for (p in 1:extra) {
+        active_cands <- rbind(active_cands, to_paste)
+      }
+    } else {
+      rep_inds <- c(j)
+    }
+    active_cands[rep_inds, ncol(active_cands)] <-
+        as.vector(new_cands[inds, 2])
+    new_num_cands <- nrow(active_cands)
+  }
+  # If there were some false candidates in the original set, remove them
+  if (length(hit_list) > 0) {
+    active_cands <- active_cands[-hit_list, , drop = FALSE]
+  }
+  active_cands
+}
+
+PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) {
+  # Takes in a data frame where each row is a valid sequence of ngrams
+  #     checks which of the new_cands ngram pairs are consistent with
+  #     the original active_cands ngram sequence. This can ONLY remove
+  #     candidates presented in active_cands.
+  #
+  # Args:
+  #   active_cands: data frame of ngram sequence candidates (1 candidate
+  #       sequence per row)
+  #   pairing: A length-2 list storing which two ngrams are measured
+  #   ngram_size: Number of characters per ngram
+  #   new_cands: An rx2 data frame with a new list of candidate ngram
+  #       pairs that might fit in with the previous list of candidates
+  #
+  # Returns:
+  #   Updated active_cands, with a reduced number of rows.
+
+  # Convert the pairing to an ngram index
+  cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1)
+
+  cands_to_check <- active_cands[, cols, drop = FALSE]
+  # Find the candidates that are inconsistent with the new data
+  hit_list <- sapply(1:nrow(cands_to_check), function(j) {
+    to_kill <- FALSE
+    if (nrow(new_cands) == 0) {
+      return (TRUE)
+    }
+    if (!any(apply(new_cands, 1, function(x)
+                   all(cands_to_check[j, , drop = FALSE] == x)))) {
+      to_kill <- TRUE
+    }
+    to_kill
+  })
+
+  # Determine which rows are false positives
+  hit_indices <- which(hit_list)
+  # Remove the false positives
+  if (length(hit_indices) > 0) {
+    active_cands <- active_cands[-hit_indices, ]
+  }
+  active_cands
+}
+
+EstimateDictionary <- function(report_data, N, ngram_params, params) {
+  # Takes in a list of report data and returns a list of string
+  #     estimates of the dictionary.
+  #
+  # Args:
+  #     report_data: Object containing data relevant to reports:
+  #         $inds: The indices of reports collected using various pairs
+  #         $cohorts: The cohort of each report
+  #         $map: THe map used for all the ngrams
+  #         $reports: The reports used for each ngram and full string
+  #   N: the number of individuals sending reports
+  #   ngram_params: Parameters related to ngram length, etc
+  #   params: Parameter vector with RAPPOR noise levels, cohorts, etc
+  #
+  # Returns:
+  #   List: list of found candidates, list of pairwise candidates
+
+  pairwise_candidates <- FindPairwiseCandidates(report_data, N,
+                                                ngram_params,
+                                                params)$candidate_strs
+  cat("Found the pairwise candidates. \n")
+  if (is.null(pairwise_candidates)) {
+    return (list())
+  }
+  found_candidates <- FindFeasibleStrings(pairwise_candidates,
+                                          report_data$pairings,
+                                          ngram_params$num_ngrams,
+                                          ngram_params$ngram_size)
+  cat("Found all the candidates. \n")
+  list(found_candidates = found_candidates,
+       pairwise_candidates = pairwise_candidates)
+}
+
+WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams,
+                               ngram_size) {
+  # Args:
+  #  conn: R connection to write to.  Should be opened with mode w+.
+  #  pairwise_candidates: list of matrices.  Each matrix represents a subgraph;
+  #    it contains the edges between partitions i and j, so there are (k choose
+  #    2) matrices.  Each matrix has dimension 2 x E, where E is the number of
+  #    edges.
+  #  pairings: 2 x (k choose 2) matrix of character positions.  Each row
+  #    corresponds to a subgraph; it has 1-based character index of partitions
+  #    i and j.
+  #  num_ngrams: length of pairwise_candidates, or the number of partitions in
+  #    the k-partite graph
+
+  # File Format:
+  #
+  # num_partitions 3
+  # ngram_size 2
+  # 0.ab 1.cd
+  # 0.ab 2.ef
+  #
+  # The first line specifies the number of partitions (k).
+  # The remaining lines are edges, where each node is <partition>.<bigram>.
+  #
+  # Partitions are numbered from 0.  The partition of the left node will be
+  # less than the partition of the right node.
+
+  # First two lines are metadata
+  cat(sprintf('num_partitions %d\n', num_ngrams), file = conn)
+  cat(sprintf('ngram_size %d\n', ngram_size), file = conn)
+
+  for (i in 1:length(pairwise_candidates)) {
+    # The two pairwise_candidates for this subgraph.
+    # Turn 1-based character positions into 0-based partition numbers,
+    # e.g. (3, 5) -> (1, 2)
+
+    pos1 <- pairings[[i, 1]]
+    pos2 <- pairings[[i, 2]]
+    part1 <- (pos1 - 1) / ngram_size
+    part2 <- (pos2 - 1) / ngram_size
+    cat(sprintf("Writing partition (%d, %d)\n", part1, part2))
+
+    p <- pairwise_candidates[[i]]
+    # each row is an edge
+    for (j in 1:nrow(p)) {
+      n1 <- p[[j, 1]]
+      n2 <- p[[j, 2]]
+      line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2)
+      # NOTE: It would be faster to preallocate 'lines', but we would have to
+      # make a two passes through pairwise_candidates.
+      cat(line, file = conn)
+    }
+  }
+}
+