aboutsummaryrefslogtreecommitdiff
path: root/analysis/R/unknowns_test.R
diff options
context:
space:
mode:
Diffstat (limited to 'analysis/R/unknowns_test.R')
-rwxr-xr-xanalysis/R/unknowns_test.R139
1 files changed, 139 insertions, 0 deletions
diff --git a/analysis/R/unknowns_test.R b/analysis/R/unknowns_test.R
new file mode 100755
index 0000000..5efd738
--- /dev/null
+++ b/analysis/R/unknowns_test.R
@@ -0,0 +1,139 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: fanti@google.com (Giulia Fanti)
+#
+# Tests the unknown unknowns dictionary estimation functions.
+# There are two main components involved in estimating this unknown
+# distribution:
+# a) Find the pairwise ngrams that co-occur often.
+# b) Determine which full strings are consistent with all pairwise
+# relations.
+#
+# TestEstimateDictionary() tests the full pipeline, including parts (a)
+# and (b).
+# TestFindFeasibleStrings() tests only part (b).
+# Both tests generate their own data.
+
+library(parallel)
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/association.R")
+source("analysis/R/decode_ngrams.R")
+source("analysis/R/ngrams_simulation.R")
+alphabet <- letters  # read by GeneratePopulation; passed to SimulateNGrams
+options(warn = -1)  # silences all warnings; never restored in this file
+
+GeneratePopulation <- function(N, num_strs, str_len = 10,
+                               distribution = NULL) {
+  # Draws one string per individual, uniformly at random, from a fixed
+  # dictionary: string i is the i-th run of str_len consecutive entries
+  # of the global `alphabet`.
+  #
+  # Args:
+  #   N: Number of individuals in the population
+  #   num_strs: Number of distinct strings in the dictionary (>= 1)
+  #   str_len: Length of each string
+  #   distribution: Unused; kept for compatibility with the original
+  #                 GeneratePopulation function in ngrams_simulation.R
+  #
+  # Returns:
+  #   Character vector of length N, one string per individual
+
+  # Indexing past the end of the alphabet would paste "NA" into strings.
+  stopifnot(num_strs >= 1, num_strs * str_len <= length(alphabet))
+  strs <- vapply(seq_len(num_strs), function(i) {
+    paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
+  }, character(1))
+  sample(strs, N, replace = TRUE, prob = rep(1 / num_strs, num_strs))
+}
+
+TestEstimateDictionary <- function() {
+  # End-to-end check of the dictionary estimation pipeline: simulate a
+  # single-string population without noise, run EstimateDictionary() on
+  # the simulation output, and verify the true string set is recovered.
+
+  # Only 2 n-grams are measured (num_ngrams_collected).
+  pop_size <- 100
+  str_len <- 6
+  ngram_size <- 2
+  num_strs <- 1
+  ngram_cfg <- list(ngram_size = ngram_size,
+                    num_ngrams = str_len / ngram_size,
+                    num_ngrams_collected = 2)
+
+  sim_params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
+
+  sim <- SimulateNGrams(pop_size, ngram_cfg, str_len, num_strs = num_strs,
+                        alphabet, sim_params, distribution = 3)
+  res <- EstimateDictionary(sim, pop_size, ngram_cfg, sim_params)
+
+  # The estimate must match the sorted set of distinct true strings.
+  expected <- sort(unique(sim$strs))
+  if (num_strs == 1) {
+    checkTrue(res$found_candidates == expected)
+  } else {
+    checkTrue(all.equal(res$found_candidates, expected))
+  }
+}
+
+TestFindFeasibleStrings <- function() {
+  # Tests that FindFeasibleStrings weeds out false positives: a bogus
+  # n-gram pair is appended to the pairwise candidates, and the feasible
+  # strings must still equal the true string set.
+  #
+  # Returns:
+  #   FALSE if no pairwise candidates were found; otherwise the value of
+  #   the final checkTrue() call.
+  N <- 100
+  str_len <- 6
+  ngram_size <- 2
+  num_ngrams <- str_len / ngram_size
+  num_strs <- 2
+
+  params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
+  ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
+                       num_ngrams_collected = 2)
+  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
+                        alphabet, params)
+  pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
+                                                params)$candidate_strs
+  cat("Found the pairwise candidates. \n")
+
+  # Check for NULL before modifying the candidates: the assignment below
+  # would turn NULL into a list and defeat this check.
+  if (is.null(pairwise_candidates)) {
+    return(FALSE)
+  }
+  # Append a bogus ("ab", "le") row as a false positive.
+  pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
+
+  conn <- file('graph.txt', 'w+')
+  # Close the connection even if writing the graph fails.
+  tryCatch(WriteKPartiteGraph(conn, pairwise_candidates, sim$pairings,
+                              ngram_params$num_ngrams,
+                              ngram_params$ngram_size),
+           finally = close(conn))
+  cat("Wrote graph.txt\n")
+  found_candidates <- FindFeasibleStrings(pairwise_candidates, sim$pairings,
+                                          ngram_params$num_ngrams,
+                                          ngram_params$ngram_size)
+  # Check that the correct strings are found. all.equal() returns a
+  # description, not FALSE, on mismatch, so it is wrapped in isTRUE().
+  if (num_strs == 1) {
+    checkTrue(found_candidates == sort(unique(sim$strs)))
+  } else {
+    checkTrue(isTRUE(all.equal(found_candidates, sort(unique(sim$strs)))))
+  }
+}