diff options
author | Xin Li <delphij@google.com> | 2024-03-06 09:29:53 -0800 |
---|---|---|
committer | Xin Li <delphij@google.com> | 2024-03-06 09:29:53 -0800 |
commit | 42996c97b96f0da75543f0fee670f9e8cc595744 (patch) | |
tree | 9892cbbd0fb247ce252a38f258e33741ca025a45 | |
parent | 95f4ed2f5a7365814321253aea37ce9cd9572dc6 (diff) | |
parent | b6d2d80d87a6b4096bb32833cc6ac67295713f55 (diff) | |
download | icing-42996c97b96f0da75543f0fee670f9e8cc595744.tar.gz |
Merge Android 14 QPR2 to AOSP main
Bug: 319669529
Merged-In: I986936f038c20a9b2579d658eea5176ede545011
Change-Id: Ia3512630814deff7c2dde734fb3c415fdc3c5673
200 files changed, 24584 insertions, 3901 deletions
@@ -54,6 +54,8 @@ cc_defaults { "-funsigned-char", "-fvisibility=hidden", + + "-Bsymbolic", ], apex_available: ["com.android.appsearch"], } diff --git a/build.gradle b/build.gradle index a3aa34d..d0d1a39 100644 --- a/build.gradle +++ b/build.gradle @@ -14,66 +14,42 @@ * limitations under the License. */ -import androidx.build.SupportConfig +import androidx.build.SdkHelperKt plugins { - id('AndroidXPlugin') - id('com.android.library') - id('com.google.protobuf') + id("AndroidXPlugin") + id("java-library") + id("com.google.protobuf") } -android { - buildToolsVersion SupportConfig.buildToolsVersion(project) - compileSdkVersion SupportConfig.COMPILE_SDK_VERSION - defaultConfig { - minSdkVersion SupportConfig.DEFAULT_MIN_SDK_VERSION - targetSdkVersion SupportConfig.TARGET_SDK_VERSION - testInstrumentationRunner SupportConfig.INSTRUMENTATION_RUNNER +sourceSets { + main { + java.srcDir 'java/src/' + proto.srcDir 'proto/' } - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 - } - sourceSets { - main { - java.srcDir 'java/src/' - proto.srcDir 'proto/' - } - // TODO(b/161205849): Re-enable this test once icing nativeLib is no longer being built - // inside appsearch:appsearch. 
- //androidTest.java.srcDir 'java/tests/instrumentation/' - } - namespace "com.google.android.icing" -} - -// This project has no device tests, skip building it -androidComponents { - beforeVariants(selector().withName("debug"), { variantBuilder -> - variantBuilder.enableAndroidTest = false - }) } dependencies { - api('androidx.annotation:annotation:1.1.0') - - implementation('com.google.protobuf:protobuf-javalite:3.10.0') + compileOnly("androidx.annotation:annotation:1.1.0") + compileOnly(SdkHelperKt.getSdkDependency(project)) + implementation(libs.protobufLite) +} - androidTestImplementation(libs.testCore) - androidTestImplementation(libs.testRules) - androidTestImplementation(libs.truth) - androidTestImplementation(libs.kotlinBom) +afterEvaluate { + lint { + lintOptions { + // protobuf generates unannotated methods + disable("UnknownNullness") + } + } } protobuf { protoc { artifact = libs.protobufCompiler.get() } - generateProtoTasks { all().each { task -> - project.tasks.named("extractReleaseAnnotations").configure { - it.dependsOn(task) - } task.builtins { java { option 'lite' @@ -83,30 +59,6 @@ protobuf { } } -// Create export artifact for all variants (debug/release) for JarJaring -android.libraryVariants.all { variant -> - def variantName = variant.name - def suffix = variantName.capitalize() - def exportJarTask = tasks.register("exportJar${suffix}", Jar) { - archiveBaseName.set("icing-${variantName}") - - // The proto-lite dependency includes .proto files, which are not used by icing. When apps - // depend on appsearch as well as proto-lite directly, these files conflict since jarjar - // only renames the java classes. Remove them here since they are unused. - // Expand the jar and remove any .proto files. 
- from(zipTree(configurations.detachedConfiguration( - dependencies.create(libs.protobufLite.get())).getSingleFile())) { - exclude("**/*.proto") - } - - from files(variant.javaCompileProvider.get().destinationDir) - dependsOn variant.javaCompileProvider.get() - } - - def exportConfiguration = configurations.register("export${suffix}") - artifacts.add(exportConfiguration.name, exportJarTask.flatMap { it.archiveFile }) -} - androidx { mavenVersion = LibraryVersions.APPSEARCH } diff --git a/icing/file/file-backed-vector_benchmark.cc b/icing/file/file-backed-vector_benchmark.cc index b2e660b..0447e93 100644 --- a/icing/file/file-backed-vector_benchmark.cc +++ b/icing/file/file-backed-vector_benchmark.cc @@ -68,7 +68,7 @@ void BM_Set(benchmark::State& state) { MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); // Extend to num_elts - fbv->Set(num_elts - 1, 0); + ICING_ASSERT_OK(fbv->Set(num_elts - 1, 0)); std::uniform_int_distribution<> distrib(0, num_elts - 1); for (auto _ : state) { diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc index 729b09a..6936c45 100644 --- a/icing/file/persistent-hash-map.cc +++ b/icing/file/persistent-hash-map.cc @@ -27,6 +27,7 @@ #include "icing/absl_ports/str_cat.h" #include "icing/file/file-backed-vector.h" #include "icing/file/memory-mapped-file.h" +#include "icing/file/persistent-storage.h" #include "icing/util/crc32.h" #include "icing/util/status-macros.h" @@ -167,6 +168,8 @@ PersistentHashMap::~PersistentHashMap() { libtextclassifier3::Status PersistentHashMap::Put(std::string_view key, const void* value) { + SetDirty(); + ICING_RETURN_IF_ERROR(ValidateKey(key)); ICING_ASSIGN_OR_RETURN( int32_t bucket_idx, @@ -207,6 +210,7 @@ libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key, FindEntryIndexByKey(bucket_idx, key)); if (idx_pair.target_entry_index == Entry::kInvalidIndex) { // If not found, then insert new key value pair. 
+ SetDirty(); return Insert(bucket_idx, key, next_value); } @@ -232,6 +236,8 @@ libtextclassifier3::Status PersistentHashMap::Get(std::string_view key, } libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) { + SetDirty(); + ICING_RETURN_IF_ERROR(ValidateKey(key)); ICING_ASSIGN_OR_RETURN( int32_t bucket_idx, @@ -514,6 +520,7 @@ PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem, << " to " << persistent_hash_map->options_.max_load_factor_percent; + persistent_hash_map->SetInfoDirty(); persistent_hash_map->info().max_load_factor_percent = persistent_hash_map->options_.max_load_factor_percent; ICING_RETURN_IF_ERROR( @@ -525,26 +532,50 @@ PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem, return persistent_hash_map; } -libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk() { +libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk()); ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk()); ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk()); + is_storage_dirty_ = false; return libtextclassifier3::Status::OK; } -libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk() { +libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk( + bool force) { + // We can skip persisting metadata to disk only if both info and storage are + // clean. + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + // Changes should have been applied to the underlying file when using // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an // extra safety step to ensure they are written out. 
- return metadata_mmapped_file_->PersistToDisk(); + ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk()); + is_info_dirty_ = false; + return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } -libtextclassifier3::StatusOr<Crc32> -PersistentHashMap::ComputeStoragesChecksum() { +libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeStoragesChecksum( + bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + // Compute crcs ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc, bucket_storage_->ComputeChecksum()); @@ -602,6 +633,8 @@ libtextclassifier3::Status PersistentHashMap::CopyEntryValue( libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx, std::string_view key, const void* value) { + SetDirty(); + // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries, // then return error. // We compute max_file_size of 3 storages by options_.max_num_entries. Since @@ -655,6 +688,8 @@ libtextclassifier3::Status PersistentHashMap::RehashIfNecessary( return libtextclassifier3::Status::OK; } + SetDirty(); + // Resize and reset buckets. ICING_RETURN_IF_ERROR( bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex))); @@ -681,7 +716,7 @@ libtextclassifier3::Status PersistentHashMap::RehashIfNecessary( // # of vector elements may be greater than the actual # of entries. // Therefore, we have to truncate entry_storage_ to the correct size. 
if (entry_idx < entry_storage_->num_elements()) { - entry_storage_->TruncateTo(entry_idx); + ICING_RETURN_IF_ERROR(entry_storage_->TruncateTo(entry_idx)); } info().num_deleted_entries = 0; diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h index 845b22a..5f7999d 100644 --- a/icing/file/persistent-hash-map.h +++ b/icing/file/persistent-hash-map.h @@ -394,7 +394,9 @@ class PersistentHashMap : public PersistentStorage { std::move(metadata_mmapped_file))), bucket_storage_(std::move(bucket_storage)), entry_storage_(std::move(entry_storage)), - kv_storage_(std::move(kv_storage)) {} + kv_storage_(std::move(kv_storage)), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, @@ -409,20 +411,20 @@ class PersistentHashMap : public PersistentStorage { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Flushes contents of metadata file. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. Checksums of bucket_storage_, // entry_storage_ and kv_storage_ will be combined together by XOR. 
@@ -430,7 +432,8 @@ class PersistentHashMap : public PersistentStorage { // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; // Find the index of the target entry (that contains the key) from a bucket // (specified by bucket index). Also return the previous entry index, since @@ -496,6 +499,17 @@ class PersistentHashMap : public PersistentStorage { kInfoMetadataFileOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + Options options_; std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; @@ -504,6 +518,9 @@ class PersistentHashMap : public PersistentStorage { std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_; std::unique_ptr<FileBackedVector<Entry>> entry_storage_; std::unique_ptr<FileBackedVector<char>> kv_storage_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/file/persistent-storage.h b/icing/file/persistent-storage.h index 727cae9..9cb5e4d 100644 --- a/icing/file/persistent-storage.h +++ b/icing/file/persistent-storage.h @@ -148,8 +148,9 @@ class PersistentStorage { return libtextclassifier3::Status::OK; } - ICING_RETURN_IF_ERROR(UpdateChecksumsInternal()); - ICING_RETURN_IF_ERROR(PersistMetadataToDisk()); + ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(/*force=*/true)); + ICING_RETURN_IF_ERROR(PersistStoragesToDisk(/*force=*/true)); + ICING_RETURN_IF_ERROR(PersistMetadataToDisk(/*force=*/true)); is_initialized_ = true; return libtextclassifier3::Status::OK; @@ -184,38 +185,52 @@ class 
PersistentStorage { // 2) Updates all checksums by new data. // 3) Flushes metadata. // + // Force flag will be passed down to PersistMetadataToDisk, + // PersistStoragesToDisk, ComputeInfoChecksum, ComputeStoragesChecksum. + // - If force == true, then performs actual persisting operations/recomputes + // the checksum. + // - Otherwise, the derived class can decide itself whether skipping + // persisting operations/doing lazy checksum recomputing if the storage is + // not dirty. + // // Returns: // - OK on success // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized // - Any errors from PersistStoragesToDisk, UpdateChecksums, // PersistMetadataToDisk, depending on actual implementation - libtextclassifier3::Status PersistToDisk() { + libtextclassifier3::Status PersistToDisk(bool force = false) { if (!is_initialized_) { return absl_ports::FailedPreconditionError(absl_ports::StrCat( "PersistentStorage ", working_path_, " not initialized")); } - ICING_RETURN_IF_ERROR(PersistStoragesToDisk()); - ICING_RETURN_IF_ERROR(UpdateChecksums()); - ICING_RETURN_IF_ERROR(PersistMetadataToDisk()); + ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(force)); + ICING_RETURN_IF_ERROR(PersistStoragesToDisk(force)); + ICING_RETURN_IF_ERROR(PersistMetadataToDisk(force)); return libtextclassifier3::Status::OK; } // Updates checksums of all components and returns the overall crc (all_crc) // of the persistent storage. // + // Force flag will be passed down ComputeInfoChecksum, + // ComputeStoragesChecksum. + // - If force == true, then recomputes the checksum. + // - Otherwise, the derived class can decide itself whether doing lazy + // checksum recomputing if the storage is not dirty. 
+ // // Returns: // - Overall crc of the persistent storage on success // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending // on actual implementation - libtextclassifier3::StatusOr<Crc32> UpdateChecksums() { + libtextclassifier3::StatusOr<Crc32> UpdateChecksums(bool force = false) { if (!is_initialized_) { return absl_ports::FailedPreconditionError(absl_ports::StrCat( "PersistentStorage ", working_path_, " not initialized")); } - return UpdateChecksumsInternal(); + return UpdateChecksumsInternal(force); } protected: @@ -234,33 +249,41 @@ class PersistentStorage { // Returns: // - OK on success // - Any other errors, depending on actual implementation - virtual libtextclassifier3::Status PersistMetadataToDisk() = 0; + virtual libtextclassifier3::Status PersistMetadataToDisk(bool force) = 0; // Flushes contents of all storages to underlying files. // // Returns: // - OK on success // - Any other errors, depending on actual implementation - virtual libtextclassifier3::Status PersistStoragesToDisk() = 0; + virtual libtextclassifier3::Status PersistStoragesToDisk(bool force) = 0; // Computes and returns Info checksum. + // - If force = true, then recompute the entire checksum. + // - Otherwise, the derived class can decide itself whether doing lazy + // checksum computing if the storage is not dirty. // // This function will be mainly called by UpdateChecksums. // // Returns: // - Crc of the Info on success // - Any other errors, depending on actual implementation - virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum( + bool force) = 0; // Computes and returns all storages checksum. If there are multiple storages, // usually we XOR their checksums together to a single checksum. + // - If force = true, then recompute the entire checksum. 
+ // - Otherwise, the derived class can decide itself whether doing lazy + // checksum computing if the storage is not dirty. // // This function will be mainly called by UpdateChecksums. // // Returns: // - Crc of all storages on success // - Any other errors from depending on actual implementation - virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) = 0; // Returns the Crcs instance reference. The derived class can either own a // concrete Crcs instance, or reinterpret_cast the memory-mapped region to @@ -292,11 +315,18 @@ class PersistentStorage { // - Overall crc of the persistent storage on success // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending // on actual implementation - libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal() { + libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal(bool force) { Crcs& crcs_ref = crcs(); // Compute and update storages + info checksums. - ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum()); - ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(force)); + ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum(force)); + if (crcs_ref.component_crcs.info_crc == info_crc.Get() && + crcs_ref.component_crcs.storages_crc == storages_crc.Get()) { + // If info and storages crc haven't changed, then we don't have to update + // checksums. 
+ return Crc32(crcs_ref.all_crc); + } + crcs_ref.component_crcs.info_crc = info_crc.Get(); crcs_ref.component_crcs.storages_crc = storages_crc.Get(); @@ -318,12 +348,13 @@ class PersistentStorage { return absl_ports::FailedPreconditionError("Invalid all crc"); } - ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(/*force=*/true)); if (crcs_ref.component_crcs.info_crc != info_crc.Get()) { return absl_ports::FailedPreconditionError("Invalid info crc"); } - ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, + ComputeStoragesChecksum(/*force=*/true)); if (crcs_ref.component_crcs.storages_crc != storages_crc.Get()) { return absl_ports::FailedPreconditionError("Invalid storages crc"); } diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc index bf5e604..cc70151 100644 --- a/icing/file/portable-file-backed-proto-log_test.cc +++ b/icing/file/portable-file-backed-proto-log_test.cc @@ -1124,7 +1124,8 @@ TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldSetZero) { // document1_offset + sizeof(int) is the start byte of the proto where // sizeof(int) is the size of the proto metadata. 
- mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1); + ICING_ASSERT_OK( + mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1)); for (size_t i = 0; i < mmapped_file.region_size(); ++i) { ASSERT_THAT(mmapped_file.region()[i], Eq(0)); } diff --git a/icing/file/posting_list/flash-index-storage.cc b/icing/file/posting_list/flash-index-storage.cc index cd7ac12..2198d2c 100644 --- a/icing/file/posting_list/flash-index-storage.cc +++ b/icing/file/posting_list/flash-index-storage.cc @@ -75,7 +75,11 @@ FlashIndexStorage::ReadHeaderMagic(const Filesystem* filesystem, FlashIndexStorage::~FlashIndexStorage() { if (header_block_ != nullptr) { - FlushInMemoryFreeList(); + libtextclassifier3::Status status = FlushInMemoryFreeList(); + if (!status.ok()) { + ICING_LOG(ERROR) << "Cannot flush in memory free list: " + << status.error_message(); + } PersistToDisk(); } } @@ -487,6 +491,9 @@ libtextclassifier3::Status FlashIndexStorage::FreePostingList( PostingListHolder&& holder) { ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(holder.id.block_index())); + if (block.posting_list_bytes() == max_posting_list_bytes()) { + ICING_RETURN_IF_ERROR(block.SetNextBlockIndex(kInvalidBlockIndex)); + } uint32_t posting_list_bytes = block.posting_list_bytes(); int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes); diff --git a/icing/file/posting_list/flash-index-storage_test.cc b/icing/file/posting_list/flash-index-storage_test.cc index 3e2d239..ef60037 100644 --- a/icing/file/posting_list/flash-index-storage_test.cc +++ b/icing/file/posting_list/flash-index-storage_test.cc @@ -249,7 +249,8 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); // 3. Now, free the first posting list. This should add it to the free list - flash_index_storage.FreePostingList(std::move(posting_list_holder1)); + ICING_ASSERT_OK( + flash_index_storage.FreePostingList(std::move(posting_list_holder1))); // 4. 
Request another posting list. This should NOT grow the index because // the first posting list is free. @@ -349,7 +350,8 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); // 3. Now, free the first posting list. This should add it to the free list - flash_index_storage.FreePostingList(std::move(posting_list_holder1)); + ICING_ASSERT_OK( + flash_index_storage.FreePostingList(std::move(posting_list_holder1))); // 4. Request another posting list. This should NOT grow the index because // the first posting list is free. @@ -452,7 +454,8 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { // 3. Now, free the first posting list. This should add it to the free // list - flash_index_storage.FreePostingList(std::move(posting_list_holder1)); + ICING_ASSERT_OK( + flash_index_storage.FreePostingList(std::move(posting_list_holder1))); } EXPECT_THAT(flash_index_storage.GetDiskUsage(), diff --git a/icing/file/posting_list/index-block_test.cc b/icing/file/posting_list/index-block_test.cc index fcc134a..ebc9ba4 100644 --- a/icing/file/posting_list/index-block_test.cc +++ b/icing/file/posting_list/index-block_test.cc @@ -292,7 +292,7 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { // Now free the first posting list. Then, reallocate it and fill it with a // different set of hits. 
- block.FreePostingList(alloc_info_1.posting_list_index); + ICING_ASSERT_OK(block.FreePostingList(alloc_info_1.posting_list_index)); EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsTrue())); std::vector<Hit> hits_in_posting_list3{ diff --git a/icing/file/posting_list/posting-list-accessor.cc b/icing/file/posting_list/posting-list-accessor.cc index 67d7a21..a7cdb17 100644 --- a/icing/file/posting_list/posting-list-accessor.cc +++ b/icing/file/posting_list/posting-list-accessor.cc @@ -16,7 +16,10 @@ #include <cstdint> #include <memory> +#include <utility> +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/posting_list/flash-index-storage.h" #include "icing/file/posting_list/posting-list-identifier.h" @@ -40,13 +43,15 @@ libtextclassifier3::Status PostingListAccessor::FlushPreexistingPostingList() { // and free this posting list. // // Move will always succeed since in_memory_posting_list_ is max_pl_bytes. - GetSerializer()->MoveFrom(/*dst=*/&in_memory_posting_list_, - /*src=*/&preexisting_posting_list_->posting_list); + ICING_RETURN_IF_ERROR(GetSerializer()->MoveFrom( + /*dst=*/&in_memory_posting_list_, + /*src=*/&preexisting_posting_list_->posting_list)); // Now that all the contents of this posting list have been copied, there's // no more use for it. Make it available to be used for another posting // list. 
- storage_->FreePostingList(std::move(*preexisting_posting_list_)); + ICING_RETURN_IF_ERROR( + storage_->FreePostingList(std::move(*preexisting_posting_list_))); } preexisting_posting_list_.reset(); return libtextclassifier3::Status::OK; diff --git a/icing/file/posting_list/posting-list-identifier.h b/icing/file/posting_list/posting-list-identifier.h index 78821e8..8a0229b 100644 --- a/icing/file/posting_list/posting-list-identifier.h +++ b/icing/file/posting_list/posting-list-identifier.h @@ -59,6 +59,8 @@ class PostingListIdentifier { public: static PostingListIdentifier kInvalid; + explicit PostingListIdentifier() { *this = kInvalid; } + // 1. block_index - the index of this block within the FlashIndexStorage file // 2. posting_list_index - the index of this posting list within the block // 3. posting_list_index_bits - the number of bits needed to encode the diff --git a/icing/file/version-util.cc b/icing/file/version-util.cc index f477072..dd233e0 100644 --- a/icing/file/version-util.cc +++ b/icing/file/version-util.cc @@ -102,6 +102,48 @@ StateChange GetVersionStateChange(const VersionInfo& existing_version_info, } } +bool ShouldRebuildDerivedFiles(const VersionInfo& existing_version_info, + int32_t curr_version) { + StateChange state_change = + GetVersionStateChange(existing_version_info, curr_version); + switch (state_change) { + case StateChange::kCompatible: + return false; + case StateChange::kUndetermined: + [[fallthrough]]; + case StateChange::kRollBack: + [[fallthrough]]; + case StateChange::kRollForward: + [[fallthrough]]; + case StateChange::kVersionZeroRollForward: + [[fallthrough]]; + case StateChange::kVersionZeroUpgrade: + return true; + case StateChange::kUpgrade: + break; + } + + bool should_rebuild = false; + int32_t existing_version = existing_version_info.version; + while (existing_version < curr_version) { + switch (existing_version) { + case 1: { + // version 1 -> version 2 upgrade, no need to rebuild + break; + } + case 2: { + // version 
2 -> version 3 upgrade, no need to rebuild + break; + } + default: + // This should not happen. Rebuild anyway if unsure. + should_rebuild |= true; + } + ++existing_version; + } + return should_rebuild; +} + } // namespace version_util } // namespace lib diff --git a/icing/file/version-util.h b/icing/file/version-util.h index 7fa7fbd..b2d51df 100644 --- a/icing/file/version-util.h +++ b/icing/file/version-util.h @@ -27,10 +27,18 @@ namespace lib { namespace version_util { -// - Version 0: Android T. Can be identified only by flash index magic. -// - Version 1: mainline release 2023-06. -inline static constexpr int32_t kVersion = 1; +// - Version 0: Android T base. Can be identified only by flash index magic. +// - Version 1: Android U base and M-2023-08. +// - Version 2: M-2023-09, M-2023-11, M-2024-01. Schema is compatible with v1. +// (There were no M-2023-10, M-2023-12). +// - Version 3: M-2024-02. Schema is compatible with v1 and v2. +// +// LINT.IfChange(kVersion) +inline static constexpr int32_t kVersion = 3; +// LINT.ThenChange(//depot/google3/icing/schema/schema-store.cc:min_overlay_version_compatibility) inline static constexpr int32_t kVersionOne = 1; +inline static constexpr int32_t kVersionTwo = 2; +inline static constexpr int32_t kVersionThree = 3; inline static constexpr int kVersionZeroFlashIndexMagic = 0x6dfba6ae; @@ -89,6 +97,16 @@ libtextclassifier3::Status WriteVersion(const Filesystem& filesystem, StateChange GetVersionStateChange(const VersionInfo& existing_version_info, int32_t curr_version = kVersion); +// Helper method to determine whether Icing should rebuild all derived files. +// Sometimes it is not required to rebuild derived files when +// roll-forward/upgrading. This function "encodes" upgrade paths and checks if +// the roll-forward/upgrading requires derived files to be rebuilt or not. +// +// REQUIRES: curr_version > 0. 
We implement version checking in version 1, so +// the callers (except unit tests) will always use a version # greater than 0. +bool ShouldRebuildDerivedFiles(const VersionInfo& existing_version_info, + int32_t curr_version = kVersion); + } // namespace version_util } // namespace lib diff --git a/icing/file/version-util_test.cc b/icing/file/version-util_test.cc index 78cdb7d..9dedb1d 100644 --- a/icing/file/version-util_test.cc +++ b/icing/file/version-util_test.cc @@ -32,6 +32,8 @@ namespace version_util { namespace { using ::testing::Eq; +using ::testing::IsFalse; +using ::testing::IsTrue; struct VersionUtilReadVersionTestParam { std::optional<VersionInfo> existing_version_info; @@ -339,6 +341,14 @@ INSTANTIATE_TEST_SUITE_P( /*curr_version_in=*/2, /*expected_state_change_in=*/StateChange::kRollForward), + // - version 1, max_version 2 + // - Current version = 3 + // - Result: roll forward + VersionUtilStateChangeTestParam( + /*existing_version_info_in=*/VersionInfo(1, 2), + /*curr_version_in=*/3, + /*expected_state_change_in=*/StateChange::kRollForward), + // - version 1, max_version 3 // - Current version = 2 // - Result: roll forward @@ -379,6 +389,94 @@ INSTANTIATE_TEST_SUITE_P( /*curr_version_in=*/2, /*expected_state_change_in=*/StateChange::kRollBack))); +TEST(VersionUtilTest, ShouldRebuildDerivedFilesUndeterminedVersion) { + EXPECT_THAT( + ShouldRebuildDerivedFiles(VersionInfo(-1, -1), /*curr_version=*/1), + IsTrue()); + EXPECT_THAT( + ShouldRebuildDerivedFiles(VersionInfo(-1, -1), /*curr_version=*/2), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesVersionZeroUpgrade) { + // 0 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 0), /*curr_version=*/1), + IsTrue()); + + // 0 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 0), /*curr_version=*/2), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesVersionZeroRollForward) { + // (1 -> 0), 0 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 1), 
/*curr_version=*/1), + IsTrue()); + + // (1 -> 0), 0 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 1), /*curr_version=*/2), + IsTrue()); + + // (2 -> 0), 0 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 2), /*curr_version=*/1), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesRollBack) { + // 2 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 2), /*curr_version=*/1), + IsTrue()); + + // 3 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(3, 3), /*curr_version=*/1), + IsTrue()); + + // (3 -> 2), 2 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 3), /*curr_version=*/1), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesRollForward) { + // (2 -> 1), 1 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 2), /*curr_version=*/2), + IsTrue()); + + // (2 -> 1), 1 -> 3 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 2), /*curr_version=*/3), + IsTrue()); + + // (3 -> 1), 1 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 3), /*curr_version=*/2), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesCompatible) { + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 2), /*curr_version=*/2), + IsFalse()); + + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 3), /*curr_version=*/2), + IsFalse()); +} + +TEST(VersionUtilTest, Upgrade) { + // Unlike other state changes, upgrade depends on the actual "encoded path". + + // kVersionOne -> kVersionTwo + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionOne, kVersionOne), + /*curr_version=*/kVersionTwo), + IsFalse()); + + // kVersionTwo -> kVersionThree + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionTwo, kVersionTwo), + /*curr_version=*/kVersionThree), + IsFalse()); + + // kVersionOne -> kVersionThree. 
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionOne, kVersionOne), + /*curr_version=*/kVersionThree), + IsFalse()); +} + } // namespace } // namespace version_util diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 2cdf930..72be4e9 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -40,10 +40,12 @@ #include "icing/index/integer-section-indexing-handler.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index.h" -#include "icing/index/string-section-indexing-handler.h" +#include "icing/index/term-indexing-handler.h" #include "icing/join/join-processor.h" +#include "icing/join/qualified-id-join-index-impl-v1.h" +#include "icing/join/qualified-id-join-index-impl-v2.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id-join-indexing-handler.h" -#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/portable/endian.h" #include "icing/proto/debug.pb.h" @@ -87,6 +89,7 @@ #include "icing/transform/normalizer.h" #include "icing/util/clock.h" #include "icing/util/crc32.h" +#include "icing/util/data-loss.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" @@ -141,6 +144,15 @@ libtextclassifier3::Status ValidateResultSpec( "ResultSpecProto.num_total_bytes_per_page_threshold cannot be " "non-positive."); } + if (result_spec.max_joined_children_per_parent_to_return() < 0) { + return absl_ports::InvalidArgumentError( + "ResultSpecProto.max_joined_children_per_parent_to_return cannot be " + "negative."); + } + if (result_spec.num_to_score() <= 0) { + return absl_ports::InvalidArgumentError( + "ResultSpecProto.num_to_score cannot be non-positive."); + } // Validate ResultGroupings. 
std::unordered_set<int32_t> unique_entry_ids; ResultSpecProto::ResultGroupingType result_grouping_type = @@ -218,6 +230,29 @@ libtextclassifier3::Status ValidateSuggestionSpec( return libtextclassifier3::Status::OK; } +bool IsV2QualifiedIdJoinIndexEnabled(const IcingSearchEngineOptions& options) { + return options.use_new_qualified_id_join_index() && + options.document_store_namespace_id_fingerprint(); +} + +libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>> +CreateQualifiedIdJoinIndex(const Filesystem& filesystem, + std::string qualified_id_join_index_dir, + const IcingSearchEngineOptions& options) { + if (IsV2QualifiedIdJoinIndexEnabled(options)) { + // V2 + return QualifiedIdJoinIndexImplV2::Create( + filesystem, std::move(qualified_id_join_index_dir), + options.pre_mapping_fbv()); + } else { + // V1 + // TODO(b/275121148): deprecate this part after rollout v2. + return QualifiedIdJoinIndexImplV1::Create( + filesystem, std::move(qualified_id_join_index_dir), + options.pre_mapping_fbv(), options.use_persistent_hash_map()); + } +} + // Version file is a single file under base_dir containing version info of the // existing data. std::string MakeVersionFilePath(const std::string& base_dir) { @@ -583,8 +618,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()), version_state_change, version_util::kVersion)); - // Step 2: discard all derived data - ICING_RETURN_IF_ERROR(DiscardDerivedFiles()); + // Step 2: discard all derived data if needed rebuild. 
+ if (version_util::ShouldRebuildDerivedFiles(version_info)) { + ICING_RETURN_IF_ERROR(DiscardDerivedFiles()); + } // Step 3: update version file version_util::VersionInfo new_version_info( @@ -621,29 +658,40 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) || !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) || !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() || - !QualifiedIdTypeJoinableIndex::Discard(*filesystem_, - qualified_id_join_index_dir) + !QualifiedIdJoinIndex::Discard(*filesystem_, + qualified_id_join_index_dir) .ok()) { return absl_ports::InternalError(absl_ports::StrCat( "Could not delete directories: ", index_dir, ", ", integer_index_dir, ", ", qualified_id_join_index_dir, " and ", doc_store_dir)); } - ICING_RETURN_IF_ERROR(InitializeDocumentStore( - /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); - index_init_status = InitializeIndex(initialize_stats); + ICING_ASSIGN_OR_RETURN( + bool document_store_derived_files_regenerated, + InitializeDocumentStore( + /*force_recovery_and_revalidate_documents=*/false, + initialize_stats)); + index_init_status = InitializeIndex( + document_store_derived_files_regenerated, initialize_stats); if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { return index_init_status; } } else if (filesystem_->FileExists(marker_filepath.c_str())) { // If the marker file is still around then something wonky happened when we // last tried to set the schema. + // + // Since we're going to rebuild all indices in this case, the return value + // of InitializeDocumentStore (document_store_derived_files_regenerated) is + // unused. ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/true, initialize_stats)); // We're going to need to build the index from scratch. So just delete its // directory now. 
// Discard index directory and instantiate a new one. - Index::Options index_options(index_dir, options_.index_merge_size()); + Index::Options index_options( + index_dir, options_.index_merge_size(), + options_.lite_index_sort_at_indexing(), options_.lite_index_sort_size(), + options_.build_property_existence_metadata_hits()); if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) || !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) { return absl_ports::InternalError( @@ -661,18 +709,18 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( ICING_ASSIGN_OR_RETURN( integer_index_, IntegerIndex::Create(*filesystem_, std::move(integer_index_dir), + options_.integer_index_bucket_split_threshold(), options_.pre_mapping_fbv())); // Discard qualified id join index directory and instantiate a new one. std::string qualified_id_join_index_dir = MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()); - ICING_RETURN_IF_ERROR(QualifiedIdTypeJoinableIndex::Discard( + ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard( *filesystem_, qualified_id_join_index_dir)); ICING_ASSIGN_OR_RETURN( qualified_id_join_index_, - QualifiedIdTypeJoinableIndex::Create( - *filesystem_, std::move(qualified_id_join_index_dir), - options_.pre_mapping_fbv(), options_.use_persistent_hash_map())); + CreateQualifiedIdJoinIndex( + *filesystem_, std::move(qualified_id_join_index_dir), options_)); std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); IndexRestorationResult restore_result = RestoreIndexIfNeeded(); @@ -697,9 +745,12 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( initialize_stats->set_qualified_id_join_index_restoration_cause( InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); } else if (version_state_change != version_util::StateChange::kCompatible) { - ICING_RETURN_IF_ERROR(InitializeDocumentStore( - /*force_recovery_and_revalidate_documents=*/true, initialize_stats)); - index_init_status = InitializeIndex(initialize_stats); + 
ICING_ASSIGN_OR_RETURN(bool document_store_derived_files_regenerated, + InitializeDocumentStore( + /*force_recovery_and_revalidate_documents=*/true, + initialize_stats)); + index_init_status = InitializeIndex( + document_store_derived_files_regenerated, initialize_stats); if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { return index_init_status; } @@ -715,9 +766,13 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( initialize_stats->set_qualified_id_join_index_restoration_cause( InitializeStatsProto::VERSION_CHANGED); } else { - ICING_RETURN_IF_ERROR(InitializeDocumentStore( - /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); - index_init_status = InitializeIndex(initialize_stats); + ICING_ASSIGN_OR_RETURN( + bool document_store_derived_files_regenerated, + InitializeDocumentStore( + /*force_recovery_and_revalidate_documents=*/false, + initialize_stats)); + index_init_status = InitializeIndex( + document_store_derived_files_regenerated, initialize_stats); if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { return index_init_status; } @@ -751,7 +806,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( +libtextclassifier3::StatusOr<bool> IcingSearchEngine::InitializeDocumentStore( bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); @@ -765,17 +820,19 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( } ICING_ASSIGN_OR_RETURN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(), - schema_store_.get(), - force_recovery_and_revalidate_documents, - options_.document_store_namespace_id_fingerprint(), - options_.compression_level(), initialize_stats)); + DocumentStore::Create( + 
filesystem_.get(), document_dir, clock_.get(), schema_store_.get(), + force_recovery_and_revalidate_documents, + options_.document_store_namespace_id_fingerprint(), + options_.pre_mapping_fbv(), options_.use_persistent_hash_map(), + options_.compression_level(), initialize_stats)); document_store_ = std::move(create_result.document_store); - return libtextclassifier3::Status::OK; + return create_result.derived_files_regenerated; } libtextclassifier3::Status IcingSearchEngine::InitializeIndex( + bool document_store_derived_files_regenerated, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); @@ -785,7 +842,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( return absl_ports::InternalError( absl_ports::StrCat("Could not create directory: ", index_dir)); } - Index::Options index_options(index_dir, options_.index_merge_size()); + Index::Options index_options( + index_dir, options_.index_merge_size(), + options_.lite_index_sort_at_indexing(), options_.lite_index_sort_size(), + options_.build_property_existence_metadata_hits()); // Term index InitializeStatsProto::RecoveryCause index_recovery_cause; @@ -816,8 +876,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( std::string integer_index_dir = MakeIntegerIndexWorkingPath(options_.base_dir()); InitializeStatsProto::RecoveryCause integer_index_recovery_cause; - auto integer_index_or = IntegerIndex::Create(*filesystem_, integer_index_dir, - options_.pre_mapping_fbv()); + auto integer_index_or = + IntegerIndex::Create(*filesystem_, integer_index_dir, + options_.integer_index_bucket_split_threshold(), + options_.pre_mapping_fbv()); if (!integer_index_or.ok()) { ICING_RETURN_IF_ERROR( IntegerIndex::Discard(*filesystem_, integer_index_dir)); @@ -828,6 +890,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( ICING_ASSIGN_OR_RETURN( integer_index_, IntegerIndex::Create(*filesystem_, std::move(integer_index_dir), + 
options_.integer_index_bucket_split_threshold(), options_.pre_mapping_fbv())); } else { // Integer index was created fine. @@ -842,29 +905,44 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( std::string qualified_id_join_index_dir = MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()); InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause; - auto qualified_id_join_index_or = QualifiedIdTypeJoinableIndex::Create( - *filesystem_, qualified_id_join_index_dir, options_.pre_mapping_fbv(), - options_.use_persistent_hash_map()); - if (!qualified_id_join_index_or.ok()) { - ICING_RETURN_IF_ERROR(QualifiedIdTypeJoinableIndex::Discard( + if (document_store_derived_files_regenerated && + IsV2QualifiedIdJoinIndexEnabled(options_)) { + // V2 qualified id join index depends on document store derived files, so we + // have to rebuild it from scratch if + // document_store_derived_files_regenerated is true. + ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard( *filesystem_, qualified_id_join_index_dir)); - qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR; - - // Try recreating it from scratch and rebuild everything. ICING_ASSIGN_OR_RETURN( qualified_id_join_index_, - QualifiedIdTypeJoinableIndex::Create( - *filesystem_, std::move(qualified_id_join_index_dir), - options_.pre_mapping_fbv(), options_.use_persistent_hash_map())); - } else { - // Qualified id join index was created fine. - qualified_id_join_index_ = - std::move(qualified_id_join_index_or).ValueOrDie(); - // If a recover does have to happen, then it must be because the index is - // out of sync with the document store. 
+ CreateQualifiedIdJoinIndex( + *filesystem_, std::move(qualified_id_join_index_dir), options_)); + qualified_id_join_index_recovery_cause = - InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH; + InitializeStatsProto::DEPENDENCIES_CHANGED; + } else { + auto qualified_id_join_index_or = CreateQualifiedIdJoinIndex( + *filesystem_, qualified_id_join_index_dir, options_); + if (!qualified_id_join_index_or.ok()) { + ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard( + *filesystem_, qualified_id_join_index_dir)); + + qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR; + + // Try recreating it from scratch and rebuild everything. + ICING_ASSIGN_OR_RETURN( + qualified_id_join_index_, + CreateQualifiedIdJoinIndex( + *filesystem_, std::move(qualified_id_join_index_dir), options_)); + } else { + // Qualified id join index was created fine. + qualified_id_join_index_ = + std::move(qualified_id_join_index_or).ValueOrDie(); + // If a recover does have to happen, then it must be because the index is + // out of sync with the document store. + qualified_id_join_index_recovery_cause = + InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH; + } } std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); @@ -1536,33 +1614,41 @@ OptimizeResultProto IcingSearchEngine::Optimize() { // TODO(b/143646633): figure out if we need to optimize index and doc store // at the same time. 
std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer(); - libtextclassifier3::StatusOr<std::vector<DocumentId>> - document_id_old_to_new_or = OptimizeDocumentStore(optimize_stats); + libtextclassifier3::StatusOr<DocumentStore::OptimizeResult> + optimize_result_or = OptimizeDocumentStore(optimize_stats); optimize_stats->set_document_store_optimize_latency_ms( optimize_doc_store_timer->GetElapsedMilliseconds()); - if (!document_id_old_to_new_or.ok() && - !absl_ports::IsDataLoss(document_id_old_to_new_or.status())) { + if (!optimize_result_or.ok() && + !absl_ports::IsDataLoss(optimize_result_or.status())) { // The status now is either ABORTED_ERROR or INTERNAL_ERROR. // If ABORTED_ERROR, Icing should still be working. // If INTERNAL_ERROR, we're having IO errors or other errors that we can't // recover from. - TransformStatus(document_id_old_to_new_or.status(), result_status); + TransformStatus(optimize_result_or.status(), result_status); return result_proto; } // The status is either OK or DATA_LOSS. The optimized document store is // guaranteed to work, so we update index according to the new document store. std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer(); + auto doc_store_optimize_result_status = optimize_result_or.status(); bool should_rebuild_index = - !document_id_old_to_new_or.ok() || + !optimize_result_or.ok() || + optimize_result_or.ValueOrDie().should_rebuild_index || ShouldRebuildIndex(*optimize_stats, options_.optimize_rebuild_index_threshold()); if (!should_rebuild_index) { + // At this point should_rebuild_index is false, so it means + // optimize_result_or.ok() is true and therefore it is safe to call + // ValueOrDie. 
+ DocumentStore::OptimizeResult optimize_result = + std::move(optimize_result_or).ValueOrDie(); + optimize_stats->set_index_restoration_mode( OptimizeStatsProto::INDEX_TRANSLATION); libtextclassifier3::Status index_optimize_status = - index_->Optimize(document_id_old_to_new_or.ValueOrDie(), + index_->Optimize(optimize_result.document_id_old_to_new, document_store_->last_added_document_id()); if (!index_optimize_status.ok()) { ICING_LOG(WARNING) << "Failed to optimize index. Error: " @@ -1571,7 +1657,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() { } libtextclassifier3::Status integer_index_optimize_status = - integer_index_->Optimize(document_id_old_to_new_or.ValueOrDie(), + integer_index_->Optimize(optimize_result.document_id_old_to_new, document_store_->last_added_document_id()); if (!integer_index_optimize_status.ok()) { ICING_LOG(WARNING) << "Failed to optimize integer index. Error: " @@ -1581,7 +1667,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() { libtextclassifier3::Status qualified_id_join_index_optimize_status = qualified_id_join_index_->Optimize( - document_id_old_to_new_or.ValueOrDie(), + optimize_result.document_id_old_to_new, + optimize_result.namespace_id_old_to_new, document_store_->last_added_document_id()); if (!qualified_id_join_index_optimize_status.ok()) { ICING_LOG(WARNING) @@ -1593,6 +1680,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() { // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a // valid document store, but it might be the old one or the new one. So throw // out the index data and rebuild from scratch. + // Also rebuild index if DocumentStore::OptimizeInto hints to do so. // Likewise, if Index::Optimize failed, then attempt to recover the index by // rebuilding from scratch. // If ShouldRebuildIndex() returns true, we will also rebuild the index for @@ -1651,7 +1739,11 @@ OptimizeResultProto IcingSearchEngine::Optimize() { // Update the status for this run and write it. 
auto optimize_status = std::make_unique<OptimizeStatusProto>(); optimize_status->set_last_successful_optimize_run_time_ms(current_time); - optimize_status_file.Write(std::move(optimize_status)); + auto write_status = optimize_status_file.Write(std::move(optimize_status)); + if (!write_status.ok()) { + ICING_LOG(ERROR) << "Failed to write optimize status:\n" + << write_status.error_message(); + } // Flushes data to disk after doing optimization status = InternalPersistToDisk(PersistType::FULL); @@ -1664,7 +1756,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() { optimize_stats->set_storage_size_after( Filesystem::SanitizeFileSize(after_size)); - TransformStatus(document_id_old_to_new_or.status(), result_status); + TransformStatus(doc_store_optimize_result_status, result_status); return result_proto; } @@ -1871,7 +1963,17 @@ SearchResultProto IcingSearchEngine::InternalSearch( StatusProto* result_status = result_proto.mutable_status(); QueryStatsProto* query_stats = result_proto.mutable_query_stats(); + query_stats->set_is_first_page(true); + query_stats->set_requested_page_size(result_spec.num_per_page()); + + // TODO(b/305098009): deprecate search-related flat fields in query_stats. 
+ query_stats->set_num_namespaces_filtered( + search_spec.namespace_filters_size()); + query_stats->set_num_schema_types_filtered( + search_spec.schema_type_filters_size()); query_stats->set_query_length(search_spec.query().length()); + query_stats->set_ranking_strategy(scoring_spec.rank_by()); + if (!initialized_) { result_status->set_code(StatusProto::FAILED_PRECONDITION); result_status->set_message("IcingSearchEngine has not been initialized!"); @@ -1890,27 +1992,22 @@ SearchResultProto IcingSearchEngine::InternalSearch( return result_proto; } - query_stats->set_num_namespaces_filtered( - search_spec.namespace_filters_size()); - query_stats->set_num_schema_types_filtered( - search_spec.schema_type_filters_size()); - query_stats->set_ranking_strategy(scoring_spec.rank_by()); - query_stats->set_is_first_page(true); - query_stats->set_requested_page_size(result_spec.num_per_page()); - const JoinSpecProto& join_spec = search_spec.join_spec(); std::unique_ptr<JoinChildrenFetcher> join_children_fetcher; std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info; int64_t current_time_ms = clock_->GetSystemTimeMilliseconds(); if (!join_spec.parent_property_expression().empty() && !join_spec.child_property_expression().empty()) { + query_stats->set_is_join_query(true); + QueryStatsProto::SearchStats* child_search_stats = + query_stats->mutable_child_search_stats(); + // Process child query QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore( join_spec.nested_spec().search_spec(), join_spec.nested_spec().scoring_spec(), join_spec.nested_spec().result_spec(), - /*join_children_fetcher=*/nullptr, current_time_ms); - // TOOD(b/256022027): set different kinds of latency for 2nd query. 
+ /*join_children_fetcher=*/nullptr, current_time_ms, child_search_stats); if (!nested_query_scoring_results.status.ok()) { TransformStatus(nested_query_scoring_results.status, result_status); return result_proto; @@ -1941,24 +2038,24 @@ SearchResultProto IcingSearchEngine::InternalSearch( } // Process parent query - QueryScoringResults query_scoring_results = - ProcessQueryAndScore(search_spec, scoring_spec, result_spec, - join_children_fetcher.get(), current_time_ms); - int term_count = 0; - for (const auto& section_and_terms : query_scoring_results.query_terms) { - term_count += section_and_terms.second.size(); - } - query_stats->set_num_terms(term_count); + QueryStatsProto::SearchStats* parent_search_stats = + query_stats->mutable_parent_search_stats(); + QueryScoringResults query_scoring_results = ProcessQueryAndScore( + search_spec, scoring_spec, result_spec, join_children_fetcher.get(), + current_time_ms, parent_search_stats); + // TODO(b/305098009): deprecate search-related flat fields in query_stats. 
+ query_stats->set_num_terms(parent_search_stats->num_terms()); query_stats->set_parse_query_latency_ms( - query_scoring_results.parse_query_latency_ms); - query_stats->set_scoring_latency_ms(query_scoring_results.scoring_latency_ms); + parent_search_stats->parse_query_latency_ms()); + query_stats->set_scoring_latency_ms( + parent_search_stats->scoring_latency_ms()); + query_stats->set_num_documents_scored( + parent_search_stats->num_documents_scored()); if (!query_scoring_results.status.ok()) { TransformStatus(query_scoring_results.status, result_status); return result_proto; } - query_stats->set_num_documents_scored( - query_scoring_results.scored_document_hits.size()); // Returns early for empty result if (query_scoring_results.scored_document_hits.empty()) { result_status->set_code(StatusProto::OK); @@ -2072,7 +2169,15 @@ SearchResultProto IcingSearchEngine::InternalSearch( IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore( const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec, const ResultSpecProto& result_spec, - const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms) { + const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms, + QueryStatsProto::SearchStats* search_stats) { + search_stats->set_num_namespaces_filtered( + search_spec.namespace_filters_size()); + search_stats->set_num_schema_types_filtered( + search_spec.schema_type_filters_size()); + search_stats->set_query_length(search_spec.query().length()); + search_stats->set_ranking_strategy(scoring_spec.rank_by()); + std::unique_ptr<Timer> component_timer = clock_->GetNewTimer(); // Gets unordered results from query processor @@ -2080,11 +2185,11 @@ IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore( index_.get(), integer_index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), schema_store_.get()); if (!query_processor_or.ok()) { - return QueryScoringResults( - 
std::move(query_processor_or).status(), /*query_terms_in=*/{}, - /*scored_document_hits_in=*/{}, - /*parse_query_latency_ms_in=*/component_timer->GetElapsedMilliseconds(), - /*scoring_latency_ms_in=*/0); + search_stats->set_parse_query_latency_ms( + component_timer->GetElapsedMilliseconds()); + return QueryScoringResults(std::move(query_processor_or).status(), + /*query_terms_in=*/{}, + /*scored_document_hits_in=*/{}); } std::unique_ptr<QueryProcessor> query_processor = std::move(query_processor_or).ValueOrDie(); @@ -2097,15 +2202,25 @@ IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore( } else { query_results_or = ranking_strategy_or.status(); } + search_stats->set_parse_query_latency_ms( + component_timer->GetElapsedMilliseconds()); if (!query_results_or.ok()) { - return QueryScoringResults( - std::move(query_results_or).status(), /*query_terms_in=*/{}, - /*scored_document_hits_in=*/{}, - /*parse_query_latency_ms_in=*/component_timer->GetElapsedMilliseconds(), - /*scoring_latency_ms_in=*/0); + return QueryScoringResults(std::move(query_results_or).status(), + /*query_terms_in=*/{}, + /*scored_document_hits_in=*/{}); } QueryResults query_results = std::move(query_results_or).ValueOrDie(); - int64_t parse_query_latency_ms = component_timer->GetElapsedMilliseconds(); + + // Set SearchStats related to QueryResults. + int term_count = 0; + for (const auto& section_and_terms : query_results.query_terms) { + term_count += section_and_terms.second.size(); + } + search_stats->set_num_terms(term_count); + + if (query_results.features_in_use.count(kNumericSearchFeature)) { + search_stats->set_is_numeric_query(true); + } component_timer = clock_->GetNewTimer(); // Scores but does not rank the results. 
@@ -2116,22 +2231,20 @@ IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore( if (!scoring_processor_or.ok()) { return QueryScoringResults(std::move(scoring_processor_or).status(), std::move(query_results.query_terms), - /*scored_document_hits_in=*/{}, - parse_query_latency_ms, - /*scoring_latency_ms_in=*/0); + /*scored_document_hits_in=*/{}); } std::unique_ptr<ScoringProcessor> scoring_processor = std::move(scoring_processor_or).ValueOrDie(); std::vector<ScoredDocumentHit> scored_document_hits = - scoring_processor->Score(std::move(query_results.root_iterator), - performance_configuration_.num_to_score, - &query_results.query_term_iterators); - int64_t scoring_latency_ms = component_timer->GetElapsedMilliseconds(); + scoring_processor->Score( + std::move(query_results.root_iterator), result_spec.num_to_score(), + &query_results.query_term_iterators, search_stats); + search_stats->set_scoring_latency_ms( + component_timer->GetElapsedMilliseconds()); return QueryScoringResults(libtextclassifier3::Status::OK, std::move(query_results.query_terms), - std::move(scored_document_hits), - parse_query_latency_ms, scoring_latency_ms); + std::move(scored_document_hits)); } SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { @@ -2222,7 +2335,7 @@ void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) { result_state_manager_->InvalidateResultState(next_page_token); } -libtextclassifier3::StatusOr<std::vector<DocumentId>> +libtextclassifier3::StatusOr<DocumentStore::OptimizeResult> IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { // Gets the current directory path and an empty tmp directory path for // document store optimization. 
@@ -2239,17 +2352,16 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { } // Copies valid document data to tmp directory - libtextclassifier3::StatusOr<std::vector<DocumentId>> - document_id_old_to_new_or = document_store_->OptimizeInto( - temporary_document_dir, language_segmenter_.get(), - options_.document_store_namespace_id_fingerprint(), optimize_stats); + libtextclassifier3::StatusOr<DocumentStore::OptimizeResult> + optimize_result_or = document_store_->OptimizeInto( + temporary_document_dir, language_segmenter_.get(), optimize_stats); // Handles error if any - if (!document_id_old_to_new_or.ok()) { + if (!optimize_result_or.ok()) { filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str()); return absl_ports::Annotate( absl_ports::AbortedError("Failed to optimize document store"), - document_id_old_to_new_or.status().error_message()); + optimize_result_or.status().error_message()); } // result_state_manager_ depends on document_store_. So we need to reset it at @@ -2280,6 +2392,7 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { filesystem_.get(), current_document_dir, clock_.get(), schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, options_.document_store_namespace_id_fingerprint(), + options_.pre_mapping_fbv(), options_.use_persistent_hash_map(), options_.compression_level(), /*initialize_stats=*/nullptr); // TODO(b/144458732): Implement a more robust version of // TC_ASSIGN_OR_RETURN that can support error logging. 
@@ -2307,6 +2420,7 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { filesystem_.get(), current_document_dir, clock_.get(), schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, options_.document_store_namespace_id_fingerprint(), + options_.pre_mapping_fbv(), options_.use_persistent_hash_map(), options_.compression_level(), /*initialize_stats=*/nullptr); if (!create_result_or.ok()) { // Unable to create DocumentStore from the new file. Mark as uninitialized @@ -2316,7 +2430,9 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { "Document store has been optimized, but a valid document store " "instance can't be created"); } - document_store_ = std::move(create_result_or.ValueOrDie().document_store); + DocumentStore::CreateResult create_result = + std::move(create_result_or).ValueOrDie(); + document_store_ = std::move(create_result.document_store); result_state_manager_ = std::make_unique<ResultStateManager>( performance_configuration_.max_num_total_hits, *document_store_); @@ -2326,7 +2442,19 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { ICING_LOG(ERROR) << "Document store has been optimized, but it failed to " "delete temporary file directory"; } - return document_id_old_to_new_or; + + // Since we created new (optimized) document store with correct PersistToDisk + // call, we shouldn't have data loss or regenerate derived files. Therefore, + // if we really encounter any of these situations, then return DataLossError + // to let the caller rebuild index. 
+ if (create_result.data_loss != DataLoss::NONE || + create_result.derived_files_regenerated) { + return absl_ports::DataLossError( + "Unexpected data loss or derived files regenerated for new document " + "store"); + } + + return optimize_result_or; } IcingSearchEngine::IndexRestorationResult @@ -2458,11 +2586,12 @@ IcingSearchEngine::CreateDataIndexingHandlers() { std::vector<std::unique_ptr<DataIndexingHandler>> handlers; // Term index handler - ICING_ASSIGN_OR_RETURN(std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create( - clock_.get(), normalizer_.get(), index_.get())); - handlers.push_back(std::move(string_section_indexing_handler)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + clock_.get(), normalizer_.get(), index_.get(), + options_.build_property_existence_metadata_hits())); + handlers.push_back(std::move(term_indexing_handler)); // Integer index handler ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler> @@ -2471,13 +2600,13 @@ IcingSearchEngine::CreateDataIndexingHandlers() { clock_.get(), integer_index_.get())); handlers.push_back(std::move(integer_section_indexing_handler)); - // Qualified id joinable property index handler - ICING_ASSIGN_OR_RETURN(std::unique_ptr<QualifiedIdJoinIndexingHandler> - qualified_id_joinable_property_indexing_handler, - QualifiedIdJoinIndexingHandler::Create( - clock_.get(), qualified_id_join_index_.get())); - handlers.push_back( - std::move(qualified_id_joinable_property_indexing_handler)); + // Qualified id join index handler + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> + qualified_id_join_indexing_handler, + QualifiedIdJoinIndexingHandler::Create( + clock_.get(), document_store_.get(), qualified_id_join_index_.get())); + handlers.push_back(std::move(qualified_id_join_indexing_handler)); return handlers; } diff --git 
a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 15da142..d316350 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -19,6 +19,7 @@ #include <memory> #include <string> #include <string_view> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -31,7 +32,7 @@ #include "icing/index/numeric/numeric-index.h" #include "icing/jni/jni-cache.h" #include "icing/join/join-children-fetcher.h" -#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/performance-configuration.h" #include "icing/proto/debug.pb.h" @@ -479,7 +480,7 @@ class IcingSearchEngine { ICING_GUARDED_BY(mutex_); // Storage for all join qualified ids from the document store. - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_ + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_ ICING_GUARDED_BY(mutex_); // Pointer to JNI class references @@ -546,10 +547,12 @@ class IcingSearchEngine { // force_recovery_and_revalidate_documents. // // Returns: - // OK on success + // On success, a boolean flag indicating whether derived files of the + // document store have been regenerated or not. If true, any other + // components depending on them should also be rebuilt if true. // FAILED_PRECONDITION if initialize_stats is null // INTERNAL on I/O error - libtextclassifier3::Status InitializeDocumentStore( + libtextclassifier3::StatusOr<bool> InitializeDocumentStore( bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -557,6 +560,9 @@ class IcingSearchEngine { // Do any initialization/recovery necessary to create term index, integer // index, and qualified id join index instances. 
// + // If document_store_derived_files_regenerated is true, then we have to + // rebuild qualified id join index since NamespaceIds were reassigned. + // // Returns: // OK on success // FAILED_PRECONDITION if initialize_stats is null @@ -564,6 +570,7 @@ class IcingSearchEngine { // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL on I/O error libtextclassifier3::Status InitializeIndex( + bool document_store_derived_files_regenerated, InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -603,24 +610,20 @@ class IcingSearchEngine { libtextclassifier3::Status status; SectionRestrictQueryTermsMap query_terms; std::vector<ScoredDocumentHit> scored_document_hits; - int64_t parse_query_latency_ms; - int64_t scoring_latency_ms; explicit QueryScoringResults( libtextclassifier3::Status status_in, SectionRestrictQueryTermsMap&& query_terms_in, - std::vector<ScoredDocumentHit>&& scored_document_hits_in, - int64_t parse_query_latency_ms_in, int64_t scoring_latency_ms_in) + std::vector<ScoredDocumentHit>&& scored_document_hits_in) : status(std::move(status_in)), query_terms(std::move(query_terms_in)), - scored_document_hits(std::move(scored_document_hits_in)), - parse_query_latency_ms(parse_query_latency_ms_in), - scoring_latency_ms(scoring_latency_ms_in) {} + scored_document_hits(std::move(scored_document_hits_in)) {} }; QueryScoringResults ProcessQueryAndScore( const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec, const ResultSpecProto& result_spec, - const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms) + const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms, + QueryStatsProto::SearchStats* search_stats) ICING_SHARED_LOCKS_REQUIRED(mutex_); // Many of the internal components rely on other components' derived data. @@ -664,17 +667,18 @@ class IcingSearchEngine { // would need call Initialize() to reinitialize everything into a valid state. 
// // Returns: - // On success, a vector that maps from old document id to new document id. A - // value of kInvalidDocumentId indicates that the old document id has been - // deleted. + // On success, OptimizeResult which contains a vector mapping from old + // document id to new document id and another vector mapping from old + // namespace id to new namespace id. A value of kInvalidDocumentId indicates + // that the old document id has been deleted. // ABORTED_ERROR if any error happens before the actual optimization, the // original document store should be still available // DATA_LOSS_ERROR on errors that could potentially cause data loss, // document store is still available // INTERNAL_ERROR on any IO errors or other errors that we can't recover // from - libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeDocumentStore( - OptimizeStatsProto* optimize_stats) + libtextclassifier3::StatusOr<DocumentStore::OptimizeResult> + OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Helper method to restore missing document data in index_, integer_index_, diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc index fb44595..18c6bb9 100644 --- a/icing/icing-search-engine_benchmark.cc +++ b/icing/icing-search-engine_benchmark.cc @@ -37,6 +37,7 @@ #include "icing/join/join-processor.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" +#include "icing/proto/persist.pb.h" #include "icing/proto/reset.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" @@ -1116,6 +1117,8 @@ void BM_JoinQueryQualifiedId(benchmark::State& state) { IcingSearchEngineOptions options; options.set_base_dir(test_dir); options.set_index_merge_size(kIcingFullIndexSize); + options.set_document_store_namespace_id_fingerprint(true); + options.set_use_new_qualified_id_join_index(true); std::unique_ptr<IcingSearchEngine> icing = 
std::make_unique<IcingSearchEngine>(options); @@ -1139,7 +1142,7 @@ void BM_JoinQueryQualifiedId(benchmark::State& state) { } // Create Email documents (child) - static constexpr int kNumEmailDocuments = 10000; + static constexpr int kNumEmailDocuments = 1000; std::uniform_int_distribution<> distrib(0, kNumPersonDocuments - 1); std::default_random_engine e(/*seed=*/12345); for (int i = 0; i < kNumEmailDocuments; ++i) { @@ -1199,20 +1202,63 @@ void BM_JoinQueryQualifiedId(benchmark::State& state) { std::reduce(results.results().begin(), results.results().end(), 0, child_count_reduce_func); - // Get all pages. - while (results.next_page_token() != kInvalidNextPageToken) { - results = icing->GetNextPage(results.next_page_token()); - total_parent_count += results.results_size(); - total_child_count += - std::reduce(results.results().begin(), results.results().end(), 0, - child_count_reduce_func); + ASSERT_THAT(total_parent_count, Eq(kNumPerPage)); + ASSERT_THAT(total_child_count, ::testing::Ge(0)); + } +} +BENCHMARK(BM_JoinQueryQualifiedId); + +void BM_PersistToDisk(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + std::default_random_engine random; + int num_types = kAvgNumNamespaces * kAvgNumTypes; + ExactStringPropertyGenerator property_generator; + SchemaGenerator<ExactStringPropertyGenerator> schema_generator( + /*num_properties=*/state.range(1), &property_generator); + SchemaProto schema = schema_generator.GenerateSchema(num_types); + EvenDistributionTypeSelector type_selector(schema); + + // Generate documents. 
+ int num_docs = state.range(0); + std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); + const std::vector<DocumentProto> random_docs = + GenerateRandomDocuments(&type_selector, num_docs, language); + + for (auto _ : state) { + state.PauseTiming(); + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + options.set_use_persistent_hash_map(true); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Reset().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + for (const DocumentProto& doc : random_docs) { + ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); } - ASSERT_THAT(total_parent_count, Eq(kNumPersonDocuments)); - ASSERT_THAT(total_child_count, Eq(kNumEmailDocuments)); + state.ResumeTiming(); + + ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + + state.PauseTiming(); + icing.reset(); + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + state.ResumeTiming(); } } -BENCHMARK(BM_JoinQueryQualifiedId); +BENCHMARK(BM_PersistToDisk) + // Arguments: num_indexed_documents, num_sections + ->ArgPair(1024, 5); } // namespace diff --git a/icing/icing-search-engine_initialization_test.cc b/icing/icing-search-engine_initialization_test.cc index 74cc78f..122e4af 100644 --- a/icing/icing-search-engine_initialization_test.cc +++ b/icing/icing-search-engine_initialization_test.cc @@ -12,30 +12,41 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <algorithm> #include <cstdint> #include <limits> #include <memory> #include <string> +#include <string_view> +#include <tuple> #include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" #include "icing/document-builder.h" +#include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" #include "icing/file/mock-filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" #include "icing/file/version-util.h" #include "icing/icing-search-engine.h" +#include "icing/index/data-indexing-handler.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/index/integer-section-indexing-handler.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index.h" -#include "icing/index/string-section-indexing-handler.h" +#include "icing/index/numeric/numeric-index.h" +#include "icing/index/term-indexing-handler.h" #include "icing/jni/jni-cache.h" -#include "icing/join/doc-join-info.h" #include "icing/join/join-processor.h" +#include "icing/join/qualified-id-join-index-impl-v2.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id-join-indexing-handler.h" -#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/portable/endian.h" @@ -59,8 +70,12 @@ #include "icing/query/query-features.h" #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-associated-score-data.h" #include "icing/store/document-id.h" #include "icing/store/document-log-creator.h" +#include "icing/store/document-store.h" +#include "icing/store/namespace-fingerprint-identifier.h" #include 
"icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" @@ -71,6 +86,7 @@ #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "icing/util/clock.h" #include "icing/util/tokenized-document.h" #include "unicode/uloc.h" @@ -88,6 +104,7 @@ using ::testing::Eq; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Matcher; +using ::testing::Ne; using ::testing::Return; using ::testing::SizeIs; @@ -210,6 +227,8 @@ std::string GetHeaderFilename() { IcingSearchEngineOptions GetDefaultIcingOptions() { IcingSearchEngineOptions icing_options; icing_options.set_base_dir(GetTestBaseDir()); + icing_options.set_document_store_namespace_id_fingerprint(true); + icing_options.set_use_new_qualified_id_join_index(true); return icing_options; } @@ -1039,12 +1058,14 @@ TEST_F(IcingSearchEngineInitializationTest, .SetCreationTimestampMs(kDefaultCreationTimestampMs) .Build(); + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + { // Initializes folder and schema, index one document - TestIcingSearchEngine icing( - GetDefaultIcingOptions(), std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), - GetTestJniCache()); + TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); @@ -1060,13 +1081,16 @@ TEST_F(IcingSearchEngineInitializationTest, // Puts message2 into DocumentStore but doesn't index it. 
ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem(), GetDocumentDir(), &fake_clock, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + filesystem(), GetDocumentDir(), &fake_clock, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/ + icing_options.document_store_namespace_id_fingerprint(), + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); @@ -1100,8 +1124,7 @@ TEST_F(IcingSearchEngineInitializationTest, HasSubstr("/qualified_id_join_index_dir/"))) .Times(0); - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::move(mock_filesystem), + TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem), std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), GetTestJniCache()); InitializeResultProto initialize_result = icing.Initialize(); @@ -1199,6 +1222,222 @@ TEST_F(IcingSearchEngineInitializationTest, expected_join_search_result_proto)); } +TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptedDocumentStore) { + // Test the following scenario: some document store derived files are + // corrupted. IcingSearchEngine should be able to recover the document store, + // and since NamespaceIds were reassigned, we should rebuild qualified id join + // index as well. Several additional behaviors are also tested: + // - Index directory handling: + // - Term index directory should be unaffected. + // - Integer index directory should be unaffected. 
+ // - Should discard the entire qualified id join index directory and start + // it from scratch. + // - Truncate indices: + // - "TruncateTo()" for term index shouldn't take effect. + // - "Clear()" shouldn't be called for integer index, i.e. no integer index + // storage sub directories (path_expr = "*/integer_index_dir/*") should be + // discarded. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + // - Still, we need to replay and reindex documents (for qualified id join + // index). + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto personDummy = + DocumentBuilder() + .SetKey("namespace2", "personDummy") + .SetSchema("Person") + .AddStringProperty("name", "personDummy") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto person1 = + DocumentBuilder() + .SetKey("namespace1", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto person2 = + DocumentBuilder() + .SetKey("namespace2", "person") + .SetSchema("Person") + 
.AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace2", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace2#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + + { + // Initializes folder and schema, index one document + TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + // "namespace2" (in personDummy) will be assigned NamespaceId = 0. + EXPECT_THAT(icing.Put(personDummy).status(), ProtoIsOk()); + // "namespace1" (in person1) will be assigned NamespaceId = 1. + EXPECT_THAT(icing.Put(person1).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person2).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + + // Now delete personDummy. + EXPECT_THAT( + icing.Delete(personDummy.namespace_(), personDummy.uri()).status(), + ProtoIsOk()); + } // This should shut down IcingSearchEngine and persist anything it needs to + + { + FakeClock fake_clock; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock)); + + // Manually corrupt one of the derived files of DocumentStore without + // updating checksum in DocumentStore header. 
+ std::string score_cache_filename = GetDocumentDir() + "/score_cache"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> + score_cache, + FileBackedVector<DocumentAssociatedScoreData>::Create( + *filesystem(), std::move(score_cache_filename), + MemoryMappedFile::READ_WRITE_AUTO_SYNC)); + ICING_ASSERT_OK_AND_ASSIGN(const DocumentAssociatedScoreData* score_data, + score_cache->Get(/*idx=*/0)); + ICING_ASSERT_OK(score_cache->Set( + /*idx=*/0, + DocumentAssociatedScoreData(score_data->corpus_id(), + score_data->document_score() + 1, + score_data->creation_timestamp_ms(), + score_data->length_in_tokens()))); + ICING_ASSERT_OK(score_cache->PersistToDisk()); + } + + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should never be discarded, and Clear() + // should never be called (i.e. storage sub directory + // "*/integer_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should be discarded once, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(1); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); + + TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + EXPECT_THAT(initialize_result.status(), ProtoIsOk()); + // DocumentStore should be recovered. When reassigning NamespaceId, the order + // will be the document traversal order: [person1, person2, message]. + // Therefore, "namespace1" will have id = 0 and "namespace2" will have id = 1. + EXPECT_THAT( + initialize_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + // Term, integer index should be unaffected. + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + // Qualified id join index should be rebuilt. + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::DEPENDENCIES_CHANGED)); + + // Verify join search: join a query for `name:person` with a child query for + // `body:message` based on the child's `senderQualifiedId` field. message2 + // should be joined to person2 correctly. 
+ SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("name:person"); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + ResultSpecProto result_spec = ResultSpecProto::default_instance(); + result_spec.set_max_joined_children_per_parent_to_return( + std::numeric_limits<int32_t>::max()); + + SearchResultProto expected_join_search_result_proto; + expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_join_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person2; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message; + + *expected_join_search_result_proto.mutable_results() + ->Add() + ->mutable_document() = person1; + + SearchResultProto search_result_proto = icing.Search( + search_spec, ScoringSpecProto::default_instance(), result_spec); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); +} + TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { // Test the following scenario: term index is corrupted (e.g. checksum doesn't // match). IcingSearchEngine should be able to recover term index. 
Several @@ -1493,6 +1732,108 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { } TEST_F(IcingSearchEngineInitializationTest, + RecoverFromIntegerIndexBucketSplitThresholdChange) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddInt64Property("indexableInteger", 123) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with a message document. + { + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + // 2. Create the index again with different + // integer_index_bucket_split_threshold. This should trigger index + // restoration. + { + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should be discarded once, and Clear() + // should never be called (i.e. storage sub directory + // "*/integer_index_dir/*" should never be discarded) since we start it from + // scratch. 
+ EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(1); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); + + static constexpr int32_t kNewIntegerIndexBucketSplitThreshold = 1000; + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + ASSERT_THAT(kNewIntegerIndexBucketSplitThreshold, + Ne(options.integer_index_bucket_split_threshold())); + options.set_integer_index_bucket_split_threshold( + kNewIntegerIndexBucketSplitThreshold); + + TestIcingSearchEngine icing(options, std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Verify integer index works normally + SearchSpecProto search_spec; + search_spec.set_query("indexableInteger == 123"); + search_spec.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec.add_enabled_features(std::string(kNumericSearchFeature)); + + SearchResultProto results = + 
icing.Search(search_spec, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document().uri(), Eq("message/1")); + } +} + +TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptQualifiedIdJoinIndex) { // Test the following scenario: qualified id join index is corrupted (e.g. // checksum doesn't match). IcingSearchEngine should be able to recover @@ -1749,7 +2090,9 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Index> index, Index::Create(Index::Options(GetIndexDir(), - /*index_merge_size=*/100), + /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK(index->PersistToDisk()); } @@ -2359,7 +2702,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -2491,7 +2836,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, @@ -2603,7 +2950,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + 
/*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -2740,7 +3089,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, @@ -2800,7 +3151,9 @@ TEST_F(IcingSearchEngineInitializationTest, Index::Create( // index merge size is not important here because we will manually // invoke merge below. - Index::Options(GetIndexDir(), /*index_merge_size=*/100), + Index::Options(GetIndexDir(), /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); // Add hits for document 0 and merge. 
ASSERT_THAT(index->last_added_document_id(), kInvalidDocumentId); @@ -2876,7 +3229,9 @@ TEST_F(IcingSearchEngineInitializationTest, { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Index> index, - Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100), + Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, @@ -2992,7 +3347,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); // Add hits for document 4 and merge. DocumentId original_last_added_doc_id = index->last_added_document_id(); @@ -3135,7 +3492,9 @@ TEST_F(IcingSearchEngineInitializationTest, { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Index> index, - Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100), + Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, @@ -3197,6 +3556,7 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem, GetIntegerIndexDir(), + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); // Add hits for document 0. 
ASSERT_THAT(integer_index->last_added_document_id(), kInvalidDocumentId); @@ -3376,6 +3736,7 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem, GetIntegerIndexDir(), + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); // Add hits for document 4. DocumentId original_last_added_doc_id = @@ -3571,17 +3932,19 @@ TEST_F(IcingSearchEngineInitializationTest, { Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, - QualifiedIdTypeJoinableIndex::Create( - filesystem, GetQualifiedIdJoinIndexDir(), /*pre_mapping_fbv=*/false, - /*use_persistent_hash_map=*/false)); + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index, + QualifiedIdJoinIndexImplV2::Create(filesystem, + GetQualifiedIdJoinIndexDir(), + /*pre_mapping_fbv=*/false)); // Add data for document 0. ASSERT_THAT(qualified_id_join_index->last_added_document_id(), kInvalidDocumentId); qualified_id_join_index->set_last_added_document_id(0); ICING_ASSERT_OK(qualified_id_join_index->Put( - DocJoinInfo(/*document_id=*/0, /*joinable_property_id=*/0), - /*ref_qualified_id_str=*/"namespace#person")); + /*schema_type_id=*/0, /*joinable_property_id=*/0, /*document_id=*/0, + /*ref_namespace_fingerprint_ids=*/ + {NamespaceFingerprintIdentifier(/*namespace_id=*/0, + /*target_str=*/"uri")})); } // 3. Create the index again. This should trigger index restoration. 
@@ -3641,13 +4004,15 @@ TEST_F(IcingSearchEngineInitializationTest, { Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, - QualifiedIdTypeJoinableIndex::Create( - filesystem, GetQualifiedIdJoinIndexDir(), /*pre_mapping_fbv=*/false, - /*use_persistent_hash_map=*/false)); - EXPECT_THAT(qualified_id_join_index->Get( - DocJoinInfo(/*document_id=*/0, /*joinable_property_id=*/0)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index, + QualifiedIdJoinIndexImplV2::Create(filesystem, + GetQualifiedIdJoinIndexDir(), + /*pre_mapping_fbv=*/false)); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, qualified_id_join_index->GetIterator( + /*schema_type_id=*/0, /*joinable_property_id=*/0)); + EXPECT_THAT(iterator->Advance(), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } } @@ -3731,7 +4096,6 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } - DocJoinInfo additional_data_key; // 2. Manually add some data into qualified id join index and increment // last_added_document_id. This will cause mismatched document id with // document store. @@ -3742,21 +4106,21 @@ TEST_F(IcingSearchEngineInitializationTest, { Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, - QualifiedIdTypeJoinableIndex::Create( - filesystem, GetQualifiedIdJoinIndexDir(), /*pre_mapping_fbv=*/false, - /*use_persistent_hash_map=*/false)); + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index, + QualifiedIdJoinIndexImplV2::Create(filesystem, + GetQualifiedIdJoinIndexDir(), + /*pre_mapping_fbv=*/false)); // Add data for document 4. 
DocumentId original_last_added_doc_id = qualified_id_join_index->last_added_document_id(); qualified_id_join_index->set_last_added_document_id( original_last_added_doc_id + 1); - additional_data_key = - DocJoinInfo(/*document_id=*/original_last_added_doc_id + 1, - /*joinable_property_id=*/0); ICING_ASSERT_OK(qualified_id_join_index->Put( - additional_data_key, - /*ref_qualified_id_str=*/"namespace#person")); + /*schema_type_id=*/1, /*joinable_property_id=*/0, + /*document_id=*/original_last_added_doc_id + 1, + /*ref_namespace_fingerprint_ids=*/ + {NamespaceFingerprintIdentifier(/*namespace_id=*/0, + /*target_str=*/"person")})); } // 3. Create the index again. This should trigger index restoration. @@ -3839,10 +4203,9 @@ TEST_F(IcingSearchEngineInitializationTest, // `name:person` with a child query for `body:consectetur` based on the // child's `senderQualifiedId` field. - // Add document 4 without "senderQualifiedId". If joinable index is not - // rebuilt correctly, then it will still have the previously added - // senderQualifiedId for document 4 and include document 4 incorrectly in - // the right side. + // Add document 4 without "senderQualifiedId". If join index is not rebuilt + // correctly, then it will still have the previously added senderQualifiedId + // for document 4 and include document 4 incorrectly in the right side. DocumentProto another_message = DocumentBuilder() .SetKey("namespace", "message/4") @@ -4165,9 +4528,12 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result_proto.initialize_stats().document_store_data_status(), Eq(InitializeStatsProto::PARTIAL_LOSS)); - // Since document store rewinds to previous checkpoint, last stored doc id - // will be consistent with last added document ids in term/integer indices, - // so there will be no index restoration. + // Document store rewinds to previous checkpoint and all derived files were + // regenerated. 
+ // - Last stored doc id will be consistent with last added document ids in + // term/integer indices, so there will be no index restoration. + // - Qualified id join index depends on document store derived files and + // since they were regenerated, we should rebuild qualified id join index. EXPECT_THAT( initialize_result_proto.initialize_stats().index_restoration_cause(), Eq(InitializeStatsProto::NONE)); @@ -4176,10 +4542,10 @@ TEST_F(IcingSearchEngineInitializationTest, Eq(InitializeStatsProto::NONE)); EXPECT_THAT(initialize_result_proto.initialize_stats() .qualified_id_join_index_restoration_cause(), - Eq(InitializeStatsProto::NONE)); + Eq(InitializeStatsProto::DEPENDENCIES_CHANGED)); EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), - Eq(0)); + Eq(10)); EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_cause(), Eq(InitializeStatsProto::NONE)); @@ -4307,7 +4673,9 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Index> index, Index::Create(Index::Options(GetIndexDir(), - /*index_merge_size=*/100), + /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK(index->PersistToDisk()); } @@ -4829,7 +5197,7 @@ TEST_F(IcingSearchEngineInitializationTest, auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, PRead(A<const char*>(), _, _, _)) .WillRepeatedly(DoDefault()); - // This fails QualifiedIdTypeJoinableIndex::Create() once. + // This fails QualifiedIdJoinIndexImplV2::Create() once. 
EXPECT_CALL( *mock_filesystem, PRead(Matcher<const char*>(Eq(qualified_id_join_index_metadata_file)), _, @@ -4929,10 +5297,10 @@ TEST_F(IcingSearchEngineInitializationTest, Eq(InitializeStatsProto::NONE)); EXPECT_THAT(initialize_result_proto.initialize_stats() .qualified_id_join_index_restoration_cause(), - Eq(InitializeStatsProto::NONE)); + Eq(InitializeStatsProto::DEPENDENCIES_CHANGED)); EXPECT_THAT( initialize_result_proto.initialize_stats().index_restoration_latency_ms(), - Eq(0)); + Eq(10)); EXPECT_THAT( initialize_result_proto.initialize_stats().schema_store_recovery_cause(), Eq(InitializeStatsProto::NONE)); @@ -5035,6 +5403,169 @@ TEST_F(IcingSearchEngineInitializationTest, } } +// TODO(b/275121148): deprecate this test after rollout join index v2. +class IcingSearchEngineInitializationSwitchJoinIndexTest + : public IcingSearchEngineInitializationTest, + public ::testing::WithParamInterface<bool> {}; +TEST_P(IcingSearchEngineInitializationSwitchJoinIndexTest, SwitchJoinIndex) { + bool use_join_index_v2 = GetParam(); + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + 
.SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with message 3 documents. + { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_document_store_namespace_id_fingerprint(true); + options.set_use_new_qualified_id_join_index(use_join_index_v2); + + TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + // 2. Create the index again changing join index version. This should trigger + // join index restoration. + { + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should never be discarded, and Clear() + // should never be called (i.e. storage sub directory + // "*/integer_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should be discarded once, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(1); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); + + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_document_store_namespace_id_fingerprint(true); + options.set_use_new_qualified_id_join_index(!use_join_index_v2); + + TestIcingSearchEngine icing(options, std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("name:person"); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + ResultSpecProto result_spec = ResultSpecProto::default_instance(); + result_spec.set_max_joined_children_per_parent_to_return( + std::numeric_limits<int32_t>::max()); + + SearchResultProto results = icing.Search( + search_spec, ScoringSpecProto::default_instance(), result_spec); + ASSERT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results.results(0).joined_results(2).document().uri(), + Eq("message/1")); + } +} + +INSTANTIATE_TEST_SUITE_P(IcingSearchEngineInitializationSwitchJoinIndexTest, + IcingSearchEngineInitializationSwitchJoinIndexTest, + testing::Values(true, false)); + class IcingSearchEngineInitializationVersionChangeTest : public IcingSearchEngineInitializationTest, public ::testing::WithParamInterface<version_util::VersionInfo> {}; @@ -5094,12 +5625,14 @@ 
TEST_P(IcingSearchEngineInitializationVersionChangeTest, .SetCreationTimestampMs(kDefaultCreationTimestampMs) .Build(); + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + { // Initializes folder and schema, index person1 and person2 - TestIcingSearchEngine icing( - GetDefaultIcingOptions(), std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), - GetTestJniCache()); + TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); EXPECT_THAT(icing.Put(person1).status(), ProtoIsOk()); @@ -5123,19 +5656,24 @@ TEST_P(IcingSearchEngineInitializationVersionChangeTest, // Put message into DocumentStore ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem(), GetDocumentDir(), &fake_clock, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + filesystem(), GetDocumentDir(), &fake_clock, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/ + icing_options.document_store_namespace_id_fingerprint(), + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, document_store->Put(message)); // Index doc_id with incorrect data - Index::Options options(GetIndexDir(), /*index_merge_size=*/1024 * 1024); + Index::Options options(GetIndexDir(), 
/*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Index> index, Index::Create(options, filesystem(), icing_filesystem())); @@ -5143,33 +5681,33 @@ TEST_P(IcingSearchEngineInitializationVersionChangeTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(*filesystem(), GetIntegerIndexDir(), + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, - QualifiedIdTypeJoinableIndex::Create( - *filesystem(), GetQualifiedIdJoinIndexDir(), - /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/false)); + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index, + QualifiedIdJoinIndexImplV2::Create(*filesystem(), + GetQualifiedIdJoinIndexDir(), + /*pre_mapping_fbv=*/false)); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create(&fake_clock, normalizer_.get(), - index.get())); + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + &fake_clock, normalizer_.get(), index.get(), + /*build_property_existence_metadata_hits=*/true)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler> integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock, integer_index.get())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> - qualified_id_joinable_property_indexing_handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock, - qualified_id_join_index.get())); + qualified_id_join_indexing_handler, + QualifiedIdJoinIndexingHandler::Create( + &fake_clock, document_store.get(), qualified_id_join_index.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - 
handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(term_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); - handlers.push_back( - std::move(qualified_id_joinable_property_indexing_handler)); + handlers.push_back(std::move(qualified_id_join_indexing_handler)); IndexProcessor index_processor(std::move(handlers), &fake_clock); DocumentProto incorrect_message = @@ -5302,12 +5840,8 @@ INSTANTIATE_TEST_SUITE_P( /*version_in=*/version_util::kVersion + 1, /*max_version_in=*/version_util::kVersion + 1), - // Manually change existing data set's version to kVersion - 1 and - // max_version to kVersion - 1. When initializing, it will detect - // "upgrade". - version_util::VersionInfo( - /*version_in=*/version_util::kVersion - 1, - /*max_version_in=*/version_util::kVersion - 1), + // Currently we don't have any "upgrade" that requires rebuild derived + // files, so skip this case until we have a case for it. // Manually change existing data set's version to kVersion - 1 and // max_version to kVersion. 
When initializing, it will detect "roll @@ -5334,6 +5868,163 @@ INSTANTIATE_TEST_SUITE_P( /*version_in=*/0, /*max_version_in=*/version_util::kVersion))); +class IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest + : public IcingSearchEngineInitializationTest, + public ::testing::WithParamInterface<std::tuple<bool, bool>> {}; +TEST_P(IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest, + ChangePropertyExistenceHitsFlagTest) { + bool before_build_property_existence_metadata_hits = std::get<0>(GetParam()); + bool after_build_property_existence_metadata_hits = std::get<1>(GetParam()); + bool flag_changed = before_build_property_existence_metadata_hits != + after_build_property_existence_metadata_hits; + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Value") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("timestamp") + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("score") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Create a document with every property. + DocumentProto document0 = DocumentBuilder() + .SetKey("icing", "uri0") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddStringProperty("body", "foo") + .AddInt64Property("timestamp", 123) + .AddDoubleProperty("score", 456.789) + .Build(); + // Create a document with missing body. + DocumentProto document1 = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddInt64Property("timestamp", 123) + .AddDoubleProperty("score", 456.789) + .Build(); + // Create a document with missing timestamp. 
+ DocumentProto document2 = DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddStringProperty("body", "foo") + .AddDoubleProperty("score", 456.789) + .Build(); + + // 1. Create an index with the 3 documents. + { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_build_property_existence_metadata_hits( + before_build_property_existence_metadata_hits); + TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document0).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + } + + // 2. Create the index again with + // after_build_property_existence_metadata_hits. + // + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure that the term index is rebuilt if the flag is changed. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(flag_changed ? 1 : 0); + + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_build_property_existence_metadata_hits( + after_build_property_existence_metadata_hits); + TestIcingSearchEngine icing(options, std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + // Ensure that the term index is rebuilt if the flag is changed. 
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(flag_changed ? InitializeStatsProto::IO_ERROR + : InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Get all documents that have "body". + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature)); + search_spec.add_enabled_features( + std::string(kListFilterQueryLanguageFeature)); + search_spec.set_query("hasProperty(\"body\")"); + SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + if (after_build_property_existence_metadata_hits) { + EXPECT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document2)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document0)); + } else { + EXPECT_THAT(results.results(), IsEmpty()); + } + + // Get all documents that have "timestamp". + search_spec.set_query("hasProperty(\"timestamp\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + if (after_build_property_existence_metadata_hits) { + EXPECT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document1)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document0)); + } else { + EXPECT_THAT(results.results(), IsEmpty()); + } + + // Get all documents that have "score". 
+ search_spec.set_query("hasProperty(\"score\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + if (after_build_property_existence_metadata_hits) { + EXPECT_THAT(results.results(), SizeIs(3)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document2)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document1)); + EXPECT_THAT(results.results(2).document(), EqualsProto(document0)); + } else { + EXPECT_THAT(results.results(), IsEmpty()); + } +} + +INSTANTIATE_TEST_SUITE_P( + IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest, + IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest, + testing::Values(std::make_tuple(false, false), std::make_tuple(false, true), + std::make_tuple(true, false), std::make_tuple(true, true))); + } // namespace } // namespace lib } // namespace icing diff --git a/icing/icing-search-engine_optimize_test.cc b/icing/icing-search-engine_optimize_test.cc index 3127171..61b594c 100644 --- a/icing/icing-search-engine_optimize_test.cc +++ b/icing/icing-search-engine_optimize_test.cc @@ -1552,6 +1552,8 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeThresholdTest) { expected.set_num_original_documents(3); expected.set_num_deleted_documents(1); expected.set_num_expired_documents(1); + expected.set_num_original_namespaces(1); + expected.set_num_deleted_namespaces(0); expected.set_index_restoration_mode(OptimizeStatsProto::INDEX_TRANSLATION); // Run Optimize @@ -1584,6 +1586,8 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeThresholdTest) { expected.set_num_original_documents(1); expected.set_num_deleted_documents(0); expected.set_num_expired_documents(0); + expected.set_num_original_namespaces(1); + expected.set_num_deleted_namespaces(0); expected.set_time_since_last_optimize_ms(10000); expected.set_index_restoration_mode(OptimizeStatsProto::INDEX_TRANSLATION); @@ -1606,6 +1610,8 @@ 
TEST_F(IcingSearchEngineOptimizeTest, OptimizeThresholdTest) { expected.set_num_original_documents(1); expected.set_num_deleted_documents(1); expected.set_num_expired_documents(0); + expected.set_num_original_namespaces(1); + expected.set_num_deleted_namespaces(1); expected.set_time_since_last_optimize_ms(0); // Should rebuild the index since all documents are removed. expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); @@ -1689,6 +1695,8 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { expected.set_num_original_documents(3); expected.set_num_deleted_documents(1); expected.set_num_expired_documents(1); + expected.set_num_original_namespaces(1); + expected.set_num_deleted_namespaces(0); expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); // Run Optimize @@ -1723,6 +1731,8 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { expected.set_num_original_documents(1); expected.set_num_deleted_documents(0); expected.set_num_expired_documents(0); + expected.set_num_original_namespaces(1); + expected.set_num_deleted_namespaces(0); expected.set_time_since_last_optimize_ms(10000); expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); @@ -1745,6 +1755,8 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { expected.set_num_original_documents(1); expected.set_num_deleted_documents(1); expected.set_num_expired_documents(0); + expected.set_num_original_namespaces(1); + expected.set_num_deleted_namespaces(1); expected.set_time_since_last_optimize_ms(0); expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); diff --git a/icing/icing-search-engine_schema_test.cc b/icing/icing-search-engine_schema_test.cc index 2609cce..49c024e 100644 --- a/icing/icing-search-engine_schema_test.cc +++ b/icing/icing-search-engine_schema_test.cc @@ -18,7 +18,6 @@ #include <string> #include <utility> -#include "icing/text_classifier/lib3/utils/base/status.h" 
#include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/document-builder.h" @@ -27,14 +26,12 @@ #include "icing/icing-search-engine.h" #include "icing/jni/jni-cache.h" #include "icing/join/join-processor.h" -#include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/initialize.pb.h" -#include "icing/proto/logging.pb.h" #include "icing/proto/optimize.pb.h" #include "icing/proto/persist.pb.h" #include "icing/proto/reset.pb.h" @@ -47,6 +44,7 @@ #include "icing/proto/usage.pb.h" #include "icing/query/query-features.h" #include "icing/schema-builder.h" +#include "icing/schema/section.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" @@ -117,6 +115,8 @@ std::string GetSchemaDir() { return GetTestBaseDir() + "/schema_dir"; } IcingSearchEngineOptions GetDefaultIcingOptions() { IcingSearchEngineOptions icing_options; icing_options.set_base_dir(GetTestBaseDir()); + icing_options.set_document_store_namespace_id_fingerprint(true); + icing_options.set_use_new_qualified_id_join_index(true); return icing_options; } @@ -888,6 +888,256 @@ TEST_F(IcingSearchEngineSchemaTest, expected_search_result_proto)); } +TEST_F( + IcingSearchEngineSchemaTest, + SetSchemaNewIndexedDocumentPropertyTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Create a schema with a nested document type: + // + // Section id assignment for 'Person': + // - "age": integer type, indexed. Section id = 0 + // - "name": string type, indexed. Section id = 1. + // - "worksFor.name": string type, (nested) indexed. Section id = 2. 
+ // + // Joinable property id assignment for 'Person': + // - "worksFor.listRef": string type, Qualified Id type joinable. Joinable + // property id = 0. + SchemaProto schema_one = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("List").AddProperty( + PropertyConfigBuilder() + .SetName("title") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("age") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("worksFor") + .SetDataTypeDocument( + "Organization", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Organization") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("listRef") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema_one).status(), ProtoIsOk()); + + DocumentProto list_document = DocumentBuilder() + .SetKey("namespace", "list/1") + .SetSchema("List") + .SetCreationTimestampMs(1000) + .AddStringProperty("title", "title") + .Build(); + DocumentProto person_document = + DocumentBuilder() + .SetKey("namespace", "person/2") + .SetSchema("Person") + .SetCreationTimestampMs(1000) + .AddStringProperty("name", "John") + .AddInt64Property("age", 20) + .AddDocumentProperty( + "worksFor", DocumentBuilder() + .SetKey("namespace", "org/1") + .SetSchema("Organization") + .AddStringProperty("name", "Google") + 
.AddStringProperty("listRef", "namespace#list/1") + .Build()) + .Build(); + EXPECT_THAT(icing.Put(list_document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person_document).status(), ProtoIsOk()); + + ResultSpecProto result_spec = ResultSpecProto::default_instance(); + result_spec.set_max_joined_children_per_parent_to_return( + std::numeric_limits<int32_t>::max()); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + person_document; + + SearchResultProto empty_result; + empty_result.mutable_status()->set_code(StatusProto::OK); + + // Verify term search + SearchSpecProto search_spec1; + search_spec1.set_query("worksFor.name:Google"); + search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto actual_results = + icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + // Verify numeric (integer) search + SearchSpecProto search_spec2; + search_spec2.set_query("age == 20"); + search_spec2.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec2.add_enabled_features(std::string(kNumericSearchFeature)); + + actual_results = + icing.Search(search_spec2, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + // Verify join search: join a query for `title:title` (which will get + // list_document) with a child query for `name:John` (which will get + // person_document) based on the child's `worksFor.listRef` field. 
+ SearchSpecProto search_spec_with_join; + search_spec_with_join.set_query("title:title"); + search_spec_with_join.set_term_match_type(TermMatchType::EXACT_ONLY); + JoinSpecProto* join_spec = search_spec_with_join.mutable_join_spec(); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("worksFor.listRef"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("name:John"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = result_spec; + + SearchResultProto expected_join_search_result_proto; + expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_join_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = list_document; + *result_proto->mutable_joined_results()->Add()->mutable_document() = + person_document; + + actual_results = + icing.Search(search_spec_with_join, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); + + // Change the schema to add another nested document property to 'Person' + // + // New section id assignment for 'Person': + // - "age": integer type, indexed. Section id = 0 + // - "almaMater.name", string type, indexed. Section id = 1 + // - "name": string type, indexed. Section id = 2 + // - "worksFor.name": string type, (nested) indexed. Section id = 3 + // + // New joinable property id assignment for 'Person': + // - "almaMater.listRef": string type, Qualified Id type joinable. 
Joinable + // property id = 0. + // - "worksFor.listRef": string type, Qualified Id type joinable. Joinable + // property id = 1. + SchemaProto schema_two = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("List").AddProperty( + PropertyConfigBuilder() + .SetName("title") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("age") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("worksFor") + .SetDataTypeDocument( + "Organization", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("almaMater") + .SetDataTypeDocument( + "Organization", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Organization") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("listRef") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + // This schema change is compatible since the added 'almaMater' property has + // CARDINALITY_OPTIONAL. + // + // Index restoration should be triggered here because new schema requires more + // properties to be indexed. Also new section ids will be reassigned and index + // restoration should use new section ids to rebuild. + SetSchemaResultProto set_schema_result = icing.SetSchema(schema_two); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Person"); + expected_set_schema_result.mutable_join_incompatible_changed_schema_types() + ->Add("Person"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Verify term search: + // Searching for "worksFor.name:Google" should still match document + actual_results = + icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + // In new_schema the 'name' property is now indexed at section id 2. If + // searching for "name:Google" matched the document, this means that index + // rebuild was not triggered and Icing is still searching the old index, where + // 'worksFor.name' was indexed at section id 2. 
+ search_spec1.set_query("name:Google"); + actual_results = + icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // Verify numeric (integer) search: should still match document + actual_results = + icing.Search(search_spec2, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + // Verify join search: should still able to join by `worksFor.listRef` + actual_results = + icing.Search(search_spec_with_join, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); +} + TEST_F(IcingSearchEngineSchemaTest, SetSchemaChangeNestedPropertiesTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -1081,6 +1331,281 @@ TEST_F(IcingSearchEngineSchemaTest, EqualsSearchResultIgnoreStatsAndScores(empty_result)); } +TEST_F( + IcingSearchEngineSchemaTest, + SetSchemaChangeNestedPropertiesListTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SchemaTypeConfigProto person_proto = + SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("lastName") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("address") + .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("age") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + 
.SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("birthday") + .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + // Create a schema with nested properties: + // - "sender.address": string type, (nested) non-indexable. Section id = 0. + // - "sender.age": int64 type, (nested) indexed. Section id = 1. + // - "sender.birthday": int64 type, (nested) non-indexable. Section id = 2. + // - "sender.lastName": int64 type, (nested) indexed. Section id = 3. + // - "sender.name": string type, (nested) indexed. Section id = 4. + // - "subject": string type, indexed. Section id = 5. + // - "timestamp": int64 type, indexed. Section id = 6. + // - "sender.foo": unknown type, (nested) non-indexable. Section id = 7. + // + // "sender.address" and "sender.birthday" are assigned a section id because + // they are listed in the indexable_nested_properties_list for 'Email.sender'. + // They are assigned a sectionId but are not indexed since their indexing + // configs are non-indexable. + // + // "sender.foo" is also assigned a section id, but is also not undefined by + // the schema definition. Trying to index a document with this nested property + // should fail. 
+ SchemaProto nested_schema = + SchemaBuilder() + .AddType(person_proto) + .AddType( + SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", /*indexable_nested_properties_list=*/ + {"age", "lastName", "address", "name", "birthday", + "foo"}) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("timestamp") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Bill") + .AddStringProperty("lastName", "Lundbergh") + .AddStringProperty("address", "1600 Amphitheatre Pkwy") + .AddInt64Property("age", 20) + .AddInt64Property("birthday", 20) + .Build()) + .AddInt64Property("timestamp", 1234) + .Build(); + + // Indexing this doc should fail, since the 'sender.foo' property is not found + DocumentProto invalid_document = + DocumentBuilder() + .SetKey("namespace2", "uri1") + .SetSchema("Email") + 
.SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Bill") + .AddStringProperty("lastName", "Lundbergh") + .AddStringProperty("address", "1600 Amphitheatre Pkwy") + .AddInt64Property("age", 20) + .AddInt64Property("birthday", 20) + .AddBytesProperty("foo", "bar bytes") + .Build()) + .AddInt64Property("timestamp", 1234) + .Build(); + + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(invalid_document).status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto empty_result; + empty_result.mutable_status()->set_code(StatusProto::OK); + + // Verify term search + // document should match a query for 'Bill' in 'sender.name', but not in + // 'sender.lastName' + SearchSpecProto search_spec1; + search_spec1.set_query("sender.name:Bill"); + search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto actual_results = + icing.Search(search_spec1, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + search_spec1.set_query("sender.lastName:Bill"); + actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // document should match a query for 'Lundber' in 'sender.lastName', but not + // in 'sender.name'. 
+ SearchSpecProto search_spec2; + search_spec2.set_query("sender.lastName:Lundber"); + search_spec2.set_term_match_type(TermMatchType::PREFIX); + + actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + search_spec2.set_query("sender.name:Lundber"); + actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // document should not match a query for 'Amphitheatre' because the + // 'sender.address' field is not indexed. + search_spec2.set_query("Amphitheatre"); + search_spec2.set_term_match_type(TermMatchType::PREFIX); + + actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // Verify numeric (integer) search + // document should match a query for 20 in 'sender.age', but not in + // 'timestamp' or 'sender.birthday' + SearchSpecProto search_spec3; + search_spec3.set_query("sender.age == 20"); + search_spec3.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec3.add_enabled_features(std::string(kNumericSearchFeature)); + + actual_results = icing.Search(search_spec3, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + search_spec3.set_query("timestamp == 20"); + actual_results = icing.Search(search_spec3, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + search_spec3.set_query("birthday == 20"); + actual_results = icing.Search(search_spec3, 
GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // Now update the schema and don't index "sender.name", "sender.birthday" and + // "sender.foo". + // This should reassign section ids, lead to an index rebuild and ensure that + // nothing match a query for "Bill". + // + // Section id assignment: + // - "sender.address": string type, (nested) non-indexable. Section id = 0. + // - "sender.age": int64 type, (nested) indexed. Section id = 1. + // - "sender.birthday": int64 type, (nested) unindexed. No section id. + // - "sender.lastName": int64 type, (nested) indexed. Section id = 2. + // - "sender.name": string type, (nested) unindexed. No section id. + // - "subject": string type, indexed. Section id = 3. + // - "timestamp": int64 type, indexed. Section id = 4. + // - "sender.foo": unknown type, invalid. No section id. + SchemaProto nested_schema_with_less_props = + SchemaBuilder() + .AddType(person_proto) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", /*indexable_nested_properties=*/ + {"age", "lastName", "address"}) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("timestamp") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema(nested_schema_with_less_props); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Verify term search + // document shouldn't match a query for 'Bill' in either 'sender.name' or + // 'subject' + search_spec1.set_query("sender.name:Bill"); + actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + search_spec1.set_query("subject:Bill"); + actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); +} + TEST_F(IcingSearchEngineSchemaTest, SetSchemaNewJoinablePropertyTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -1614,8 +2139,8 @@ TEST_F(IcingSearchEngineSchemaTest, // - "senderQualifiedId": qualified id joinable. Joinable property id = 0. // // If the index is not correctly rebuilt, then the joinable data of - // "senderQualifiedId" in the joinable index will still have old joinable - // property id of 1 and therefore won't take effect for join search query. + // "senderQualifiedId" in the join index will still have old joinable property + // id of 1 and therefore won't take effect for join search query. SchemaProto email_without_receiver_schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( @@ -1917,8 +2442,8 @@ TEST_F( // - "zQualifiedId": qualified id joinable. Joinable property id = 1. 
// // If the index is not correctly rebuilt, then the joinable data of - // "senderQualifiedId" in the joinable index will still have old joinable - // property id of 1 and therefore won't take effect for join search query. + // "senderQualifiedId" in the join index will still have old joinable property + // id of 1 and therefore won't take effect for join search query. SchemaProto email_no_body_schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( @@ -2609,6 +3134,26 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) { EqualsSearchResultIgnoreStatsAndScores(expected_no_documents)); } +TEST_F(IcingSearchEngineSchemaTest, IcingShouldReturnErrorForExtraSections) { + // Create a schema with more sections than allowed. + SchemaTypeConfigBuilder schema_type_config_builder = + SchemaTypeConfigBuilder().SetType("type"); + for (int i = 0; i <= kMaxSectionId + 1; ++i) { + schema_type_config_builder.AddProperty( + PropertyConfigBuilder() + .SetName("prop" + std::to_string(i)) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)); + } + SchemaProto schema = + SchemaBuilder().AddType(schema_type_config_builder).Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status().message(), + HasSubstr("Too many properties to be indexed")); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc index f1b49fb..21512c6 100644 --- a/icing/icing-search-engine_search_test.cc +++ b/icing/icing-search-engine_search_test.cc @@ -24,6 +24,7 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/icing-search-engine.h" +#include "icing/index/lite/term-id-hit-pair.h" #include "icing/jni/jni-cache.h" #include "icing/join/join-processor.h" 
#include "icing/portable/endian.h" @@ -45,6 +46,7 @@ #include "icing/proto/term.pb.h" #include "icing/proto/usage.pb.h" #include "icing/query/query-features.h" +#include "icing/result/result-state-manager.h" #include "icing/schema-builder.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" @@ -60,10 +62,12 @@ namespace lib { namespace { using ::icing::lib::portable_equals_proto::EqualsProto; +using ::testing::DoubleEq; using ::testing::ElementsAre; using ::testing::Eq; using ::testing::Gt; using ::testing::IsEmpty; +using ::testing::Lt; using ::testing::Ne; using ::testing::SizeIs; @@ -119,6 +123,8 @@ constexpr int64_t kDefaultCreationTimestampMs = 1575492852000; IcingSearchEngineOptions GetDefaultIcingOptions() { IcingSearchEngineOptions icing_options; icing_options.set_base_dir(GetTestBaseDir()); + icing_options.set_document_store_namespace_id_fingerprint(true); + icing_options.set_use_new_qualified_id_join_index(true); return icing_options; } @@ -393,14 +399,39 @@ TEST_P(IcingSearchEngineSearchTest, SearchReturnsOneResult) { EXPECT_THAT(search_result_proto.status(), ProtoIsOk()); EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(), + Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(), + Eq(1000)); + // TODO(b/305098009): deprecate search-related flat fields in query_stats. 
EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(), Eq(1000)); EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000)); EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(1000)); - EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(), + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .parse_query_latency_ms(), Eq(1000)); - EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(), + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .scoring_latency_ms(), Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_documents_scored(), + Eq(2)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_lite_index(), + Eq(2)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_main_index(), + Eq(0)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_integer_index(), + Eq(0)); // The token is a random number so we don't verify it. expected_search_result_proto.set_next_page_token( @@ -444,14 +475,39 @@ TEST_P(IcingSearchEngineSearchTest, SearchReturnsOneResult_readOnlyFalse) { EXPECT_THAT(search_result_proto.status(), ProtoIsOk()); EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(), + Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(), + Eq(1000)); + // TODO(b/305098009): deprecate search-related flat fields in query_stats. 
EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(), Eq(1000)); EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000)); EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(1000)); - EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(), + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .parse_query_latency_ms(), Eq(1000)); - EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(), + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .scoring_latency_ms(), Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_documents_scored(), + Eq(2)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_lite_index(), + Eq(2)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_main_index(), + Eq(0)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_integer_index(), + Eq(0)); // The token is a random number so we don't verify it. 
expected_search_result_proto.set_next_page_token( @@ -502,6 +558,71 @@ TEST_P(IcingSearchEngineSearchTest, expected_search_result_proto)); } +TEST_P(IcingSearchEngineSearchTest, SearchWithNumToScore) { + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(1000); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + DocumentProto document_one = CreateMessageDocument("namespace", "uri1"); + document_one.set_score(10); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = CreateMessageDocument("namespace", "uri2"); + document_two.set_score(5); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); + search_spec.set_search_type(GetParam()); + + ResultSpecProto result_spec; + result_spec.set_num_per_page(10); + result_spec.set_num_to_score(10); + + ScoringSpecProto scoring_spec = GetDefaultScoringSpec(); + + SearchResultProto expected_search_result_proto1; + expected_search_result_proto1.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto1.mutable_results()->Add()->mutable_document() = + document_one; + *expected_search_result_proto1.mutable_results()->Add()->mutable_document() = + document_two; + + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(search_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto1)); + + result_spec.set_num_to_score(1); + // By setting num_to_score = 1, only document_two will be scored, ranked, 
and + // returned. + // - num_to_score cutoff is only affected by the reading order from posting + // list. IOW, since we read posting lists in doc id descending order, + // ScoringProcessor scores documents with higher doc ids first and cuts off + // if exceeding num_to_score. + // - Therefore, even though document_one has higher score, ScoringProcessor + // still skips document_one, because posting list reads document_two first + // and ScoringProcessor stops after document_two given that total # of + // scored document has already reached num_to_score. + SearchResultProto expected_search_result_google::protobuf; + expected_search_result_google::protobuf.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_google::protobuf.mutable_results()->Add()->mutable_document() = + document_two; + + search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(search_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_google::protobuf)); +} + TEST_P(IcingSearchEngineSearchTest, SearchNegativeResultLimitReturnsInvalidArgument) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -551,7 +672,6 @@ TEST_P(IcingSearchEngineSearchTest, expected_search_result_proto)); } - TEST_P(IcingSearchEngineSearchTest, SearchNonPositivePageTotalBytesLimitReturnsInvalidArgument) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -577,6 +697,62 @@ TEST_P(IcingSearchEngineSearchTest, ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); } +TEST_P(IcingSearchEngineSearchTest, + SearchNegativeMaxJoinedChildrenPerParentReturnsInvalidArgument) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query(""); + 
search_spec.set_search_type(GetParam()); + + ResultSpecProto result_spec; + result_spec.set_max_joined_children_per_parent_to_return(-1); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code( + StatusProto::INVALID_ARGUMENT); + expected_search_result_proto.mutable_status()->set_message( + "ResultSpecProto.max_joined_children_per_parent_to_return cannot be " + "negative."); + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_P(IcingSearchEngineSearchTest, + SearchNonPositiveNumToScoreReturnsInvalidArgument) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query(""); + search_spec.set_search_type(GetParam()); + + ResultSpecProto result_spec; + result_spec.set_num_to_score(-1); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code( + StatusProto::INVALID_ARGUMENT); + expected_search_result_proto.mutable_status()->set_message( + "ResultSpecProto.num_to_score cannot be non-positive."); + + SearchResultProto actual_results1 = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results1, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + result_spec.set_num_to_score(0); + SearchResultProto actual_results2 = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results2, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + TEST_P(IcingSearchEngineSearchTest, SearchWithPersistenceReturnsValidResults) { IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); @@ -658,14 +834,39 @@ 
TEST_P(IcingSearchEngineSearchTest, SearchShouldReturnEmpty) { EXPECT_THAT(search_result_proto.status(), ProtoIsOk()); EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(), + Eq(0)); + EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(), + Eq(1000)); + // TODO(b/305098009): deprecate search-related flat fields in query_stats. EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(), Eq(1000)); EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000)); EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(0)); - EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(), - Eq(0)); - EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(), + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .parse_query_latency_ms(), + Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .scoring_latency_ms(), Eq(1000)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_documents_scored(), + Eq(0)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_lite_index(), + Eq(0)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_main_index(), + Eq(0)); + EXPECT_THAT(search_result_proto.query_stats() + .parent_search_stats() + .num_fetched_hits_integer_index(), + Eq(0)); EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); @@ -3444,11 +3645,795 @@ TEST_P(IcingSearchEngineSearchTest, SearchWithProjectionMultipleFieldPaths) { EqualsProto(projected_document_one)); } +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + 
ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for the + // Email schema type. 
+ auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("sender.name"); + email_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the first document is returned. Although 'hello' is + // present in document_two, it shouldn't be in the result since 'hello' is not + // in the specified property filter. + EXPECT_THAT(results.results(0).document(), EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, EmptySearchWithPropertyFilter) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! 
Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with a property filter + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query(""); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + // 3. Verify that both documents are returned. + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); +} + +TEST_P(IcingSearchEngineSearchTest, EmptySearchWithEmptyPropertyFilter) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. 
Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with a property filter + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query(""); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + // Add empty list for Email's property filters + email_property_filters->set_schema_type("Email"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + // 3. Verify that both documents are returned. 
+ auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersOnMultipleSchema) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + // Add Person and Organization schema with a property 'name' in both. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Organization") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("address") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + // 1. Add person document + DocumentProto person_document = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build(); + ASSERT_THAT(icing.Put(person_document).status(), ProtoIsOk()); + + // 1. 
Add organization document + DocumentProto organization_document = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Organization") + .AddStringProperty("name", "Meg Corp") + .AddStringProperty("address", "Universal street") + .Build(); + ASSERT_THAT(icing.Put(organization_document).status(), ProtoIsOk()); + + // 2. Issue a query with property filters. Person schema has name in it's + // property filter but Organization schema doesn't. + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("Meg"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* person_property_filters = + search_spec->add_type_property_filters(); + person_property_filters->set_schema_type("Person"); + person_property_filters->add_paths("name"); + TypePropertyMask* organization_property_filters = + search_spec->add_type_property_filters(); + organization_property_filters->set_schema_type("Organization"); + organization_property_filters->add_paths("address"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the person document is returned. Although 'Meg' is + // present in organization document, it shouldn't be in the result since + // the name field is not specified in the Organization property filter. 
+ EXPECT_THAT(results.results(0).document(), EqualsProto(person_document)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithWildcardPropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for the + // wildcard(*) schema type. 
+ auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* wildcard_property_filters = + search_spec->add_type_property_filters(); + wildcard_property_filters->set_schema_type("*"); + wildcard_property_filters->add_paths("sender.name"); + wildcard_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the first document is returned since the second + // document doesn't contain the word 'hello' in either of fields specified in + // the property filter. This confirms that the property filters for the + // wildcard entry have been applied to the Email schema as well. + EXPECT_THAT(results.results(0).document(), EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithMixedPropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! 
Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for the + // wildcard(*) schema type plus property filters of sender.name and body for + // the Email schema type. + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* wildcard_property_filters = + search_spec->add_type_property_filters(); + wildcard_property_filters->set_schema_type("*"); + wildcard_property_filters->add_paths("sender.name"); + wildcard_property_filters->add_paths("subject"); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("sender.name"); + email_property_filters->add_paths("body"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. 
Verify that only the second document is returned since the first + // document doesn't contain the word 'hello' in either of fields sender.name + // or body. This confirms that the property filters specified for Email schema + // have been applied and the ones specified for wildcard entry have been + // ignored. + EXPECT_THAT(results.results(0).document(), EqualsProto(document_two)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithNonApplicablePropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for an + // unknown schema type. 
+ auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("unknown"); + email_property_filters->add_paths("sender.name"); + email_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + + // 3. Verify that both the documents are returned since each of them have the + // word 'hello' in at least 1 property. The second document being returned + // confirms that the body field was searched and the specified property + // filters were not applied to the Email schema type. + EXPECT_THAT(results.results(0).document(), EqualsProto(document_two)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithEmptyPropertyFilter) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Message") + .AddStringProperty("body", "Hello World!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + // 2. Issue a query with empty property filter for Message schema. 
+ auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* message_property_filters = + search_spec->add_type_property_filters(); + message_property_filters->set_schema_type("Message"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + + // 3. Verify that no documents are returned. Although 'hello' is present in + // the indexed document, it shouldn't be returned since the Message property + // filter doesn't allow any properties to be searched. + ASSERT_THAT(results.results(), IsEmpty()); +} + +TEST_P(IcingSearchEngineSearchTest, + SearchWithPropertyFilterHavingInvalidProperty) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Message") + .AddStringProperty("body", "Hello World!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + // 2. Issue a query with property filter having invalid/unknown property for + // Message schema. 
+ auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* message_property_filters = + search_spec->add_type_property_filters(); + message_property_filters->set_schema_type("Message"); + message_property_filters->add_paths("unknown"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + + // 3. Verify that no documents are returned. Although 'hello' is present in + // the indexed document, it shouldn't be returned since the Message property + // filter doesn't allow any valid properties to be searched. Any + // invalid/unknown properties specified in the property filters will be + // ignored while searching. + ASSERT_THAT(results.results(), IsEmpty()); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersWithNesting) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! 
Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filter of sender.emailAddress for the Email + // schema type. + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("sender.emailAddress"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the first document is returned since the second + // document doesn't contain the word 'hello' in sender.emailAddress. The first + // document being returned confirms that the nested property + // sender.emailAddress was actually searched. 
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, + SearchWithPropertyFilter_RelevanceScoreUnaffectedByExcludedSectionHits) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Hello Ryan") + .AddStringProperty("emailAddress", "hello@aol.com") + .Build()) + .AddStringProperty("subject", "Hello Hello!") + .AddStringProperty("body", "hello1 hello2 hello3 hello4 hello5") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "world@aol.com") + .Build()) + .AddStringProperty("subject", "Hello Hello!") + .AddStringProperty("body", "one1 two2 three3 four4 five5") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. 
Issue a query with a property filter + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("Hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + // 3. Verify that both documents are returned and have equal relevance score + // Note, the total number of tokens must be equal in the documents + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + scoring_spec->set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + ASSERT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(0).score(), DoubleEq(results.results(1).score())); +} + +TEST_P(IcingSearchEngineSearchTest, + SearchWithPropertyFilter_ExcludingSectionsWithHitsLowersRelevanceScore) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add an email document + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Hello Ryan") + .AddStringProperty("emailAddress", "hello@aol.com") + .Build()) + .AddStringProperty("subject", "Hello Hello!") + .AddStringProperty("body", "hello hello hello hello hello") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + // 2. 
Issue a query without property filter + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("Hello"); + search_spec->set_search_type(GetParam()); + + auto result_spec = std::make_unique<ResultSpecProto>(); + + // 3. Get the relevance score without property filter + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + scoring_spec->set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + ASSERT_THAT(results.results(), SizeIs(1)); + double original_relevance_score = results.results(0).score(); + + // 4. Relevance score with property filter should be lower + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("subject"); + results = icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + ASSERT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).score(), Lt(original_relevance_score)); +} + TEST_P(IcingSearchEngineSearchTest, QueryStatsProtoTest) { auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetTimerElapsedMilliseconds(5); - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), + + // Set index merge size to 6 hits. This will cause document1, document2, + // document3's hits being merged into the main index, and document4, + // document5's hits will remain in the lite index. 
+ IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_index_merge_size(sizeof(TermIdHitPair::Value) * 6); + + TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(), std::make_unique<IcingFilesystem>(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -3491,6 +4476,7 @@ TEST_P(IcingSearchEngineSearchTest, QueryStatsProtoTest) { ASSERT_THAT(search_result.next_page_token(), Ne(kInvalidNextPageToken)); // Check the stats + // TODO(b/305098009): deprecate search-related flat fields in query_stats. QueryStatsProto exp_stats; exp_stats.set_query_length(7); exp_stats.set_num_terms(1); @@ -3510,6 +4496,22 @@ TEST_P(IcingSearchEngineSearchTest, QueryStatsProtoTest) { exp_stats.set_document_retrieval_latency_ms(5); exp_stats.set_lock_acquisition_latency_ms(5); exp_stats.set_num_joined_results_returned_current_page(0); + + QueryStatsProto::SearchStats* exp_parent_search_stats = + exp_stats.mutable_parent_search_stats(); + exp_parent_search_stats->set_query_length(7); + exp_parent_search_stats->set_num_terms(1); + exp_parent_search_stats->set_num_namespaces_filtered(1); + exp_parent_search_stats->set_num_schema_types_filtered(1); + exp_parent_search_stats->set_ranking_strategy( + ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + exp_parent_search_stats->set_num_documents_scored(5); + exp_parent_search_stats->set_parse_query_latency_ms(5); + exp_parent_search_stats->set_scoring_latency_ms(5); + exp_parent_search_stats->set_num_fetched_hits_lite_index(2); + exp_parent_search_stats->set_num_fetched_hits_main_index(3); + exp_parent_search_stats->set_num_fetched_hits_integer_index(0); + EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats)); // Second page, 2 result with 1 snippet @@ -3550,8 +4552,14 @@ TEST_P(IcingSearchEngineSearchTest, QueryStatsProtoTest) { TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { auto fake_clock = std::make_unique<FakeClock>(); 
fake_clock->SetTimerElapsedMilliseconds(5); - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), + + // Set index merge size to 13 hits. This will cause person1, person2, email1, + // email2, email3's hits being merged into the main index, and person3, + // email4's hits will remain in the lite index. + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_index_merge_size(sizeof(TermIdHitPair::Value) * 13); + + TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(), std::make_unique<IcingFilesystem>(), std::move(fake_clock), GetTestJniCache()); @@ -3571,8 +4579,7 @@ TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("emailAddress") - .SetDataTypeString(TERM_MATCH_PREFIX, - TOKENIZER_PLAIN) + .SetDataType(TYPE_STRING) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType("Email") @@ -3646,15 +4653,25 @@ TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { .SetCreationTimestampMs(kDefaultCreationTimestampMs) .SetScore(1) .Build(); + DocumentProto email4 = + DocumentBuilder() + .SetKey("namespace", "email4") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 4") + .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .SetScore(0) + .Build(); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(person3).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person3).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email4).status(), 
ProtoIsOk()); // Parent SearchSpec SearchSpecProto search_spec; @@ -3691,13 +4708,14 @@ TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { std::numeric_limits<int32_t>::max()); // Since we: - // - Use MAX for aggregation scoring strategy. + // - Use COUNT for aggregation scoring strategy. // - (Default) use DOCUMENT_SCORE to score child documents. // - (Default) use DESC as the ranking order. // - // person1 + email1 should have the highest aggregated score (3) and be - // returned first. person2 + email2 (aggregated score = 2) should be the - // second, and person3 + email3 (aggregated score = 1) should be the last. + // person1 with [email1, email2, email4] should have the highest aggregated + // score (3) and be returned first. person2 with [email3] (aggregated score = + // 1) should be the second, and person3 with no child (aggregated score = 0) + // should be the last. SearchResultProto expected_result1; expected_result1.mutable_status()->set_code(StatusProto::OK); SearchResultProto::ResultProto* result_proto1 = @@ -3705,6 +4723,7 @@ TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { *result_proto1->mutable_document() = person1; *result_proto1->mutable_joined_results()->Add()->mutable_document() = email1; *result_proto1->mutable_joined_results()->Add()->mutable_document() = email2; + *result_proto1->mutable_joined_results()->Add()->mutable_document() = email4; SearchResultProto expected_result2; expected_result2.mutable_status()->set_code(StatusProto::OK); @@ -3728,6 +4747,7 @@ TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { EqualsSearchResultIgnoreStatsAndScores(expected_result1)); // Check the stats + // TODO(b/305098009): deprecate search-related flat fields in query_stats. 
QueryStatsProto exp_stats; exp_stats.set_query_length(15); exp_stats.set_num_terms(1); @@ -3746,8 +4766,40 @@ TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) { exp_stats.set_ranking_latency_ms(5); exp_stats.set_document_retrieval_latency_ms(5); exp_stats.set_lock_acquisition_latency_ms(5); - exp_stats.set_num_joined_results_returned_current_page(2); + exp_stats.set_num_joined_results_returned_current_page(3); exp_stats.set_join_latency_ms(5); + exp_stats.set_is_join_query(true); + + QueryStatsProto::SearchStats* exp_parent_search_stats = + exp_stats.mutable_parent_search_stats(); + exp_parent_search_stats->set_query_length(15); + exp_parent_search_stats->set_num_terms(1); + exp_parent_search_stats->set_num_namespaces_filtered(0); + exp_parent_search_stats->set_num_schema_types_filtered(0); + exp_parent_search_stats->set_ranking_strategy( + ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE); + exp_parent_search_stats->set_num_documents_scored(3); + exp_parent_search_stats->set_parse_query_latency_ms(5); + exp_parent_search_stats->set_scoring_latency_ms(5); + exp_parent_search_stats->set_num_fetched_hits_lite_index(1); + exp_parent_search_stats->set_num_fetched_hits_main_index(2); + exp_parent_search_stats->set_num_fetched_hits_integer_index(0); + + QueryStatsProto::SearchStats* exp_child_search_stats = + exp_stats.mutable_child_search_stats(); + exp_child_search_stats->set_query_length(12); + exp_child_search_stats->set_num_terms(1); + exp_child_search_stats->set_num_namespaces_filtered(0); + exp_child_search_stats->set_num_schema_types_filtered(0); + exp_child_search_stats->set_ranking_strategy( + ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); + exp_child_search_stats->set_num_documents_scored(4); + exp_child_search_stats->set_parse_query_latency_ms(5); + exp_child_search_stats->set_scoring_latency_ms(5); + exp_child_search_stats->set_num_fetched_hits_lite_index(1); + exp_child_search_stats->set_num_fetched_hits_main_index(3); + 
exp_child_search_stats->set_num_fetched_hits_integer_index(0); + EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats)); // Second page, 1 child doc. @@ -4317,6 +5369,166 @@ TEST_P(IcingSearchEngineSearchTest, JoinByQualifiedId) { EqualsSearchResultIgnoreStatsAndScores(expected_result3)); } +TEST_P(IcingSearchEngineSearchTest, JoinByQualifiedIdMultipleNamespaces) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("firstName") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("lastName") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("personQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + DocumentProto person1 = + DocumentBuilder() + .SetKey("pkg$db/namespace1", "person") + .SetSchema("Person") + .AddStringProperty("firstName", "first1") + .AddStringProperty("lastName", "last1") + .AddStringProperty("emailAddress", "email1@gmail.com") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .SetScore(1) + .Build(); + DocumentProto person2 = + DocumentBuilder() + .SetKey("pkg$db/namespace2", "person") + .SetSchema("Person") + .AddStringProperty("firstName", "first2") + .AddStringProperty("lastName", "last2") + .AddStringProperty("emailAddress", "email2@gmail.com") + 
.SetCreationTimestampMs(kDefaultCreationTimestampMs) + .SetScore(2) + .Build(); + + DocumentProto email1 = + DocumentBuilder() + .SetKey("namespace1", "email1") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 1") + .AddStringProperty("personQualifiedId", "pkg$db/namespace1#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .SetScore(3) + .Build(); + DocumentProto email2 = + DocumentBuilder() + .SetKey("namespace2", "email2") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 2") + .AddStringProperty("personQualifiedId", "pkg$db/namespace1#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .SetScore(2) + .Build(); + DocumentProto email3 = + DocumentBuilder() + .SetKey("namespace2", "email3") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 3") + .AddStringProperty("personQualifiedId", "pkg$db/namespace2#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .SetScore(1) + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk()); + + // Parent SearchSpec + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("firstName:first"); + search_spec.set_search_type(GetParam()); + + // JoinSpec + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("personQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + 
JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::PREFIX); + nested_search_spec->set_query("subject:test"); + nested_search_spec->set_search_type(GetParam()); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + // Parent ScoringSpec + ScoringSpecProto scoring_spec = GetDefaultScoringSpec(); + + // Parent ResultSpec + ResultSpecProto result_spec; + result_spec.set_num_per_page(1); + result_spec.set_max_joined_children_per_parent_to_return( + std::numeric_limits<int32_t>::max()); + + // Since we: + // - Use COUNT for aggregation scoring strategy. + // - (Default) use DESC as the ranking order. + // + // pkg$db/namespace1#person + email1, email2 should have the highest + // aggregated score (2) and be returned first. pkg$db/namespace2#person + + // email3 (aggregated score = 1) should be the second. 
+ SearchResultProto expected_result1; + expected_result1.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto1 = + expected_result1.mutable_results()->Add(); + *result_proto1->mutable_document() = person1; + *result_proto1->mutable_joined_results()->Add()->mutable_document() = email1; + *result_proto1->mutable_joined_results()->Add()->mutable_document() = email2; + + SearchResultProto expected_result2; + expected_result2.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto2 = + expected_result2.mutable_results()->Add(); + *result_proto2->mutable_document() = person2; + *result_proto2->mutable_joined_results()->Add()->mutable_document() = email3; + + SearchResultProto result1 = + icing.Search(search_spec, scoring_spec, result_spec); + uint64_t next_page_token = result1.next_page_token(); + EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken)); + expected_result1.set_next_page_token(next_page_token); + EXPECT_THAT(result1, + EqualsSearchResultIgnoreStatsAndScores(expected_result1)); + + SearchResultProto result2 = icing.GetNextPage(next_page_token); + next_page_token = result2.next_page_token(); + EXPECT_THAT(next_page_token, Eq(kInvalidNextPageToken)); + EXPECT_THAT(result2, + EqualsSearchResultIgnoreStatsAndScores(expected_result2)); +} + TEST_P(IcingSearchEngineSearchTest, JoinShouldLimitNumChildDocumentsByMaxJoinedChildPerParent) { SchemaProto schema = @@ -5328,6 +6540,126 @@ TEST_F(IcingSearchEngineSearchTest, NumericFilterOldQueryFails) { EXPECT_THAT(results.status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); } +TEST_F(IcingSearchEngineSearchTest, NumericFilterQueryStatsProtoTest) { + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(5); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), 
GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Create the schema and document store + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("transaction") + .AddProperty(PropertyConfigBuilder() + .SetName("price") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("cost") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + DocumentProto document_one = DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("transaction") + .SetCreationTimestampMs(1) + .AddInt64Property("price", 10) + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("transaction") + .SetCreationTimestampMs(2) + .AddInt64Property("price", 25) + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + DocumentProto document_three = DocumentBuilder() + .SetKey("namespace", "3") + .SetSchema("transaction") + .SetCreationTimestampMs(3) + .AddInt64Property("cost", 2) + .Build(); + ASSERT_THAT(icing.Put(document_three).status(), ProtoIsOk()); + + DocumentProto document_four = DocumentBuilder() + .SetKey("namespace", "3") + .SetSchema("transaction") + .SetCreationTimestampMs(4) + .AddInt64Property("price", 15) + .Build(); + ASSERT_THAT(icing.Put(document_four).status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.add_namespace_filters("namespace"); + search_spec.add_schema_type_filters(document_one.schema()); + search_spec.set_query("price < 20"); + search_spec.add_enabled_features(std::string(kNumericSearchFeature)); + + ResultSpecProto result_spec; + result_spec.set_num_per_page(5); + + ScoringSpecProto scoring_spec; + scoring_spec.set_rank_by( + 
ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + + SearchResultProto results = + icing.Search(search_spec, scoring_spec, result_spec); + ASSERT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document_four)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document_one)); + + // Check the stats + // TODO(b/305098009): deprecate search-related flat fields in query_stats. + QueryStatsProto exp_stats; + exp_stats.set_query_length(10); + exp_stats.set_num_terms(0); + exp_stats.set_num_namespaces_filtered(1); + exp_stats.set_num_schema_types_filtered(1); + exp_stats.set_ranking_strategy( + ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + exp_stats.set_is_first_page(true); + exp_stats.set_requested_page_size(5); + exp_stats.set_num_results_returned_current_page(2); + exp_stats.set_num_documents_scored(2); + exp_stats.set_num_results_with_snippets(0); + exp_stats.set_latency_ms(5); + exp_stats.set_parse_query_latency_ms(5); + exp_stats.set_scoring_latency_ms(5); + exp_stats.set_ranking_latency_ms(5); + exp_stats.set_document_retrieval_latency_ms(5); + exp_stats.set_lock_acquisition_latency_ms(5); + exp_stats.set_num_joined_results_returned_current_page(0); + + QueryStatsProto::SearchStats* exp_parent_search_stats = + exp_stats.mutable_parent_search_stats(); + exp_parent_search_stats->set_query_length(10); + exp_parent_search_stats->set_num_terms(0); + exp_parent_search_stats->set_num_namespaces_filtered(1); + exp_parent_search_stats->set_num_schema_types_filtered(1); + exp_parent_search_stats->set_ranking_strategy( + ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + exp_parent_search_stats->set_is_numeric_query(true); + exp_parent_search_stats->set_num_documents_scored(2); + exp_parent_search_stats->set_parse_query_latency_ms(5); + exp_parent_search_stats->set_scoring_latency_ms(5); + exp_parent_search_stats->set_num_fetched_hits_lite_index(0); + 
exp_parent_search_stats->set_num_fetched_hits_main_index(0); + // Since we will inspect 1 bucket from "price" in integer index and it + // contains 3 hits, we will fetch 3 hits (but filter out one of them). + exp_parent_search_stats->set_num_fetched_hits_integer_index(3); + + EXPECT_THAT(results.query_stats(), EqualsProto(exp_stats)); +} + TEST_P(IcingSearchEngineSearchTest, BarisNormalizationTest) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -5526,6 +6858,310 @@ TEST_P(IcingSearchEngineSearchTest, } } +TEST_P(IcingSearchEngineSearchTest, HasPropertyQuery) { + if (GetParam() != + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) { + GTEST_SKIP() + << "The hasProperty() function is only supported in advanced query."; + } + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Value") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("timestamp") + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("score") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Create a document with every property. + DocumentProto document0 = DocumentBuilder() + .SetKey("icing", "uri0") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddStringProperty("body", "foo") + .AddInt64Property("timestamp", 123) + .AddDoubleProperty("score", 456.789) + .Build(); + // Create a document with missing body. + DocumentProto document1 = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddInt64Property("timestamp", 123) + .AddDoubleProperty("score", 456.789) + .Build(); + // Create a document with missing timestamp. 
+ DocumentProto document2 = DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddStringProperty("body", "foo") + .AddDoubleProperty("score", 456.789) + .Build(); + + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_build_property_existence_metadata_hits(true); + IcingSearchEngine icing(options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document0).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + // Get all documents that have "body". + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_search_type(GetParam()); + search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature)); + search_spec.add_enabled_features( + std::string(kListFilterQueryLanguageFeature)); + search_spec.set_query("hasProperty(\"body\")"); + SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document2)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document0)); + + // Get all documents that have "timestamp". + search_spec.set_query("hasProperty(\"timestamp\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document1)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document0)); + + // Get all documents that have "score". 
+ search_spec.set_query("hasProperty(\"score\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(3)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document2)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document1)); + EXPECT_THAT(results.results(2).document(), EqualsProto(document0)); +} + +TEST_P(IcingSearchEngineSearchTest, + HasPropertyQueryDoesNotWorkWithoutMetadataHits) { + if (GetParam() != + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) { + GTEST_SKIP() + << "The hasProperty() function is only supported in advanced query."; + } + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Value") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("timestamp") + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("score") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Create a document with every property. + DocumentProto document0 = DocumentBuilder() + .SetKey("icing", "uri0") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddStringProperty("body", "foo") + .AddInt64Property("timestamp", 123) + .AddDoubleProperty("score", 456.789) + .Build(); + // Create a document with missing body. + DocumentProto document1 = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddInt64Property("timestamp", 123) + .AddDoubleProperty("score", 456.789) + .Build(); + // Create a document with missing timestamp. 
+ DocumentProto document2 = DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema("Value") + .SetCreationTimestampMs(1) + .AddStringProperty("body", "foo") + .AddDoubleProperty("score", 456.789) + .Build(); + + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_build_property_existence_metadata_hits(false); + IcingSearchEngine icing(options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document0).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + // Check that none of the following hasProperty queries can return any + // results. + // + // Get all documents that have "body". + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_search_type(GetParam()); + search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature)); + search_spec.add_enabled_features( + std::string(kListFilterQueryLanguageFeature)); + search_spec.set_query("hasProperty(\"body\")"); + SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), IsEmpty()); + + // Get all documents that have "timestamp". + search_spec.set_query("hasProperty(\"timestamp\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), IsEmpty()); + + // Get all documents that have "score". 
+ search_spec.set_query("hasProperty(\"score\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), IsEmpty()); +} + +TEST_P(IcingSearchEngineSearchTest, HasPropertyQueryNestedDocument) { + if (GetParam() != + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) { + GTEST_SKIP() + << "The hasProperty() function is only supported in advanced query."; + } + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Value") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("timestamp") + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("score") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TreeNode") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("value") + .SetDataTypeDocument( + "Value", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Create a complex nested root_document with the following property paths. 
+ // - name + // - value + // - value.body + // - value.score + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema("TreeNode") + .SetCreationTimestampMs(1) + .AddStringProperty("name", "root") + .AddDocumentProperty("value", DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema("Value") + .AddStringProperty("body", "foo") + .AddDoubleProperty("score", 456.789) + .Build()) + .Build(); + + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_build_property_existence_metadata_hits(true); + IcingSearchEngine icing(options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // Check that the document can be found by `hasProperty("name")`. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_search_type(GetParam()); + search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature)); + search_spec.add_enabled_features( + std::string(kListFilterQueryLanguageFeature)); + search_spec.set_query("hasProperty(\"name\")"); + SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document)); + + // Check that the document can be found by `hasProperty("value")`. + search_spec.set_query("hasProperty(\"value\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document)); + + // Check that the document can be found by `hasProperty("value.body")`. 
+ search_spec.set_query("hasProperty(\"value.body\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document)); + + // Check that the document can be found by `hasProperty("value.score")`. + search_spec.set_query("hasProperty(\"value.score\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document)); + + // Check that the document can NOT be found by `hasProperty("body")`. + search_spec.set_query("hasProperty(\"body\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), IsEmpty()); + + // Check that the document can NOT be found by `hasProperty("score")`. + search_spec.set_query("hasProperty(\"score\")"); + results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), IsEmpty()); +} + INSTANTIATE_TEST_SUITE_P( IcingSearchEngineSearchTest, IcingSearchEngineSearchTest, testing::Values( diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index b6d3c29..8f5e319 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -12,13 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <cstdint> +#include <limits> #include <memory> +#include <string> #include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" +#include "third_party/absl/flags/flag.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/index/data-indexing-handler.h" @@ -27,11 +31,11 @@ #include "icing/index/integer-section-indexing-handler.h" #include "icing/index/numeric/integer-index.h" #include "icing/index/numeric/numeric-index.h" -#include "icing/index/string-section-indexing-handler.h" +#include "icing/index/term-indexing-handler.h" #include "icing/legacy/core/icing-string-util.h" +#include "icing/legacy/index/icing-filesystem.h" #include "icing/schema/schema-store.h" -#include "icing/schema/schema-util.h" -#include "icing/schema/section-manager.h" +#include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" @@ -40,7 +44,9 @@ #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "icing/util/clock.h" #include "icing/util/logging.h" +#include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" #include "unicode/uloc.h" @@ -150,7 +156,9 @@ DocumentProto CreateDocumentWithHiragana(int content_length) { std::unique_ptr<Index> CreateIndex(const IcingFilesystem& icing_filesystem, const Filesystem& filesystem, const std::string& index_dir) { - Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10); + Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); return Index::Create(options, &filesystem, &icing_filesystem).ValueOrDie(); } @@ -187,16 +195,17 @@ 
libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>> CreateDataIndexingHandlers(const Clock* clock, const Normalizer* normalizer, Index* index, NumericIndex<int64_t>* integer_index) { ICING_ASSIGN_OR_RETURN( - std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create(clock, normalizer, index)); + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + clock, normalizer, index, + /*build_property_existence_metadata_hits=*/true)); ICING_ASSIGN_OR_RETURN( std::unique_ptr<IntegerSectionIndexingHandler> integer_section_indexing_handler, IntegerSectionIndexingHandler::Create(clock, integer_index)); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(term_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); return handlers; } @@ -227,6 +236,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = @@ -302,6 +312,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = @@ -378,6 +389,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> 
integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = @@ -454,6 +466,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 0a0108d..3d1be68 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -30,18 +30,21 @@ #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" #include "icing/index/data-indexing-handler.h" #include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" #include "icing/index/index.h" #include "icing/index/integer-section-indexing-handler.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index.h" #include "icing/index/numeric/numeric-index.h" -#include "icing/index/string-section-indexing-handler.h" +#include "icing/index/term-indexing-handler.h" #include "icing/index/term-property-id.h" +#include "icing/join/qualified-id-join-index-impl-v1.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id-join-indexing-handler.h" -#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include 
"icing/portable/platform.h" @@ -50,7 +53,6 @@ #include "icing/proto/term.pb.h" #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" -#include "icing/schema/schema-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" @@ -64,6 +66,7 @@ #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "icing/util/crc32.h" #include "icing/util/tokenized-document.h" #include "unicode/uloc.h" @@ -167,19 +170,24 @@ class IndexProcessorTest : public Test { schema_store_dir_ = base_dir_ + "/schema_store"; doc_store_dir_ = base_dir_ + "/doc_store"; - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - integer_index_, IntegerIndex::Create(filesystem_, integer_index_dir_, - /*pre_mapping_fbv=*/false)); - - ICING_ASSERT_OK_AND_ASSIGN( - qualified_id_join_index_, - QualifiedIdTypeJoinableIndex::Create( - filesystem_, qualified_id_join_index_dir_, - /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/false)); + integer_index_, + IntegerIndex::Create( + filesystem_, integer_index_dir_, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv=*/false)); + + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdJoinIndexImplV1::Create( + filesystem_, qualified_id_join_index_dir_, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false)); language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); ICING_ASSERT_OK_AND_ASSIGN( @@ -277,34 +285,34 @@ class IndexProcessorTest : public Test { ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str())); 
ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), - index_.get())); + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index_.get(), + /*build_property_existence_metadata_hits=*/true)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler> integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> - qualified_id_joinable_property_indexing_handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + qualified_id_join_indexing_handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(term_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); - handlers.push_back( - 
std::move(qualified_id_joinable_property_indexing_handler)); + handlers.push_back(std::move(qualified_id_join_indexing_handler)); index_processor_ = std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); @@ -339,7 +347,7 @@ class IndexProcessorTest : public Test { std::unique_ptr<Index> index_; std::unique_ptr<NumericIndex<int64_t>> integer_index_; - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_; std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<Normalizer> normalizer_; std::unique_ptr<SchemaStore> schema_store_; @@ -629,12 +637,13 @@ TEST_F(IndexProcessorTest, TooLongTokens) { normalizer_factory::Create( /*max_term_byte_size=*/4)); - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create( - &fake_clock_, normalizer.get(), index_.get())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer.get(), index_.get(), + /*build_property_existence_metadata_hits=*/true)); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(term_indexing_handler)); index_processor_ = std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); @@ -819,24 +828,23 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) { TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), - index_.get())); + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index_.get(), + /*build_property_existence_metadata_hits=*/true)); 
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler> integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> - qualified_id_joinable_property_indexing_handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + qualified_id_join_indexing_handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(term_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); - handlers.push_back( - std::move(qualified_id_joinable_property_indexing_handler)); + handlers.push_back(std::move(qualified_id_join_indexing_handler)); IndexProcessor index_processor(std::move(handlers), &fake_clock_, /*recovery_mode=*/true); @@ -969,17 +977,19 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); Index::Options options(index_dir_, - /*index_merge_size=*/document.ByteSizeLong() * 100); + /*index_merge_size=*/document.ByteSizeLong() * 100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), - index_.get())); + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index_.get(), + /*build_property_existence_metadata_hits=*/true)); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - handlers.push_back(std::move(string_section_indexing_handler)); + 
handlers.push_back(std::move(term_indexing_handler)); index_processor_ = std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); @@ -1032,18 +1042,20 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { // 2. Recreate the index with the mock filesystem and a merge size that will // only allow one document to be added before requiring a merge. Index::Options options(index_dir_, - /*index_merge_size=*/document.ByteSizeLong()); + /*index_merge_size=*/document.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/16); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, mock_icing_filesystem_.get())); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<StringSectionIndexingHandler> - string_section_indexing_handler, - StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), - index_.get())); + std::unique_ptr<TermIndexingHandler> term_indexing_handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index_.get(), + /*build_property_existence_metadata_hits=*/true)); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; - handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(term_indexing_handler)); index_processor_ = std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); diff --git a/icing/index/index.cc b/icing/index/index.cc index 19edbb6..98058be 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -14,31 +14,38 @@ #include "icing/index/index.h" +#include <algorithm> +#include <cstddef> #include <cstdint> #include <memory> #include <string> #include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/file/filesystem.h" #include "icing/index/hit/hit.h" #include 
"icing/index/iterator/doc-hit-info-iterator-or.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include "icing/index/lite/lite-index.h" #include "icing/index/main/doc-hit-info-iterator-term-main.h" +#include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" -#include "icing/index/term-property-id.h" +#include "icing/index/term-metadata.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/scoring.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/scoring/ranker.h" #include "icing/store/document-id.h" +#include "icing/store/suggestion-result-checker.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -58,8 +65,10 @@ libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions( "Requested hit buffer size %d is too large.", options.index_merge_size)); } - return LiteIndex::Options(options.base_dir + "/idx/lite.", - options.index_merge_size); + return LiteIndex::Options( + options.base_dir + "/idx/lite.", options.index_merge_size, + options.lite_index_sort_at_indexing, options.lite_index_sort_size, + options.include_property_existence_metadata_hits); } std::string MakeMainIndexFilepath(const std::string& base_dir) { @@ -151,9 +160,17 @@ libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create( IcingDynamicTrie::max_value_index(GetMainLexiconOptions()), IcingDynamicTrie::max_value_index( lite_index_options.lexicon_options))); + ICING_ASSIGN_OR_RETURN( std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(lite_index_options, icing_filesystem)); + // Sort the lite index if we've enabled sorting the HitBuffer at indexing + // time, and there's an unsorted tail exceeding the threshold. 
+ if (options.lite_index_sort_at_indexing && + lite_index->HasUnsortedHitsExceedingSortThreshold()) { + lite_index->SortHits(); + } + ICING_ASSIGN_OR_RETURN( std::unique_ptr<MainIndex> main_index, MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem, diff --git a/icing/index/index.h b/icing/index/index.h index c170278..a5d75c4 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -18,8 +18,9 @@ #include <cstdint> #include <memory> #include <string> -#include <unordered_set> +#include <unordered_map> #include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -27,6 +28,7 @@ #include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/lite-index.h" +#include "icing/index/lite/term-id-hit-pair.h" #include "icing/index/main/main-index-merger.h" #include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" @@ -40,7 +42,7 @@ #include "icing/store/document-id.h" #include "icing/store/namespace-id.h" #include "icing/store/suggestion-result-checker.h" -#include "icing/util/crc32.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { @@ -68,11 +70,22 @@ namespace lib { class Index { public: struct Options { - explicit Options(const std::string& base_dir, uint32_t index_merge_size) - : base_dir(base_dir), index_merge_size(index_merge_size) {} + explicit Options(const std::string& base_dir, uint32_t index_merge_size, + bool lite_index_sort_at_indexing, + uint32_t lite_index_sort_size, + bool include_property_existence_metadata_hits = false) + : base_dir(base_dir), + index_merge_size(index_merge_size), + lite_index_sort_at_indexing(lite_index_sort_at_indexing), + lite_index_sort_size(lite_index_sort_size), + include_property_existence_metadata_hits( + include_property_existence_metadata_hits) {} std::string base_dir; int32_t index_merge_size; + bool 
lite_index_sort_at_indexing; + int32_t lite_index_sort_size; + bool include_property_existence_metadata_hits; }; // Creates an instance of Index in the directory pointed by file_dir. @@ -279,6 +292,19 @@ class Index { return lite_index_->Reset(); } + // Whether the LiteIndex HitBuffer requires sorting. This is only true if + // Icing has enabled sorting during indexing time, and the HitBuffer's + // unsorted tail has exceeded the lite_index_sort_size. + bool LiteIndexNeedSort() const { + return options_.lite_index_sort_at_indexing && + lite_index_->HasUnsortedHitsExceedingSortThreshold(); + } + + // Sorts the LiteIndex HitBuffer. + void SortLiteIndex() { + lite_index_->SortHits(); + } + // Reduces internal file sizes by reclaiming space of deleted documents. // new_last_added_document_id will be used to update the last added document // id in the lite index. diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index d563bcb..04a6bb7 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -58,6 +58,7 @@ using ::testing::Eq; using ::testing::Ge; using ::testing::Gt; using ::testing::IsEmpty; +using ::testing::IsFalse; using ::testing::IsTrue; using ::testing::Ne; using ::testing::NiceMock; @@ -75,7 +76,9 @@ class IndexTest : public Test { protected: void SetUp() override { index_dir_ = GetTestTempDir() + "/index_test/"; - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); } @@ -146,7 +149,9 @@ MATCHER_P2(EqualsTermMetadata, content, hit_count, "") { } TEST_F(IndexTest, CreationWithNullPointerShouldFail) { - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + 
/*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT( Index::Create(options, &filesystem_, /*icing_filesystem=*/nullptr), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); @@ -192,6 +197,36 @@ TEST_F(IndexTest, EmptyIndexAfterMerge) { StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } +TEST_F(IndexTest, CreationWithLiteIndexSortAtIndexingEnabledShouldSort) { + // Make the index with lite_index_sort_at_indexing=false and a very small sort + // threshold. + Index::Options options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/false, + /*lite_index_sort_size=*/16); + ICING_ASSERT_OK_AND_ASSIGN( + index_, Index::Create(options, &filesystem_, &icing_filesystem_)); + + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("bar"), IsOk()); + ASSERT_THAT(edit.BufferTerm("baz"), IsOk()); + ASSERT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + // Persist and recreate the index with lite_index_sort_at_indexing=true + ASSERT_THAT(index_->PersistToDisk(), IsOk()); + options = Index::Options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/16); + ICING_ASSERT_OK_AND_ASSIGN( + index_, Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Check that the index is sorted after recreating with + // lite_index_sort_at_indexing, with the unsorted HitBuffer exceeding the sort + // threshold. 
+ EXPECT_THAT(index_->LiteIndexNeedSort(), IsFalse()); +} + TEST_F(IndexTest, AdvancePastEnd) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); @@ -248,6 +283,228 @@ TEST_F(IndexTest, AdvancePastEndAfterMerge) { EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>())); } +TEST_F(IndexTest, IteratorGetCallStats_mainIndexOnly) { + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + // Merge the index. + ICING_ASSERT_OK(index_->Merge()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Before Advance(). + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); + + // 1st Advance(). + ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/1, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); + + // 2nd Advance(). 
+ ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/2, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); + + // 3rd Advance(). + ASSERT_THAT(itr->Advance(), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/2, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); +} + +TEST_F(IndexTest, IteratorGetCallStats_liteIndexOnly) { + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Before Advance(). + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); + + // 1st Advance(). 
+ ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/1, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); + + // 2nd Advance(). + ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/2, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); + + // 3rd Advance(). + ASSERT_THAT(itr->Advance(), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/2, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); +} + +TEST_F(IndexTest, IteratorGetCallStats) { + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + // Merge the index. 2 hits for "foo" will be merged into the main index. + ICING_ASSERT_OK(index_->Merge()); + + // Insert 2 more hits for "foo". It will be in the lite index. 
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/0); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Before Advance(). + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); + + // 1st Advance(). DocHitInfoIteratorOr will advance both left and right + // iterator (i.e. lite and main index iterator) once, compare document ids, + // and return the hit with larger document id. In this case, hit from lite + // index will be chosen and returned. + ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/1, + /*num_leaf_advance_calls_main_index=*/1, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); + + // 2nd Advance(). Since lite index iterator has larger document id in the + // previous round, we advance lite index iterator in this round. We still + // choose and return hit from lite index. 
+ ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/2, + /*num_leaf_advance_calls_main_index=*/1, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); + + // 3rd Advance(). Since lite index iterator has larger document id in the + // previous round, we advance lite index iterator in this round. However, + // there is no hit from lite index anymore, so we choose and return hit from + // main index. + ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/2, + /*num_leaf_advance_calls_main_index=*/1, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); + + // 4th Advance(). Advance main index. + ICING_ASSERT_OK(itr->Advance()); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/2, + /*num_leaf_advance_calls_main_index=*/2, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); + + // 5th Advance(). Reach the end. + ASSERT_THAT(itr->Advance(), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + EXPECT_THAT( + itr->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/2, + /*num_leaf_advance_calls_main_index=*/2, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); +} + TEST_F(IndexTest, SingleHitSingleTermIndex) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); @@ -967,7 +1224,9 @@ TEST_F(IndexTest, NonAsciiTermsAfterMerge) { TEST_F(IndexTest, FullIndex) { // Make a smaller index so that it's easier to fill up. 
- Index::Options options(index_dir_, /*index_merge_size=*/1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1035,7 +1294,9 @@ TEST_F(IndexTest, FullIndex) { TEST_F(IndexTest, FullIndexMerge) { // Make a smaller index so that it's easier to fill up. - Index::Options options(index_dir_, /*index_merge_size=*/1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1368,7 +1629,9 @@ TEST_F(IndexTest, IndexCreateIOFailure) { NiceMock<IcingMockFilesystem> mock_icing_filesystem; ON_CALL(mock_icing_filesystem, CreateDirectoryRecursively) .WillByDefault(Return(false)); - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT(Index::Create(options, &filesystem_, &mock_icing_filesystem), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } @@ -1399,7 +1662,9 @@ TEST_F(IndexTest, IndexCreateCorruptionFailure) { IsTrue()); // Recreate the index. - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT(Index::Create(options, &filesystem_, &icing_filesystem_), StatusIs(libtextclassifier3::StatusCode::DATA_LOSS)); } @@ -1417,7 +1682,9 @@ TEST_F(IndexTest, IndexPersistence) { index_.reset(); // Recreate the index. 
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1446,7 +1713,9 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { index_.reset(); // Recreate the index. - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1463,7 +1732,8 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { TEST_F(IndexTest, InvalidHitBufferSize) { Index::Options options( - index_dir_, /*index_merge_size=*/std::numeric_limits<uint32_t>::max()); + index_dir_, /*index_merge_size=*/std::numeric_limits<uint32_t>::max(), + /*lite_index_sort_at_indexing=*/true, /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT(Index::Create(options, &filesystem_, &icing_filesystem_), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } diff --git a/icing/index/integer-section-indexing-handler_test.cc b/icing/index/integer-section-indexing-handler_test.cc index 96e21ca..91cc06f 100644 --- a/icing/index/integer-section-indexing-handler_test.cc +++ b/icing/index/integer-section-indexing-handler_test.cc @@ -106,6 +106,7 @@ class IntegerSectionIndexingHandlerTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( integer_index_, IntegerIndex::Create(filesystem_, integer_index_working_path_, + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); @@ -169,6 +170,8 @@ class IntegerSectionIndexingHandlerTest : public ::testing::Test { schema_store_.get(), 
/*force_recovery_and_revalidate_documents=*/false, /*namespace_id_fingerprint=*/false, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog< DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr)); diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc b/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc index 67c7d25..1917fd0 100644 --- a/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc +++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc @@ -32,7 +32,6 @@ libtextclassifier3::Status DocHitInfoIteratorAllDocumentId::Advance() { if (!IsDocumentIdValid(current_document_id_)) { // Reached the end, set these to invalid values and return doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id.h b/icing/index/iterator/doc-hit-info-iterator-all-document-id.h index bb16eaf..60c5e0c 100644 --- a/icing/index/iterator/doc-hit-info-iterator-all-document-id.h +++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id.h @@ -37,10 +37,16 @@ class DocHitInfoIteratorAllDocumentId : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override { return 0; } + void MapChildren(const ChildrenMapper& mapper) override {} - int32_t GetNumLeafAdvanceCalls() const override { - return document_id_limit_ - current_document_id_; + CallStats GetCallStats() const override { + return CallStats( + /*num_leaf_advance_calls_lite_index_in=*/0, + /*num_leaf_advance_calls_main_index_in=*/0, + /*num_leaf_advance_calls_integer_index_in=*/0, + /*num_leaf_advance_calls_no_index_in=*/document_id_limit_ - + current_document_id_, + /*num_blocks_inspected_in=*/0); } std::string ToString() const 
override { diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc b/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc index ea2dda6..379cb4d 100644 --- a/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc @@ -41,9 +41,8 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Initialize) { // We'll always start with an invalid document_id, need to Advance before we // get anything out of this. - EXPECT_THAT(all_it.doc_hit_info().document_id(), Eq(kInvalidDocumentId)); - EXPECT_THAT(all_it.hit_intersect_section_ids_mask(), - Eq(kSectionIdMaskNone)); + EXPECT_THAT(all_it.doc_hit_info(), + EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{})); } { @@ -54,26 +53,25 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Initialize) { } } -TEST(DocHitInfoIteratorAllDocumentIdTest, GetNumBlocksInspected) { +TEST(DocHitInfoIteratorAllDocumentIdTest, GetCallStats) { DocHitInfoIteratorAllDocumentId all_it(100); - EXPECT_THAT(all_it.GetNumBlocksInspected(), Eq(0)); - - // Number of iterations is chosen arbitrarily. Just meant to demonstrate that - // no matter how many Advance calls are made, GetNumBlocksInspected should - // always return 0. 
- for (int i = 0; i < 5; ++i) { - EXPECT_THAT(all_it.Advance(), IsOk()); - EXPECT_THAT(all_it.GetNumBlocksInspected(), Eq(0)); - } -} - -TEST(DocHitInfoIteratorAllDocumentIdTest, GetNumLeafAdvanceCalls) { - DocHitInfoIteratorAllDocumentId all_it(100); - EXPECT_THAT(all_it.GetNumLeafAdvanceCalls(), Eq(0)); + EXPECT_THAT( + all_it.GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0)); for (int i = 1; i <= 5; ++i) { EXPECT_THAT(all_it.Advance(), IsOk()); - EXPECT_THAT(all_it.GetNumLeafAdvanceCalls(), Eq(i)); + EXPECT_THAT( + all_it.GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/i, /*num_blocks_inspected=*/0)); } } @@ -87,12 +85,8 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Advance) { // Test one advance DocHitInfoIteratorAllDocumentId all_it(5); EXPECT_THAT(all_it.Advance(), IsOk()); - EXPECT_THAT(all_it.doc_hit_info().document_id(), Eq(5)); - - // Advancing shouldn't affect the intersect section ids mask, since there's - // no intersecting going on - EXPECT_THAT(all_it.hit_intersect_section_ids_mask(), - Eq(kSectionIdMaskNone)); + EXPECT_THAT(all_it.doc_hit_info(), + EqualsDocHitInfo(5, std::vector<SectionId>{})); } { diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc index 185a35e..249bd0e 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.cc +++ b/icing/index/iterator/doc-hit-info-iterator-and.cc @@ -83,7 +83,6 @@ libtextclassifier3::Status DocHitInfoIteratorAnd::Advance() { // Didn't find anything for the first iterator, reset to invalid values and // return. 
doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } @@ -106,8 +105,6 @@ libtextclassifier3::Status DocHitInfoIteratorAnd::Advance() { // Guaranteed that short_doc_id and long_doc_id match now doc_hit_info_ = short_->doc_hit_info(); doc_hit_info_.MergeSectionsFrom(long_->doc_hit_info().hit_section_ids_mask()); - hit_intersect_section_ids_mask_ = short_->hit_intersect_section_ids_mask() & - long_->hit_intersect_section_ids_mask(); return libtextclassifier3::Status::OK; } @@ -124,14 +121,6 @@ DocHitInfoIteratorAnd::TrimRightMostNode() && { return trimmed_long; } -int32_t DocHitInfoIteratorAnd::GetNumBlocksInspected() const { - return short_->GetNumBlocksInspected() + long_->GetNumBlocksInspected(); -} - -int32_t DocHitInfoIteratorAnd::GetNumLeafAdvanceCalls() const { - return short_->GetNumLeafAdvanceCalls() + long_->GetNumLeafAdvanceCalls(); -} - std::string DocHitInfoIteratorAnd::ToString() const { return absl_ports::StrCat("(", short_->ToString(), " AND ", long_->ToString(), ")"); @@ -152,7 +141,6 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() { // Didn't find anything for the first iterator, reset to invalid values and // return doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } @@ -196,14 +184,10 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() { // Found a DocumentId which exists in all the iterators doc_hit_info_ = iterators_.at(0)->doc_hit_info(); - hit_intersect_section_ids_mask_ = - iterators_.at(0)->hit_intersect_section_ids_mask(); for (size_t i = 1; i < iterators_.size(); i++) { doc_hit_info_.MergeSectionsFrom( iterators_.at(i)->doc_hit_info().hit_section_ids_mask()); - hit_intersect_section_ids_mask_ &= - 
iterators_.at(i)->hit_intersect_section_ids_mask(); } return libtextclassifier3::Status::OK; } @@ -229,20 +213,12 @@ DocHitInfoIteratorAndNary::TrimRightMostNode() && { return trimmed_right; } -int32_t DocHitInfoIteratorAndNary::GetNumBlocksInspected() const { - int32_t blockCount = 0; - for (const std::unique_ptr<DocHitInfoIterator>& iter : iterators_) { - blockCount += iter->GetNumBlocksInspected(); - } - return blockCount; -} - -int32_t DocHitInfoIteratorAndNary::GetNumLeafAdvanceCalls() const { - int32_t leafCount = 0; - for (const std::unique_ptr<DocHitInfoIterator>& iter : iterators_) { - leafCount += iter->GetNumLeafAdvanceCalls(); +DocHitInfoIterator::CallStats DocHitInfoIteratorAndNary::GetCallStats() const { + CallStats call_stats; + for (const auto& iter : iterators_) { + call_stats += iter->GetCallStats(); } - return leafCount; + return call_stats; } std::string DocHitInfoIteratorAndNary::ToString() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h index 0f40f94..8c52ac9 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.h +++ b/icing/index/iterator/doc-hit-info-iterator-and.h @@ -18,6 +18,7 @@ #include <cstdint> #include <memory> #include <string> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -42,14 +43,19 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override; - - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override { + return short_->GetCallStats() + long_->GetCallStats(); + } std::string ToString() const override; + void MapChildren(const ChildrenMapper& mapper) override { + short_ = mapper(std::move(short_)); + long_ = mapper(std::move(long_)); + } + void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats, + 
std::vector<TermMatchInfo>* matched_terms_stats, SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. @@ -78,14 +84,18 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override; - - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override; std::string ToString() const override; + void MapChildren(const ChildrenMapper& mapper) override { + for (int i = 0; i < iterators_.size(); ++i) { + iterators_[i] = mapper(std::move(iterators_[i])); + } + } + void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats, + std::vector<TermMatchInfo>* matched_terms_stats, SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
diff --git a/icing/index/iterator/doc-hit-info-iterator-and_test.cc b/icing/index/iterator/doc-hit-info-iterator-and_test.cc index 51828cb..f204ada 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-and_test.cc @@ -74,39 +74,33 @@ TEST(DocHitInfoIteratorAndTest, Initialize) { std::make_unique<DocHitInfoIteratorDummy>()); // We start out with invalid values - EXPECT_THAT(and_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId))); - EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), - Eq(kSectionIdMaskNone)); + EXPECT_THAT(and_iter.doc_hit_info(), + EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{})); } -TEST(DocHitInfoIteratorAndTest, GetNumBlocksInspected) { - int first_iter_blocks = 4; // arbitrary value +TEST(DocHitInfoIteratorAndTest, GetCallStats) { + DocHitInfoIterator::CallStats first_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + /*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + /*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumBlocksInspected(first_iter_blocks); - - int second_iter_blocks = 7; // arbitrary value + first_iter->SetCallStats(first_iter_call_stats); + + DocHitInfoIterator::CallStats second_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/6, + /*num_leaf_advance_calls_main_index_in=*/2, + /*num_leaf_advance_calls_integer_index_in=*/10, + /*num_leaf_advance_calls_no_index_in=*/3, + /*num_blocks_inspected_in=*/7); // arbitrary value auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumBlocksInspected(second_iter_blocks); + second_iter->SetCallStats(second_iter_call_stats); DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter)); - EXPECT_THAT(and_iter.GetNumBlocksInspected(), - Eq(first_iter_blocks + 
second_iter_blocks)); -} - -TEST(DocHitInfoIteratorAndTest, GetNumLeafAdvanceCalls) { - int first_iter_leaves = 4; // arbitrary value - auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumLeafAdvanceCalls(first_iter_leaves); - - int second_iter_leaves = 7; // arbitrary value - auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumLeafAdvanceCalls(second_iter_leaves); - - DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter)); - - EXPECT_THAT(and_iter.GetNumLeafAdvanceCalls(), - Eq(first_iter_leaves + second_iter_leaves)); + EXPECT_THAT(and_iter.GetCallStats(), + Eq(first_iter_call_stats + second_iter_call_stats)); } TEST(DocHitInfoIteratorAndTest, AdvanceNoOverlap) { @@ -293,24 +287,22 @@ TEST(DocHitInfoIteratorAndTest, SectionIdMask) { // Created to test correct section_id_mask behavior. SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6 SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2 - SectionIdMask mask_anded_result = 0b00000100; SectionIdMask mask_ored_result = 0b01010111; std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)}; std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)}; auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter)); ICING_EXPECT_OK(and_iter.Advance()); EXPECT_THAT(and_iter.doc_hit_info().hit_section_ids_mask(), Eq(mask_ored_result)); - EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result)); } TEST(DocHitInfoIteratorAndTest, 
PopulateMatchedTermsStats) { @@ -340,11 +332,11 @@ TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats) { auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello"); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter)); @@ -380,11 +372,11 @@ TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats) { auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi"); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + second_iter->set_hit_section_ids_mask(section_id_mask1); DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter)); @@ -435,9 +427,8 @@ TEST(DocHitInfoIteratorAndNaryTest, Initialize) { DocHitInfoIteratorAndNary and_iter(std::move(iterators)); // We start out with invalid values - EXPECT_THAT(and_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId))); - EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), - Eq(kSectionIdMaskNone)); + EXPECT_THAT(and_iter.doc_hit_info(), + EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{})); } TEST(DocHitInfoIteratorAndNaryTest, InitializeEmpty) { @@ -450,51 +441,42 @@ TEST(DocHitInfoIteratorAndNaryTest, InitializeEmpty) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST(DocHitInfoIteratorAndNaryTest, GetNumBlocksInspected) { - int first_iter_blocks = 4; // arbitrary value +TEST(DocHitInfoIteratorAndNaryTest, GetCallStats) { + 
DocHitInfoIterator::CallStats first_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + /*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + /*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumBlocksInspected(first_iter_blocks); - - int second_iter_blocks = 7; // arbitrary value + first_iter->SetCallStats(first_iter_call_stats); + + DocHitInfoIterator::CallStats second_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/6, + /*num_leaf_advance_calls_main_index_in=*/2, + /*num_leaf_advance_calls_integer_index_in=*/10, + /*num_leaf_advance_calls_no_index_in=*/3, + /*num_blocks_inspected_in=*/7); // arbitrary value auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumBlocksInspected(second_iter_blocks); - - int third_iter_blocks = 13; // arbitrary value + second_iter->SetCallStats(second_iter_call_stats); + + DocHitInfoIterator::CallStats third_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/1000, + /*num_leaf_advance_calls_main_index_in=*/2000, + /*num_leaf_advance_calls_integer_index_in=*/3000, + /*num_leaf_advance_calls_no_index_in=*/0, + /*num_blocks_inspected_in=*/200); // arbitrary value auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(); - third_iter->SetNumBlocksInspected(third_iter_blocks); - - int fourth_iter_blocks = 1; // arbitrary value - auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(); - fourth_iter->SetNumBlocksInspected(fourth_iter_blocks); - - std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; - iterators.push_back(std::move(first_iter)); - iterators.push_back(std::move(second_iter)); - iterators.push_back(std::move(third_iter)); - iterators.push_back(std::move(fourth_iter)); - DocHitInfoIteratorAndNary and_iter(std::move(iterators)); - - EXPECT_THAT(and_iter.GetNumBlocksInspected(), - 
Eq(first_iter_blocks + second_iter_blocks + third_iter_blocks + - fourth_iter_blocks)); -} - -TEST(DocHitInfoIteratorAndNaryTest, GetNumLeafAdvanceCalls) { - int first_iter_leaves = 4; // arbitrary value - auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumLeafAdvanceCalls(first_iter_leaves); - - int second_iter_leaves = 7; // arbitrary value - auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumLeafAdvanceCalls(second_iter_leaves); - - int third_iter_leaves = 13; // arbitrary value - auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(); - third_iter->SetNumLeafAdvanceCalls(third_iter_leaves); - - int fourth_iter_leaves = 13; // arbitrary value + third_iter->SetCallStats(third_iter_call_stats); + + DocHitInfoIterator::CallStats fourth_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/200, + /*num_leaf_advance_calls_main_index_in=*/400, + /*num_leaf_advance_calls_integer_index_in=*/100, + /*num_leaf_advance_calls_no_index_in=*/20, + /*num_blocks_inspected_in=*/50); // arbitrary value auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(); - fourth_iter->SetNumLeafAdvanceCalls(fourth_iter_leaves); + fourth_iter->SetCallStats(fourth_iter_call_stats); std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; iterators.push_back(std::move(first_iter)); @@ -503,9 +485,9 @@ TEST(DocHitInfoIteratorAndNaryTest, GetNumLeafAdvanceCalls) { iterators.push_back(std::move(fourth_iter)); DocHitInfoIteratorAndNary and_iter(std::move(iterators)); - EXPECT_THAT(and_iter.GetNumLeafAdvanceCalls(), - Eq(first_iter_leaves + second_iter_leaves + third_iter_leaves + - fourth_iter_leaves)); + EXPECT_THAT(and_iter.GetCallStats(), + Eq(first_iter_call_stats + second_iter_call_stats + + third_iter_call_stats + fourth_iter_call_stats)); } TEST(DocHitInfoIteratorAndNaryTest, Advance) { @@ -541,7 +523,6 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) { SectionIdMask section_id_mask2 = 0b00000110; 
// hits in sections 1, 2 SectionIdMask section_id_mask3 = 0b00001100; // hits in sections 2, 3 SectionIdMask section_id_mask4 = 0b00100100; // hits in sections 2, 5 - SectionIdMask mask_anded_result = 0b00000100; SectionIdMask mask_ored_result = 0b01101111; std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)}; @@ -550,16 +531,16 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) { std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4, section_id_mask4)}; auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(third_vector); - third_iter->set_hit_intersect_section_ids_mask(section_id_mask3); + third_iter->set_hit_section_ids_mask(section_id_mask3); auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(fourth_vector); - fourth_iter->set_hit_intersect_section_ids_mask(section_id_mask4); + fourth_iter->set_hit_section_ids_mask(section_id_mask4); std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; iterators.push_back(std::move(first_iter)); @@ -572,7 +553,6 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) { ICING_EXPECT_OK(and_iter.Advance()); EXPECT_THAT(and_iter.doc_hit_info().hit_section_ids_mask(), Eq(mask_ored_result)); - EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result)); } TEST(DocHitInfoIteratorAndNaryTest, PopulateMatchedTermsStats) { diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc index a82e556..82d1ac7 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.cc +++ 
b/icing/index/iterator/doc-hit-info-iterator-filter.cc @@ -100,14 +100,11 @@ libtextclassifier3::Status DocHitInfoIteratorFilter::Advance() { // Satisfied all our specified filters doc_hit_info_ = delegate_->doc_hit_info(); - hit_intersect_section_ids_mask_ = - delegate_->hit_intersect_section_ids_mask(); return libtextclassifier3::Status::OK; } // Didn't find anything on the delegate iterator. doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator"); } @@ -123,14 +120,6 @@ DocHitInfoIteratorFilter::TrimRightMostNode() && { return trimmed_delegate; } -int32_t DocHitInfoIteratorFilter::GetNumBlocksInspected() const { - return delegate_->GetNumBlocksInspected(); -} - -int32_t DocHitInfoIteratorFilter::GetNumLeafAdvanceCalls() const { - return delegate_->GetNumLeafAdvanceCalls(); -} - std::string DocHitInfoIteratorFilter::ToString() const { return delegate_->ToString(); } diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h index be5e1e8..608665e 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.h +++ b/icing/index/iterator/doc-hit-info-iterator-filter.h @@ -20,6 +20,7 @@ #include <string> #include <string_view> #include <unordered_set> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -62,9 +63,11 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override; + void MapChildren(const ChildrenMapper& mapper) override { + delegate_ = mapper(std::move(delegate_)); + } - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override { return delegate_->GetCallStats(); } std::string ToString() const override; diff --git 
a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc index d8839dc..0ed4d02 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc @@ -55,7 +55,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } @@ -1000,28 +1001,22 @@ TEST_F(DocHitInfoIteratorFilterTest, SectionIdMasksArePopulatedCorrectly) { EqualsDocHitInfo(document_id3, section_ids3))); } -TEST_F(DocHitInfoIteratorFilterTest, GetNumBlocksInspected) { +TEST_F(DocHitInfoIteratorFilterTest, GetCallStats) { + DocHitInfoIterator::CallStats original_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + /*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + /*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>(); - original_iterator->SetNumBlocksInspected(5); + original_iterator->SetCallStats(original_call_stats); DocHitInfoIteratorFilter::Options options; DocHitInfoIteratorFilter filtered_iterator( std::move(original_iterator), document_store_.get(), schema_store_.get(), options, fake_clock_.GetSystemTimeMilliseconds()); - EXPECT_THAT(filtered_iterator.GetNumBlocksInspected(), Eq(5)); -} - -TEST_F(DocHitInfoIteratorFilterTest, GetNumLeafAdvanceCalls) { - auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>(); - original_iterator->SetNumLeafAdvanceCalls(6); - - DocHitInfoIteratorFilter::Options options; - 
DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - options, fake_clock_.GetSystemTimeMilliseconds()); - - EXPECT_THAT(filtered_iterator.GetNumLeafAdvanceCalls(), Eq(6)); + EXPECT_THAT(filtered_iterator.GetCallStats(), Eq(original_call_stats)); } TEST_F(DocHitInfoIteratorFilterTest, TrimFilterIterator) { diff --git a/icing/index/iterator/doc-hit-info-iterator-none.h b/icing/index/iterator/doc-hit-info-iterator-none.h index f938d32..c2853f1 100644 --- a/icing/index/iterator/doc-hit-info-iterator-none.h +++ b/icing/index/iterator/doc-hit-info-iterator-none.h @@ -39,9 +39,9 @@ class DocHitInfoIteratorNone : public DocHitInfoIterator { return node; } - int32_t GetNumBlocksInspected() const override { return 0; } + void MapChildren(const ChildrenMapper& mapper) override {} - int32_t GetNumLeafAdvanceCalls() const override { return 0; } + CallStats GetCallStats() const override { return CallStats(); } std::string ToString() const override { return "(NONE)"; } }; diff --git a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc index 38b1ded..10a8292 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not.cc @@ -15,13 +15,15 @@ #include "icing/index/iterator/doc-hit-info-iterator-not.h" #include <cstdint> +#include <memory> +#include <utility> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h" -#include "icing/schema/section.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/store/document-id.h" namespace icing { @@ -67,14 +69,8 @@ DocHitInfoIteratorNot::TrimRightMostNode() && { "Cannot generate suggestion if the last term is NOT operator."); } -int32_t 
DocHitInfoIteratorNot::GetNumBlocksInspected() const { - return to_be_excluded_->GetNumBlocksInspected() + - all_document_id_iterator_.GetNumBlocksInspected(); -} - -int32_t DocHitInfoIteratorNot::GetNumLeafAdvanceCalls() const { - return to_be_excluded_->GetNumLeafAdvanceCalls() + - all_document_id_iterator_.GetNumLeafAdvanceCalls(); +void DocHitInfoIteratorNot::MapChildren(const ChildrenMapper& mapper) { + to_be_excluded_ = mapper(std::move(to_be_excluded_)); } std::string DocHitInfoIteratorNot::ToString() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-not.h b/icing/index/iterator/doc-hit-info-iterator-not.h index 8cc3bf3..11575fb 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not.h +++ b/icing/index/iterator/doc-hit-info-iterator-not.h @@ -30,14 +30,12 @@ namespace lib { // Iterator that will return all documents that are *not* specified by the // to_be_excluded_iterator. // -// NOTE: The hit_intersect_section_ids_mask is meaningless for this iterator. +// NOTE: doc_hit_info_.hit_section_ids_mask() is meaningless for this iterator. // When this iterator produces a result, it's because the Document was not // present in the to_be_excluded_iterator. There is no concept of the Document // having been chosen because it's term was in a specific section. Since we // don't know anything about the sections for the Document, the -// hit_intersect_section_ids_mask is always kSectionIdMaskNone. Correspondingly, -// this means that the doc_hit_info.hit_section_ids_mask will also always be -// kSectionIdMaskNone. +// doc_hit_info.hit_section_ids_mask() is always kSectionIdMaskNone. class DocHitInfoIteratorNot : public DocHitInfoIterator { public: // to_be_excluded_iterator: The results of this iterator will be excluded @@ -55,9 +53,12 @@ class DocHitInfoIteratorNot : public DocHitInfoIterator { // to NOT operator. 
libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override; + void MapChildren(const ChildrenMapper& mapper) override; - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override { + return to_be_excluded_->GetCallStats() + + all_document_id_iterator_.GetCallStats(); + } std::string ToString() const override; diff --git a/icing/index/iterator/doc-hit-info-iterator-not_test.cc b/icing/index/iterator/doc-hit-info-iterator-not_test.cc index 5a8ce2c..a8c835f 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not_test.cc @@ -102,40 +102,39 @@ TEST(DocHitInfoIteratorNotTest, AllDocumentIdOverlapOk) { EXPECT_THAT(GetDocumentIds(¬_iterator), IsEmpty()); } -TEST(DocHitInfoIteratorNotTest, GetNumBlocksInspected) { - int to_be_excluded_iterator_blocks = 4; // arbitrary value +TEST(DocHitInfoIteratorNotTest, GetCallStats) { + DocHitInfoIterator::CallStats to_be_excluded_iterator_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + /*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + /*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto to_be_excluded_iterator = std::make_unique<DocHitInfoIteratorDummy>(); - to_be_excluded_iterator->SetNumBlocksInspected( - to_be_excluded_iterator_blocks); - - DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator), - /*document_id_limit=*/5); - - // The AllDocumentId iterator doesn't count any blocks as being inspected - // since it's just decrementing 1 from the document_id_limit. 
- EXPECT_THAT(not_iterator.GetNumBlocksInspected(), - Eq(to_be_excluded_iterator_blocks)); -} - -TEST(DocHitInfoIteratorNotTest, GetNumLeafAdvanceCalls) { - int to_be_excluded_iterator_leaves = 4; // arbitrary value - auto to_be_excluded_iterator = std::make_unique<DocHitInfoIteratorDummy>(); - to_be_excluded_iterator->SetNumLeafAdvanceCalls( - to_be_excluded_iterator_leaves); + to_be_excluded_iterator->SetCallStats(to_be_excluded_iterator_call_stats); int all_document_id_limit = 5; // Since we iterate from [limit, 0] inclusive, add 1 for the 0th advance call int all_leaf_advance_calls = all_document_id_limit + 1; DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator), - all_document_id_limit); + /*document_id_limit=*/5); while (not_iterator.Advance().ok()) { // Advance through the whole not iterator } - // The AllDocumentId iterator counts each DocumentId as a leaf advance call - EXPECT_THAT(not_iterator.GetNumLeafAdvanceCalls(), - Eq(to_be_excluded_iterator_leaves + all_leaf_advance_calls)); + // The AllDocumentId iterator doesn't count lite/main/integer index or blocks + // as being inspected since it's just decrementing 1 from the + // document_id_limit. 
+ EXPECT_THAT( + not_iterator.GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + to_be_excluded_iterator_call_stats.num_leaf_advance_calls_lite_index, + to_be_excluded_iterator_call_stats.num_leaf_advance_calls_main_index, + to_be_excluded_iterator_call_stats + .num_leaf_advance_calls_integer_index, + to_be_excluded_iterator_call_stats.num_leaf_advance_calls_no_index + + all_leaf_advance_calls, + to_be_excluded_iterator_call_stats.num_blocks_inspected)); } TEST(DocHitInfoIteratorNotTest, SectionIdsAlwaysNone) { diff --git a/icing/index/iterator/doc-hit-info-iterator-or.cc b/icing/index/iterator/doc-hit-info-iterator-or.cc index 8f7b84f..6251365 100644 --- a/icing/index/iterator/doc-hit-info-iterator-or.cc +++ b/icing/index/iterator/doc-hit-info-iterator-or.cc @@ -20,6 +20,7 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/store/document-id.h" #include "icing/util/status-macros.h" @@ -113,7 +114,6 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() { right_document_id_ == kInvalidDocumentId) { // Reached the end, set these to invalid values and return doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } @@ -132,26 +132,16 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() { current_ = chosen; doc_hit_info_ = chosen->doc_hit_info(); - hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask(); // If equal, combine. 
if (left_document_id_ == right_document_id_) { doc_hit_info_.MergeSectionsFrom( right_->doc_hit_info().hit_section_ids_mask()); - hit_intersect_section_ids_mask_ &= right_->hit_intersect_section_ids_mask(); } return libtextclassifier3::Status::OK; } -int32_t DocHitInfoIteratorOr::GetNumBlocksInspected() const { - return left_->GetNumBlocksInspected() + right_->GetNumBlocksInspected(); -} - -int32_t DocHitInfoIteratorOr::GetNumLeafAdvanceCalls() const { - return left_->GetNumLeafAdvanceCalls() + right_->GetNumLeafAdvanceCalls(); -} - std::string DocHitInfoIteratorOr::ToString() const { return absl_ports::StrCat("(", left_->ToString(), " OR ", right_->ToString(), ")"); @@ -192,7 +182,6 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() { // 0 is the smallest (last) DocumentId, can't advance further. Reset to // invalid values and return directly doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } @@ -222,45 +211,31 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() { // None of the iterators had a next document_id, reset to invalid values and // return doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } // Found the next hit DocumentId, now calculate the section info. 
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone; for (const auto& iterator : iterators_) { if (iterator->doc_hit_info().document_id() == next_document_id) { current_iterators_.push_back(iterator.get()); if (doc_hit_info_.document_id() == kInvalidDocumentId) { doc_hit_info_ = iterator->doc_hit_info(); - hit_intersect_section_ids_mask_ = - iterator->hit_intersect_section_ids_mask(); } else { doc_hit_info_.MergeSectionsFrom( iterator->doc_hit_info().hit_section_ids_mask()); - hit_intersect_section_ids_mask_ &= - iterator->hit_intersect_section_ids_mask(); } } } return libtextclassifier3::Status::OK; } -int32_t DocHitInfoIteratorOrNary::GetNumBlocksInspected() const { - int32_t blockCount = 0; - for (const auto& iter : iterators_) { - blockCount += iter->GetNumBlocksInspected(); - } - return blockCount; -} - -int32_t DocHitInfoIteratorOrNary::GetNumLeafAdvanceCalls() const { - int32_t leafCount = 0; +DocHitInfoIterator::CallStats DocHitInfoIteratorOrNary::GetCallStats() const { + CallStats call_stats; for (const auto& iter : iterators_) { - leafCount += iter->GetNumLeafAdvanceCalls(); + call_stats += iter->GetCallStats(); } - return leafCount; + return call_stats; } std::string DocHitInfoIteratorOrNary::ToString() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h index 1e9847d..8c0427b 100644 --- a/icing/index/iterator/doc-hit-info-iterator-or.h +++ b/icing/index/iterator/doc-hit-info-iterator-or.h @@ -16,7 +16,9 @@ #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_OR_H_ #include <cstdint> +#include <memory> #include <string> +#include <utility> #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -38,12 +40,17 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator { libtextclassifier3::Status Advance() override; - int32_t GetNumBlocksInspected() const override; - - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override { + return 
left_->GetCallStats() + right_->GetCallStats(); + } std::string ToString() const override; + void MapChildren(const ChildrenMapper &mapper) override { + left_ = mapper(std::move(left_)); + right_ = mapper(std::move(right_)); + } + void PopulateMatchedTermsStats( std::vector<TermMatchInfo> *matched_terms_stats, SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { @@ -83,12 +90,16 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator { libtextclassifier3::Status Advance() override; - int32_t GetNumBlocksInspected() const override; - - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override; std::string ToString() const override; + void MapChildren(const ChildrenMapper &mapper) override { + for (int i = 0; i < iterators_.size(); ++i) { + iterators_[i] = mapper(std::move(iterators_[i])); + } + } + void PopulateMatchedTermsStats( std::vector<TermMatchInfo> *matched_terms_stats, SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { diff --git a/icing/index/iterator/doc-hit-info-iterator-or_test.cc b/icing/index/iterator/doc-hit-info-iterator-or_test.cc index 1950c01..d198b53 100644 --- a/icing/index/iterator/doc-hit-info-iterator-or_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-or_test.cc @@ -73,38 +73,33 @@ TEST(DocHitInfoIteratorOrTest, Initialize) { std::make_unique<DocHitInfoIteratorDummy>()); // We start out with invalid values - EXPECT_THAT(or_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId))); - EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(kSectionIdMaskNone)); + EXPECT_THAT(or_iter.doc_hit_info(), + EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{})); } -TEST(DocHitInfoIteratorOrTest, GetNumBlocksInspected) { - int first_iter_blocks = 4; // arbitrary value +TEST(DocHitInfoIteratorOrTest, GetCallStats) { + DocHitInfoIterator::CallStats first_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + 
/*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + /*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumBlocksInspected(first_iter_blocks); - - int second_iter_blocks = 7; // arbitrary value + first_iter->SetCallStats(first_iter_call_stats); + + DocHitInfoIterator::CallStats second_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/6, + /*num_leaf_advance_calls_main_index_in=*/2, + /*num_leaf_advance_calls_integer_index_in=*/10, + /*num_leaf_advance_calls_no_index_in=*/3, + /*num_blocks_inspected_in=*/7); // arbitrary value auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumBlocksInspected(second_iter_blocks); + second_iter->SetCallStats(second_iter_call_stats); DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter)); - EXPECT_THAT(or_iter.GetNumBlocksInspected(), - Eq(first_iter_blocks + second_iter_blocks)); -} - -TEST(DocHitInfoIteratorOrTest, GetNumLeafAdvanceCalls) { - int first_iter_leaves = 4; // arbitrary value - auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumLeafAdvanceCalls(first_iter_leaves); - - int second_iter_leaves = 7; // arbitrary value - auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumLeafAdvanceCalls(second_iter_leaves); - - DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter)); - - EXPECT_THAT(or_iter.GetNumLeafAdvanceCalls(), - Eq(first_iter_leaves + second_iter_leaves)); + EXPECT_THAT(or_iter.GetCallStats(), + Eq(first_iter_call_stats + second_iter_call_stats)); } TEST(DocHitInfoIteratorOrTest, Advance) { @@ -155,24 +150,22 @@ TEST(DocHitInfoIteratorOrTest, SectionIdMask) { // Created to test correct section_id_mask behavior. 
SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6 SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2 - SectionIdMask mask_anded_result = 0b00000100; SectionIdMask mask_ored_result = 0b01010111; std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)}; std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)}; auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter)); ICING_EXPECT_OK(or_iter.Advance()); EXPECT_THAT(or_iter.doc_hit_info().hit_section_ids_mask(), Eq(mask_ored_result)); - EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result)); } TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) { @@ -200,11 +193,11 @@ TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) { auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello"); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter)); std::vector<TermMatchInfo> matched_terms_stats; @@ -238,11 +231,11 @@ TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) { auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + 
first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi"); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + second_iter->set_hit_section_ids_mask(section_id_mask1); DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter)); std::vector<TermMatchInfo> matched_terms_stats; @@ -281,11 +274,11 @@ TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) { auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello"); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter)); std::vector<TermMatchInfo> matched_terms_stats; @@ -362,8 +355,8 @@ TEST(DocHitInfoIteratorOrNaryTest, Initialize) { DocHitInfoIteratorOrNary or_iter(std::move(iterators)); // We start out with invalid values - EXPECT_THAT(or_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId))); - EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(kSectionIdMaskNone)); + EXPECT_THAT(or_iter.doc_hit_info(), + EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{})); } TEST(DocHitInfoIteratorOrNaryTest, InitializeEmpty) { @@ -376,51 +369,42 @@ TEST(DocHitInfoIteratorOrNaryTest, InitializeEmpty) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST(DocHitInfoIteratorOrNaryTest, GetNumBlocksInspected) { - int first_iter_blocks = 4; // arbitrary value +TEST(DocHitInfoIteratorOrNaryTest, GetCallStats) { + DocHitInfoIterator::CallStats first_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + /*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + 
/*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumBlocksInspected(first_iter_blocks); - - int second_iter_blocks = 7; // arbitrary value + first_iter->SetCallStats(first_iter_call_stats); + + DocHitInfoIterator::CallStats second_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/6, + /*num_leaf_advance_calls_main_index_in=*/2, + /*num_leaf_advance_calls_integer_index_in=*/10, + /*num_leaf_advance_calls_no_index_in=*/3, + /*num_blocks_inspected_in=*/7); // arbitrary value auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumBlocksInspected(second_iter_blocks); - - int third_iter_blocks = 13; // arbitrary value + second_iter->SetCallStats(second_iter_call_stats); + + DocHitInfoIterator::CallStats third_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/1000, + /*num_leaf_advance_calls_main_index_in=*/2000, + /*num_leaf_advance_calls_integer_index_in=*/3000, + /*num_leaf_advance_calls_no_index_in=*/0, + /*num_blocks_inspected_in=*/200); // arbitrary value auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(); - third_iter->SetNumBlocksInspected(third_iter_blocks); - - int fourth_iter_blocks = 1; // arbitrary value - auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(); - fourth_iter->SetNumBlocksInspected(fourth_iter_blocks); - - std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; - iterators.push_back(std::move(first_iter)); - iterators.push_back(std::move(second_iter)); - iterators.push_back(std::move(third_iter)); - iterators.push_back(std::move(fourth_iter)); - DocHitInfoIteratorOrNary or_iter(std::move(iterators)); - - EXPECT_THAT(or_iter.GetNumBlocksInspected(), - Eq(first_iter_blocks + second_iter_blocks + third_iter_blocks + - fourth_iter_blocks)); -} - -TEST(DocHitInfoIteratorOrNaryTest, GetNumLeafAdvanceCalls) { - int first_iter_leaves = 4; // arbitrary 
value - auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(); - first_iter->SetNumLeafAdvanceCalls(first_iter_leaves); - - int second_iter_leaves = 7; // arbitrary value - auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(); - second_iter->SetNumLeafAdvanceCalls(second_iter_leaves); - - int third_iter_leaves = 13; // arbitrary value - auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(); - third_iter->SetNumLeafAdvanceCalls(third_iter_leaves); - - int fourth_iter_leaves = 13; // arbitrary value + third_iter->SetCallStats(third_iter_call_stats); + + DocHitInfoIterator::CallStats fourth_iter_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/200, + /*num_leaf_advance_calls_main_index_in=*/400, + /*num_leaf_advance_calls_integer_index_in=*/100, + /*num_leaf_advance_calls_no_index_in=*/20, + /*num_blocks_inspected_in=*/50); // arbitrary value auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(); - fourth_iter->SetNumLeafAdvanceCalls(fourth_iter_leaves); + fourth_iter->SetCallStats(fourth_iter_call_stats); std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; iterators.push_back(std::move(first_iter)); @@ -429,9 +413,9 @@ TEST(DocHitInfoIteratorOrNaryTest, GetNumLeafAdvanceCalls) { iterators.push_back(std::move(fourth_iter)); DocHitInfoIteratorOrNary or_iter(std::move(iterators)); - EXPECT_THAT(or_iter.GetNumLeafAdvanceCalls(), - Eq(first_iter_leaves + second_iter_leaves + third_iter_leaves + - fourth_iter_leaves)); + EXPECT_THAT(or_iter.GetCallStats(), + Eq(first_iter_call_stats + second_iter_call_stats + + third_iter_call_stats + fourth_iter_call_stats)); } TEST(DocHitInfoIteratorOrNaryTest, Advance) { @@ -460,7 +444,6 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) { SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2 SectionIdMask section_id_mask3 = 0b00001100; // hits in sections 2, 3 SectionIdMask section_id_mask4 = 0b00100100; // hits in sections 2, 5 - SectionIdMask mask_anded_result = 
0b00000100; SectionIdMask mask_ored_result = 0b01101111; std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)}; @@ -469,16 +452,16 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) { std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4, section_id_mask4)}; auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector); - first_iter->set_hit_intersect_section_ids_mask(section_id_mask1); + first_iter->set_hit_section_ids_mask(section_id_mask1); auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector); - second_iter->set_hit_intersect_section_ids_mask(section_id_mask2); + second_iter->set_hit_section_ids_mask(section_id_mask2); auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(third_vector); - third_iter->set_hit_intersect_section_ids_mask(section_id_mask3); + third_iter->set_hit_section_ids_mask(section_id_mask3); auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(fourth_vector); - fourth_iter->set_hit_intersect_section_ids_mask(section_id_mask4); + fourth_iter->set_hit_section_ids_mask(section_id_mask4); std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; iterators.push_back(std::move(first_iter)); @@ -491,7 +474,6 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) { ICING_EXPECT_OK(or_iter.Advance()); EXPECT_THAT(or_iter.doc_hit_info().hit_section_ids_mask(), Eq(mask_ored_result)); - EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result)); } TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) { diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-document.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-document.cc new file mode 100644 index 0000000..e6a1c67 --- /dev/null +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-document.cc @@ -0,0 +1,65 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/iterator/doc-hit-info-iterator-property-in-document.h" + +#include <memory> +#include <string> +#include <string_view> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { + +DocHitInfoIteratorPropertyInDocument::DocHitInfoIteratorPropertyInDocument( + std::unique_ptr<DocHitInfoIterator> meta_hit_iterator) + : meta_hit_iterator_(std::move(meta_hit_iterator)) {} + +libtextclassifier3::Status DocHitInfoIteratorPropertyInDocument::Advance() { + while (meta_hit_iterator_->Advance().ok()) { + // Currently, the metadata hits added by PropertyExistenceIndexingHandler + // can only have a section id of 0, so the section mask has to be 1 << 0. + if (meta_hit_iterator_->doc_hit_info().hit_section_ids_mask() == (1 << 0)) { + doc_hit_info_ = meta_hit_iterator_->doc_hit_info(); + // Hits returned by "hasProperty" should not be associated with any + // section. 
+ doc_hit_info_.set_hit_section_ids_mask(/*section_id_mask=*/0); + return libtextclassifier3::Status::OK; + } + } + + doc_hit_info_ = DocHitInfo(kInvalidDocumentId); + return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator"); +} + +libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode> +DocHitInfoIteratorPropertyInDocument::TrimRightMostNode() && { + // Don't generate suggestion if the last operator is this custom function. + return absl_ports::InvalidArgumentError( + "Cannot generate suggestion if the last term is hasProperty()."); +} + +std::string DocHitInfoIteratorPropertyInDocument::ToString() const { + return meta_hit_iterator_->ToString(); +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-document.h b/icing/index/iterator/doc-hit-info-iterator-property-in-document.h new file mode 100644 index 0000000..bb2c97a --- /dev/null +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-document.h @@ -0,0 +1,73 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_DOCUMENT_H_ +#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_DOCUMENT_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { + +// The iterator returned by the "hasProperty" function in advanced query that +// post-processes metadata hits added by PropertyExistenceIndexingHandler. +// Specifically, it filters out hits that are not recognized as metadata, and +// always set hit_section_ids_mask to 0. +// +// It is marked as a subclass of DocHitInfoLeafIterator because section +// restriction should not be passed down to meta_hit_iterator. +class DocHitInfoIteratorPropertyInDocument : public DocHitInfoLeafIterator { + public: + explicit DocHitInfoIteratorPropertyInDocument( + std::unique_ptr<DocHitInfoIterator> meta_hit_iterator); + + libtextclassifier3::Status Advance() override; + + libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; + + CallStats GetCallStats() const override { + return meta_hit_iterator_->GetCallStats(); + } + + std::string ToString() const override; + + void PopulateMatchedTermsStats( + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { + if (doc_hit_info_.document_id() == kInvalidDocumentId) { + // Current hit isn't valid, return. 
+ return; + } + meta_hit_iterator_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); + } + + private: + std::unique_ptr<DocHitInfoIterator> meta_hit_iterator_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_DOCUMENT_H_ diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc index 05778b0..8b98302 100644 --- a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc @@ -45,7 +45,6 @@ DocHitInfoIteratorPropertyInSchema::DocHitInfoIteratorPropertyInSchema( libtextclassifier3::Status DocHitInfoIteratorPropertyInSchema::Advance() { doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; // Maps from SchemaTypeId to a bool indicating whether or not the type has // the requested property. @@ -77,9 +76,6 @@ libtextclassifier3::Status DocHitInfoIteratorPropertyInSchema::Advance() { if (valid_match) { doc_hit_info_ = delegate_->doc_hit_info(); - hit_intersect_section_ids_mask_ = - delegate_->hit_intersect_section_ids_mask(); - doc_hit_info_.set_hit_section_ids_mask(hit_intersect_section_ids_mask_); return libtextclassifier3::Status::OK; } @@ -98,14 +94,6 @@ DocHitInfoIteratorPropertyInSchema::TrimRightMostNode() && { "Cannot generate suggestion if the last term is hasPropertyDefined()."); } -int32_t DocHitInfoIteratorPropertyInSchema::GetNumBlocksInspected() const { - return delegate_->GetNumBlocksInspected(); -} - -int32_t DocHitInfoIteratorPropertyInSchema::GetNumLeafAdvanceCalls() const { - return delegate_->GetNumLeafAdvanceCalls(); -} - std::string DocHitInfoIteratorPropertyInSchema::ToString() const { return absl_ports::StrCat("(", absl_ports::StrJoin(target_properties_, ","), "): ", delegate_->ToString()); diff --git 
a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h index 730c497..c16a1c4 100644 --- a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h @@ -19,6 +19,7 @@ #include <memory> #include <string> #include <string_view> +#include <utility> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -45,9 +46,11 @@ class DocHitInfoIteratorPropertyInSchema : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override; + void MapChildren(const ChildrenMapper& mapper) override { + delegate_ = mapper(std::move(delegate_)); + } - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override { return delegate_->GetCallStats(); } std::string ToString() const override; diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc index df5ddf5..3f5a0a7 100644 --- a/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc @@ -97,13 +97,14 @@ class DocHitInfoIteratorPropertyInSchemaTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + 
/*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } @@ -198,8 +199,7 @@ TEST_F(DocHitInfoIteratorPropertyInSchemaTest, auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi"); - original_iterator->set_hit_intersect_section_ids_mask( - original_section_id_mask); + original_iterator->set_hit_section_ids_mask(original_section_id_mask); DocHitInfoIteratorPropertyInSchema property_defined_iterator( std::move(original_iterator), document_store_.get(), schema_store_.get(), @@ -217,7 +217,7 @@ TEST_F(DocHitInfoIteratorPropertyInSchemaTest, // The expected mask is the same as the original mask, since the iterator // should treat it as a pass-through. SectionIdMask expected_section_id_mask = original_section_id_mask; - EXPECT_EQ(property_defined_iterator.hit_intersect_section_ids_mask(), + EXPECT_EQ(property_defined_iterator.doc_hit_info().hit_section_ids_mask(), expected_section_id_mask); property_defined_iterator.PopulateMatchedTermsStats(&matched_terms_stats); diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc index 227a185..35dc0b9 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc @@ -16,46 +16,142 @@ #include <cstdint> #include <memory> +#include <set> #include <string> #include <string_view> +#include <unordered_map> #include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/absl_ports/str_join.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include 
"icing/index/iterator/section-restrict-data.h" +#include "icing/proto/search.pb.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { +// An iterator that simply takes ownership of SectionRestrictData. +class SectionRestrictDataHolderIterator : public DocHitInfoIterator { + public: + explicit SectionRestrictDataHolderIterator( + std::unique_ptr<DocHitInfoIterator> delegate, + std::unique_ptr<SectionRestrictData> data) + : delegate_(std::move(delegate)), data_(std::move(data)) {} + + libtextclassifier3::Status Advance() override { + auto result = delegate_->Advance(); + doc_hit_info_ = delegate_->doc_hit_info(); + return result; + } + + libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override { + ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_delegate, + std::move(*delegate_).TrimRightMostNode()); + if (trimmed_delegate.iterator_ != nullptr) { + trimmed_delegate.iterator_ = + std::make_unique<SectionRestrictDataHolderIterator>( + std::move(trimmed_delegate.iterator_), std::move(data_)); + } + return trimmed_delegate; + } + + void MapChildren(const ChildrenMapper& mapper) override { + delegate_ = mapper(std::move(delegate_)); + } + + CallStats GetCallStats() const override { return delegate_->GetCallStats(); } + + std::string ToString() const override { return delegate_->ToString(); } + + void PopulateMatchedTermsStats( + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask) const override { + return delegate_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); + } + + private: + std::unique_ptr<DocHitInfoIterator> delegate_; + std::unique_ptr<SectionRestrictData> data_; +}; + DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict( - std::unique_ptr<DocHitInfoIterator> 
delegate, + std::unique_ptr<DocHitInfoIterator> delegate, SectionRestrictData* data) + : delegate_(std::move(delegate)), data_(data) {} + +std::unique_ptr<DocHitInfoIterator> +DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::unique_ptr<DocHitInfoIterator> iterator, const DocumentStore* document_store, const SchemaStore* schema_store, - std::set<std::string> target_sections, int64_t current_time_ms) - : delegate_(std::move(delegate)), - document_store_(*document_store), - schema_store_(*schema_store), - target_sections_(std::move(target_sections)), - current_time_ms_(current_time_ms) {} + std::set<std::string> target_sections, int64_t current_time_ms) { + std::unordered_map<std::string, std::set<std::string>> type_property_filters; + type_property_filters[std::string(SchemaStore::kSchemaTypeWildcard)] = + std::move(target_sections); + auto data = std::make_unique<SectionRestrictData>( + document_store, schema_store, current_time_ms, type_property_filters); + std::unique_ptr<DocHitInfoIterator> result = + ApplyRestrictions(std::move(iterator), data.get()); + return std::make_unique<SectionRestrictDataHolderIterator>(std::move(result), + std::move(data)); +} + +std::unique_ptr<DocHitInfoIterator> +DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::unique_ptr<DocHitInfoIterator> iterator, + const DocumentStore* document_store, const SchemaStore* schema_store, + const SearchSpecProto& search_spec, int64_t current_time_ms) { + std::unordered_map<std::string, std::set<std::string>> type_property_filters; + // TODO(b/294274922): Add support for polymorphism in type property filters. 
+ for (const TypePropertyMask& type_property_mask : + search_spec.type_property_filters()) { + type_property_filters[type_property_mask.schema_type()] = + std::set<std::string>(type_property_mask.paths().begin(), + type_property_mask.paths().end()); + } + auto data = std::make_unique<SectionRestrictData>( + document_store, schema_store, current_time_ms, type_property_filters); + std::unique_ptr<DocHitInfoIterator> result = + ApplyRestrictions(std::move(iterator), data.get()); + return std::make_unique<SectionRestrictDataHolderIterator>(std::move(result), + std::move(data)); +} + +std::unique_ptr<DocHitInfoIterator> +DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::unique_ptr<DocHitInfoIterator> iterator, SectionRestrictData* data) { + ChildrenMapper mapper; + mapper = [&data, &mapper](std::unique_ptr<DocHitInfoIterator> iterator) + -> std::unique_ptr<DocHitInfoIterator> { + if (iterator->is_leaf()) { + return std::make_unique<DocHitInfoIteratorSectionRestrict>( + std::move(iterator), data); + } else { + iterator->MapChildren(mapper); + return iterator; + } + }; + return mapper(std::move(iterator)); +} libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() { doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; while (delegate_->Advance().ok()) { DocumentId document_id = delegate_->doc_hit_info().document_id(); - SectionIdMask section_id_mask = - delegate_->doc_hit_info().hit_section_ids_mask(); - - auto data_optional = document_store_.GetAliveDocumentFilterData( - document_id, current_time_ms_); + auto data_optional = data_->document_store().GetAliveDocumentFilterData( + document_id, data_->current_time_ms()); if (!data_optional) { // Ran into some error retrieving information on this hit, skip continue; @@ -63,34 +159,35 @@ libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() { // Guaranteed that the DocumentFilterData exists at this point SchemaTypeId schema_type_id 
= data_optional.value().schema_type_id(); - - // A hit can be in multiple sections at once, need to check which of the - // section ids match the target sections - while (section_id_mask != 0) { - // There was a hit in this section id - SectionId section_id = __builtin_ctzll(section_id_mask); - - auto section_metadata_or = - schema_store_.GetSectionMetadata(schema_type_id, section_id); - - if (section_metadata_or.ok()) { - const SectionMetadata* section_metadata = - section_metadata_or.ValueOrDie(); - - if (target_sections_.find(section_metadata->path) != - target_sections_.end()) { - // The hit was in the target section name, return OK/found - hit_intersect_section_ids_mask_ |= UINT64_C(1) << section_id; - } - } - - // Mark this section as checked - section_id_mask &= ~(UINT64_C(1) << section_id); + auto schema_type_or = data_->schema_store().GetSchemaType(schema_type_id); + if (!schema_type_or.ok()) { + // Ran into error retrieving schema type, skip + continue; } + const std::string* schema_type = std::move(schema_type_or).ValueOrDie(); + SectionIdMask allowed_sections_mask = + data_->ComputeAllowedSectionsMask(*schema_type); - if (hit_intersect_section_ids_mask_ != kSectionIdMaskNone) { + // A hit can be in multiple sections at once, need to check which of the + // section ids match the sections allowed by type_property_masks_. This can + // be done by doing a bitwise and of the section_id_mask in the doc hit and + // the allowed_sections_mask. + SectionIdMask section_id_mask = + delegate_->doc_hit_info().hit_section_ids_mask() & + allowed_sections_mask; + + // Return this document if: + // - the sectionIdMask is not empty after applying property filters, or + // - no property filters apply for its schema type (allowed_sections_mask + // == kSectionIdMaskAll). 
This is needed to ensure that in case of empty + // query (which uses doc-hit-info-iterator-all-document-id), where + // section_id_mask is kSectionIdMaskNone, doc hits with no property + // restrictions don't get filtered out. Doc hits for schema types for + // whom property filters are specified will still get filtered out. + if (allowed_sections_mask == kSectionIdMaskAll || + section_id_mask != kSectionIdMaskNone) { doc_hit_info_ = delegate_->doc_hit_info(); - doc_hit_info_.set_hit_section_ids_mask(hit_intersect_section_ids_mask_); + doc_hit_info_.set_hit_section_ids_mask(section_id_mask); return libtextclassifier3::Status::OK; } // Didn't find a matching section name for this hit. Continue. @@ -104,30 +201,46 @@ libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode> DocHitInfoIteratorSectionRestrict::TrimRightMostNode() && { ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_delegate, std::move(*delegate_).TrimRightMostNode()); + // TrimRightMostNode is only used by suggestion processor to process query + // expression, so an entry for wildcard should always be present in + // type_property_filters_ when code flow reaches here. If the InternalError + // below is returned, that means TrimRightMostNode hasn't been called in the + // right context. + const auto it = data_->type_property_filters().find("*"); + if (it == data_->type_property_filters().end()) { + return absl_ports::InternalError( + "A wildcard entry should always be present in type property filters " + "whenever TrimRightMostNode() is called for " + "DocHitInfoIteratorSectionRestrict"); + } + const std::set<std::string>& target_sections = it->second; + if (target_sections.empty()) { + return absl_ports::InternalError( + "Target sections should not be empty whenever TrimRightMostNode() is " + "called for DocHitInfoIteratorSectionRestrict"); + } if (trimmed_delegate.iterator_ == nullptr) { // TODO(b/228240987): Update TrimmedNode and downstream code to handle // multiple section restricts. 
- trimmed_delegate.target_section_ = std::move(*target_sections_.begin()); + trimmed_delegate.target_section_ = std::move(*target_sections.begin()); return trimmed_delegate; } trimmed_delegate.iterator_ = - std::make_unique<DocHitInfoIteratorSectionRestrict>( - std::move(trimmed_delegate.iterator_), &document_store_, - &schema_store_, std::move(target_sections_), current_time_ms_); + std::unique_ptr<DocHitInfoIteratorSectionRestrict>( + new DocHitInfoIteratorSectionRestrict( + std::move(trimmed_delegate.iterator_), std::move(data_))); return std::move(trimmed_delegate); } -int32_t DocHitInfoIteratorSectionRestrict::GetNumBlocksInspected() const { - return delegate_->GetNumBlocksInspected(); -} - -int32_t DocHitInfoIteratorSectionRestrict::GetNumLeafAdvanceCalls() const { - return delegate_->GetNumLeafAdvanceCalls(); -} - std::string DocHitInfoIteratorSectionRestrict::ToString() const { - return absl_ports::StrCat("(", absl_ports::StrJoin(target_sections_, ","), - "): ", delegate_->ToString()); + std::string output = ""; + for (auto it = data_->type_property_filters().cbegin(); + it != data_->type_property_filters().cend(); it++) { + std::string paths = absl_ports::StrJoin(it->second, ","); + output += (it->first) + ":" + (paths) + "; "; + } + std::string result = "{" + output.substr(0, output.size() - 2) + "}: "; + return absl_ports::StrCat(result, delegate_->ToString()); } } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h index 58dd120..387ff52 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h @@ -17,12 +17,18 @@ #include <cstdint> #include <memory> +#include <set> #include <string> -#include <string_view> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include 
"icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/iterator/section-restrict-data.h" +#include "icing/proto/search.pb.h" #include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" #include "icing/store/document-store.h" namespace icing { @@ -35,30 +41,48 @@ namespace lib { // That class is meant to be applied to the root of a query tree and filter over // all results at the end. This class is more used in the limited scope of a // term or a small group of terms. -class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator { +class DocHitInfoIteratorSectionRestrict : public DocHitInfoLeafIterator { public: // Does not take any ownership, and all pointers must refer to valid objects // that outlive the one constructed. explicit DocHitInfoIteratorSectionRestrict( - std::unique_ptr<DocHitInfoIterator> delegate, + std::unique_ptr<DocHitInfoIterator> delegate, SectionRestrictData* data); + + // Methods that apply section restrictions to all DocHitInfoLeafIterator nodes + // inside the provided iterator tree, and return the root of the tree + // afterwards. These methods do not take any ownership for the raw pointer + // parameters, which must refer to valid objects that outlive the iterator + // returned. 
+ static std::unique_ptr<DocHitInfoIterator> ApplyRestrictions( + std::unique_ptr<DocHitInfoIterator> iterator, const DocumentStore* document_store, const SchemaStore* schema_store, std::set<std::string> target_sections, int64_t current_time_ms); + static std::unique_ptr<DocHitInfoIterator> ApplyRestrictions( + std::unique_ptr<DocHitInfoIterator> iterator, + const DocumentStore* document_store, const SchemaStore* schema_store, + const SearchSpecProto& search_spec, int64_t current_time_ms); + static std::unique_ptr<DocHitInfoIterator> ApplyRestrictions( + std::unique_ptr<DocHitInfoIterator> iterator, SectionRestrictData* data); libtextclassifier3::Status Advance() override; libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override; - - int32_t GetNumLeafAdvanceCalls() const override; + CallStats GetCallStats() const override { return delegate_->GetCallStats(); } std::string ToString() const override; - // Note that the DocHitInfoIteratorSectionRestrict is the only iterator that - // should set filtering_section_mask, hence the received - // filtering_section_mask is ignored and the filtering_section_mask passed to - // the delegate will be set to hit_intersect_section_ids_mask_. This will - // allow to filter the matching sections in the delegate. + // Note that the DocHitInfoIteratorSectionRestrict can only be applied at + // DocHitInfoLeafIterator, which can be a term iterator or another + // DocHitInfoIteratorSectionRestrict. + // + // To filter the matching sections, filtering_section_mask should be set to + // doc_hit_info_.hit_section_ids_mask() held in the outermost + // DocHitInfoIteratorSectionRestrict, which is equal to the intersection of + // all hit_section_ids_mask in the DocHitInfoIteratorSectionRestrict chain, + // since for any two section restrict iterators chained together, the outer + // one's hit_section_ids_mask is always a subset of the inner one's + // hit_section_ids_mask. 
void PopulateMatchedTermsStats( std::vector<TermMatchInfo>* matched_terms_stats, SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { @@ -68,16 +92,14 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator { } delegate_->PopulateMatchedTermsStats( matched_terms_stats, - /*filtering_section_mask=*/hit_intersect_section_ids_mask_); + /*filtering_section_mask=*/filtering_section_mask & + doc_hit_info_.hit_section_ids_mask()); } private: std::unique_ptr<DocHitInfoIterator> delegate_; - const DocumentStore& document_store_; - const SchemaStore& schema_store_; - - std::set<std::string> target_sections_; - int64_t current_time_ms_; + // Does not own. + SectionRestrictData* data_; }; } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index c765e6d..ee65fe1 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -15,6 +15,7 @@ #include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h" #include <memory> +#include <set> #include <string> #include <utility> #include <vector> @@ -101,13 +102,14 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + 
DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } @@ -149,48 +151,50 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi"); - original_iterator->set_hit_intersect_section_ids_mask( - original_section_id_mask); + original_iterator->set_hit_section_ids_mask(original_section_id_mask); // Filtering for the indexed section name (which has a section id of 0) should // get a result. - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_0}, - fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_0}, + fake_clock_.GetSystemTimeMilliseconds()); std::vector<TermMatchInfo> matched_terms_stats; - section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats); EXPECT_THAT(matched_terms_stats, IsEmpty()); - ICING_EXPECT_OK(section_restrict_iterator.Advance()); - EXPECT_THAT(section_restrict_iterator.doc_hit_info().document_id(), + ICING_EXPECT_OK(section_restrict_iterator->Advance()); + EXPECT_THAT(section_restrict_iterator->doc_hit_info().document_id(), Eq(document_id)); SectionIdMask expected_section_id_mask = 0b00000001; // hits in sections 0 - EXPECT_EQ(section_restrict_iterator.hit_intersect_section_ids_mask(), + EXPECT_EQ(section_restrict_iterator->doc_hit_info().hit_section_ids_mask(), expected_section_id_mask); - section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats); 
std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map = {{0, 1}}; EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( "hi", expected_section_ids_tf_map))); - EXPECT_FALSE(section_restrict_iterator.Advance().ok()); + EXPECT_FALSE(section_restrict_iterator->Advance().ok()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) { std::unique_ptr<DocHitInfoIterator> original_iterator_empty = std::make_unique<DocHitInfoIteratorDummy>(); - DocHitInfoIteratorSectionRestrict filtered_iterator( - std::move(original_iterator_empty), document_store_.get(), - schema_store_.get(), /*target_sections=*/{}, - fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> filtered_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator_empty), document_store_.get(), + schema_store_.get(), /*target_sections=*/std::set<std::string>(), + fake_clock_.GetSystemTimeMilliseconds()); - EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); + EXPECT_THAT(GetDocumentIds(filtered_iterator.get()), IsEmpty()); std::vector<TermMatchInfo> matched_terms_stats; - filtered_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + filtered_iterator->PopulateMatchedTermsStats(&matched_terms_stats); EXPECT_THAT(matched_terms_stats, IsEmpty()); } @@ -209,12 +213,14 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, IncludesHitWithMatchingSection) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); // Filtering for the indexed section name should get a result - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_0}, - fake_clock_.GetSystemTimeMilliseconds()); - - EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + 
std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_0}, + fake_clock_.GetSystemTimeMilliseconds()); + + EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), ElementsAre(document_id)); } @@ -235,18 +241,18 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); // Filter for both target_sections - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_0, indexed_section_1}, - fake_clock_.GetSystemTimeMilliseconds()); - - ICING_ASSERT_OK(section_restrict_iterator.Advance()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_0, indexed_section_1}, + fake_clock_.GetSystemTimeMilliseconds()); + + ICING_ASSERT_OK(section_restrict_iterator->Advance()); std::vector<SectionId> expected_section_ids = {kIndexedSectionId0, kIndexedSectionId1}; - EXPECT_THAT(section_restrict_iterator.doc_hit_info(), + EXPECT_THAT(section_restrict_iterator->doc_hit_info(), EqualsDocHitInfo(document_id, expected_section_ids)); - EXPECT_THAT(section_restrict_iterator.hit_intersect_section_ids_mask(), - Eq(section_id_mask)); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -266,17 +272,17 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); // Filter for both target_sections - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_1}, - fake_clock_.GetSystemTimeMilliseconds()); - - ICING_ASSERT_OK(section_restrict_iterator.Advance()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + 
DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_1}, + fake_clock_.GetSystemTimeMilliseconds()); + + ICING_ASSERT_OK(section_restrict_iterator->Advance()); std::vector<SectionId> expected_section_ids = {kIndexedSectionId1}; - EXPECT_THAT(section_restrict_iterator.doc_hit_info(), + EXPECT_THAT(section_restrict_iterator->doc_hit_info(), EqualsDocHitInfo(document_id, expected_section_ids)); - EXPECT_THAT(section_restrict_iterator.hit_intersect_section_ids_mask(), - Eq(1U << kIndexedSectionId1)); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -295,17 +301,17 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); // Filter for both target_sections - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_0, indexed_section_1}, - fake_clock_.GetSystemTimeMilliseconds()); - - ICING_ASSERT_OK(section_restrict_iterator.Advance()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_0, indexed_section_1}, + fake_clock_.GetSystemTimeMilliseconds()); + + ICING_ASSERT_OK(section_restrict_iterator->Advance()); std::vector<SectionId> expected_section_ids = {kIndexedSectionId1}; - EXPECT_THAT(section_restrict_iterator.doc_hit_info(), + EXPECT_THAT(section_restrict_iterator->doc_hit_info(), EqualsDocHitInfo(document_id, expected_section_ids)); - EXPECT_THAT(section_restrict_iterator.hit_intersect_section_ids_mask(), - Eq(1U << kIndexedSectionId1)); } TEST_F(DocHitInfoIteratorSectionRestrictTest, NoMatchingDocumentFilterData) { @@ -316,13 +322,15 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, 
NoMatchingDocumentFilterData) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); // Filtering for the indexed section name should get a result - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds()); - EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty()); std::vector<TermMatchInfo> matched_terms_stats; - section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats); EXPECT_THAT(matched_terms_stats, IsEmpty()); } @@ -342,14 +350,16 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); // Filtering for the indexed section name should get a result - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{"some_section_name"}, - fake_clock_.GetSystemTimeMilliseconds()); - - EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{"some_section_name"}, + fake_clock_.GetSystemTimeMilliseconds()); + + EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty()); std::vector<TermMatchInfo> matched_terms_stats; - section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + 
section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats); EXPECT_THAT(matched_terms_stats, IsEmpty()); } @@ -367,14 +377,16 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_0}, - fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_0}, + fake_clock_.GetSystemTimeMilliseconds()); - EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty()); std::vector<TermMatchInfo> matched_terms_stats; - section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats); EXPECT_THAT(matched_terms_stats, IsEmpty()); } @@ -390,42 +402,42 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, // Create a hit that exists in a different section, so it shouldn't match any // section filters std::vector<DocHitInfo> doc_hit_infos = { - DocHitInfo(document_id, kSectionIdMaskNone << not_matching_section_id)}; + DocHitInfo(document_id, UINT64_C(1) << not_matching_section_id)}; std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{indexed_section_0}, - fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + 
DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{indexed_section_0}, + fake_clock_.GetSystemTimeMilliseconds()); - EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty()); std::vector<TermMatchInfo> matched_terms_stats; - section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats); EXPECT_THAT(matched_terms_stats, IsEmpty()); } -TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumBlocksInspected) { +TEST_F(DocHitInfoIteratorSectionRestrictTest, GetCallStats) { + DocHitInfoIterator::CallStats original_call_stats( + /*num_leaf_advance_calls_lite_index_in=*/2, + /*num_leaf_advance_calls_main_index_in=*/5, + /*num_leaf_advance_calls_integer_index_in=*/3, + /*num_leaf_advance_calls_no_index_in=*/1, + /*num_blocks_inspected_in=*/4); // arbitrary value auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>(); - original_iterator->SetNumBlocksInspected(5); + original_iterator->SetCallStats(original_call_stats); - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), + /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds()); - EXPECT_THAT(section_restrict_iterator.GetNumBlocksInspected(), Eq(5)); -} - -TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumLeafAdvanceCalls) { - auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>(); - original_iterator->SetNumLeafAdvanceCalls(6); - - DocHitInfoIteratorSectionRestrict 
section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds()); - - EXPECT_THAT(section_restrict_iterator.GetNumLeafAdvanceCalls(), Eq(6)); + EXPECT_THAT(section_restrict_iterator->GetCallStats(), + Eq(original_call_stats)); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -443,12 +455,10 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, // Anything that's not 0, which is the indexed property SectionId not_matching_section_id = 2; - // Build an interator tree like: - // Restrict - // | + // Build an iterator tree like: // AND // / \ - // [1, 1],[2, 2] [3, 2] + // [1, 1],[2, 2] [3, 2] std::vector<DocHitInfo> left_infos = { DocHitInfo(document_id1, 1U << matching_section_id), DocHitInfo(document_id2, 1U << not_matching_section_id)}; @@ -459,14 +469,21 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, std::make_unique<DocHitInfoIteratorDummy>(left_infos); std::unique_ptr<DocHitInfoIterator> right_iterator = std::make_unique<DocHitInfoIteratorDummy>(right_infos, "term", 10); - std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorAnd>(std::move(left_iterator), std::move(right_iterator)); - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - {indexed_section_0}, fake_clock_.GetSystemTimeMilliseconds()); + // After applying section restriction: + // AND + // / \ + // Restrict Restrict + // | | + // [1, 1],[2, 2] [3, 2] + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), {indexed_section_0}, + fake_clock_.GetSystemTimeMilliseconds()); // The trimmed tree. 
// Restrict @@ -474,12 +491,12 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, // [1, 1],[2, 2] ICING_ASSERT_OK_AND_ASSIGN( DocHitInfoIterator::TrimmedNode node, - std::move(section_restrict_iterator).TrimRightMostNode()); + std::move(*section_restrict_iterator).TrimRightMostNode()); EXPECT_THAT(GetDocumentIds(node.iterator_.get()), ElementsAre(document_id1)); EXPECT_THAT(node.term_, Eq("term")); EXPECT_THAT(node.term_start_index_, Eq(10)); - EXPECT_THAT(node.target_section_, Eq("")); + EXPECT_THAT(node.target_section_, Eq(indexed_section_0)); } TEST_F(DocHitInfoIteratorSectionRestrictTest, TrimSectionRestrictIterator) { @@ -504,14 +521,16 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, TrimSectionRestrictIterator) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_infos, "term", 10); - DocHitInfoIteratorSectionRestrict section_restrict_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - {indexed_section_0}, fake_clock_.GetSystemTimeMilliseconds()); + std::unique_ptr<DocHitInfoIterator> section_restrict_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(original_iterator), document_store_.get(), + schema_store_.get(), {indexed_section_0}, + fake_clock_.GetSystemTimeMilliseconds()); // The trimmed tree has null iterator but has target section. 
ICING_ASSERT_OK_AND_ASSIGN( DocHitInfoIterator::TrimmedNode node, - std::move(section_restrict_iterator).TrimRightMostNode()); + std::move(*section_restrict_iterator).TrimRightMostNode()); EXPECT_THAT(node.iterator_, testing::IsNull()); EXPECT_THAT(node.term_, Eq("term")); diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h index a77b91c..c75fb33 100644 --- a/icing/index/iterator/doc-hit-info-iterator-test-util.h +++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h @@ -71,7 +71,7 @@ class DocHitInfoTermFrequencyPair { // will then proceed to return the doc_hit_infos in order as Advance's are // called. After all doc_hit_infos are returned, Advance will return a NotFound // error (also like normal DocHitInfoIterators). -class DocHitInfoIteratorDummy : public DocHitInfoIterator { +class DocHitInfoIteratorDummy : public DocHitInfoLeafIterator { public: DocHitInfoIteratorDummy() = default; explicit DocHitInfoIteratorDummy( @@ -140,25 +140,14 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator { matched_terms_stats->push_back(term_stats); } - void set_hit_intersect_section_ids_mask( - SectionIdMask hit_intersect_section_ids_mask) { - hit_intersect_section_ids_mask_ = hit_intersect_section_ids_mask; + void set_hit_section_ids_mask(SectionIdMask hit_section_ids_mask) { + doc_hit_info_.set_hit_section_ids_mask(hit_section_ids_mask); } - int32_t GetNumBlocksInspected() const override { - return num_blocks_inspected_; - } - - void SetNumBlocksInspected(int32_t num_blocks_inspected) { - num_blocks_inspected_ = num_blocks_inspected; - } - - int32_t GetNumLeafAdvanceCalls() const override { - return num_leaf_advance_calls_; - } + CallStats GetCallStats() const override { return call_stats_; } - void SetNumLeafAdvanceCalls(int32_t num_leaf_advance_calls) { - num_leaf_advance_calls_ = num_leaf_advance_calls; + void SetCallStats(CallStats call_stats) { + call_stats_ = std::move(call_stats); } 
std::string ToString() const override { @@ -176,8 +165,7 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator { private: int32_t index_ = -1; - int32_t num_blocks_inspected_ = 0; - int32_t num_leaf_advance_calls_ = 0; + CallStats call_stats_; std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos_; std::string term_; int term_start_index_; diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h index d8cd3ad..728f957 100644 --- a/icing/index/iterator/doc-hit-info-iterator.h +++ b/icing/index/iterator/doc-hit-info-iterator.h @@ -17,8 +17,12 @@ #include <array> #include <cstdint> +#include <functional> +#include <memory> #include <string> #include <string_view> +#include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -52,8 +56,7 @@ struct TermMatchInfo { // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order. // -// NOTE: You must call Advance() before calling hit_info() or -// hit_intersect_section_ids_mask(). +// NOTE: You must call Advance() before calling hit_info(). // // Example: // DocHitInfoIterator itr = GetIterator(...); @@ -62,6 +65,112 @@ struct TermMatchInfo { // } class DocHitInfoIterator { public: + using ChildrenMapper = std::function<std::unique_ptr<DocHitInfoIterator>( + std::unique_ptr<DocHitInfoIterator>)>; + + // CallStats is a wrapper class of all stats to collect among all levels of + // the DocHitInfoIterator tree. Mostly the internal nodes will aggregate the + // number of all leaf nodes, while the leaf nodes will return the actual + // numbers. + struct CallStats { + // The number of times Advance() was called on the leaf node for term lite + // index. + // - Leaf nodes: + // - DocHitInfoIteratorTermLite should maintain and set it correctly. + // - Others should set it 0. + // - Internal nodes: should aggregate values from all children. 
+ int32_t num_leaf_advance_calls_lite_index; + + // The number of times Advance() was called on the leaf node for term main + // index. + // - Leaf nodes: + // - DocHitInfoIteratorTermMain should maintain and set it correctly. + // - Others should set it 0. + // - Internal nodes: should aggregate values from all children. + int32_t num_leaf_advance_calls_main_index; + + // The number of times Advance() was called on the leaf node for integer + // index. + // - Leaf nodes: + // - DocHitInfoIteratorNumeric should maintain and set it correctly. + // - Others should set it 0. + // - Internal nodes: should aggregate values from all children. + int32_t num_leaf_advance_calls_integer_index; + + // The number of times Advance() was called on the leaf node without reading + // any hits from index. Usually it is a special field for + // DocHitInfoIteratorAllDocumentId. + // - Leaf nodes: + // - DocHitInfoIteratorAllDocumentId should maintain and set it correctly. + // - Others should set it 0. + // - Internal nodes: should aggregate values from all children. + int32_t num_leaf_advance_calls_no_index; + + // The number of flash index blocks that have been read as a result of + // operations on this object. + // - Leaf nodes: should maintain and set it correctly for all child classes + // involving flash index block access. + // - Internal nodes: should aggregate values from all children. 
+ int32_t num_blocks_inspected; + + explicit CallStats() + : CallStats(/*num_leaf_advance_calls_lite_index_in=*/0, + /*num_leaf_advance_calls_main_index_in=*/0, + /*num_leaf_advance_calls_integer_index_in=*/0, + /*num_leaf_advance_calls_no_index_in=*/0, + /*num_blocks_inspected_in=*/0) {} + + explicit CallStats(int32_t num_leaf_advance_calls_lite_index_in, + int32_t num_leaf_advance_calls_main_index_in, + int32_t num_leaf_advance_calls_integer_index_in, + int32_t num_leaf_advance_calls_no_index_in, + int32_t num_blocks_inspected_in) + : num_leaf_advance_calls_lite_index( + num_leaf_advance_calls_lite_index_in), + num_leaf_advance_calls_main_index( + num_leaf_advance_calls_main_index_in), + num_leaf_advance_calls_integer_index( + num_leaf_advance_calls_integer_index_in), + num_leaf_advance_calls_no_index(num_leaf_advance_calls_no_index_in), + num_blocks_inspected(num_blocks_inspected_in) {} + + int32_t num_leaf_advance_calls() const { + return num_leaf_advance_calls_lite_index + + num_leaf_advance_calls_main_index + + num_leaf_advance_calls_integer_index + + num_leaf_advance_calls_no_index; + } + + bool operator==(const CallStats& other) const { + return num_leaf_advance_calls_lite_index == + other.num_leaf_advance_calls_lite_index && + num_leaf_advance_calls_main_index == + other.num_leaf_advance_calls_main_index && + num_leaf_advance_calls_integer_index == + other.num_leaf_advance_calls_integer_index && + num_leaf_advance_calls_no_index == + other.num_leaf_advance_calls_no_index && + num_blocks_inspected == other.num_blocks_inspected; + } + + CallStats operator+(const CallStats& other) const { + return CallStats(num_leaf_advance_calls_lite_index + + other.num_leaf_advance_calls_lite_index, + num_leaf_advance_calls_main_index + + other.num_leaf_advance_calls_main_index, + num_leaf_advance_calls_integer_index + + other.num_leaf_advance_calls_integer_index, + num_leaf_advance_calls_no_index + + other.num_leaf_advance_calls_no_index, + num_blocks_inspected + 
other.num_blocks_inspected); + } + + CallStats& operator+=(const CallStats& other) { + *this = *this + other; + return *this; + } + }; + struct TrimmedNode { // the query results which we should only search for suggestion in these // documents. @@ -100,6 +209,11 @@ class DocHitInfoIterator { // INVALID_ARGUMENT if the right-most node is not suppose to be trimmed. virtual libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && = 0; + // Map all direct children of this iterator according to the passed mapper. + virtual void MapChildren(const ChildrenMapper& mapper) = 0; + + virtual bool is_leaf() { return false; } + virtual ~DocHitInfoIterator() = default; // Returns: @@ -114,20 +228,8 @@ class DocHitInfoIterator { // construction or if Advance returned an error. const DocHitInfo& doc_hit_info() const { return doc_hit_info_; } - // SectionIdMask representing which sections (if any) have matched *ALL* query - // terms for the current document_id. - SectionIdMask hit_intersect_section_ids_mask() const { - return hit_intersect_section_ids_mask_; - } - - // Gets the number of flash index blocks that have been read as a - // result of operations on this object. - virtual int32_t GetNumBlocksInspected() const = 0; - - // HitIterators may be constructed into trees. Internal nodes will return the - // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will - // return the number of times Advance() was called on it. - virtual int32_t GetNumLeafAdvanceCalls() const = 0; + // Returns CallStats of the DocHitInfoIterator tree. + virtual CallStats GetCallStats() const = 0; // A string representing the iterator. virtual std::string ToString() const = 0; @@ -145,7 +247,6 @@ class DocHitInfoIterator { protected: DocHitInfo doc_hit_info_; - SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone; // Helper function to advance the given iterator to at most the given // document_id. 
@@ -160,11 +261,20 @@ class DocHitInfoIterator { // Didn't find anything for the other iterator, reset to invalid values and // return. doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } -}; // namespace DocHitInfoIterator +}; + +// A leaf node is a term node or a chain of section restriction node applied on +// a term node. +class DocHitInfoLeafIterator : public DocHitInfoIterator { + public: + bool is_leaf() override { return true; } + + // Calling MapChildren on leaf node does not make sense, and will do nothing. + void MapChildren(const ChildrenMapper& mapper) override {} +}; } // namespace lib } // namespace icing diff --git a/icing/index/iterator/section-restrict-data.cc b/icing/index/iterator/section-restrict-data.cc new file mode 100644 index 0000000..085437d --- /dev/null +++ b/icing/index/iterator/section-restrict-data.cc @@ -0,0 +1,82 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/iterator/section-restrict-data.h" + +#include <set> +#include <string> +#include <unordered_map> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" + +namespace icing { +namespace lib { + +SectionIdMask SectionRestrictData::GenerateSectionMask( + const std::string& schema_type, + const std::set<std::string>& target_sections) const { + SectionIdMask section_mask = kSectionIdMaskNone; + auto section_metadata_list = schema_store_.GetSectionMetadata(schema_type); + if (!section_metadata_list.ok()) { + // The current schema doesn't have section metadata. + return kSectionIdMaskNone; + } + for (const SectionMetadata& section_metadata : + *section_metadata_list.ValueOrDie()) { + if (target_sections.find(section_metadata.path) != target_sections.end()) { + section_mask |= UINT64_C(1) << section_metadata.id; + } + } + return section_mask; +} + +SectionIdMask SectionRestrictData::ComputeAllowedSectionsMask( + const std::string& schema_type) { + if (const auto type_property_mask_itr = + type_property_masks_.find(schema_type); + type_property_mask_itr != type_property_masks_.end()) { + return type_property_mask_itr->second; + } + + // Section id mask of schema_type is never calculated before, so + // calculate it here and put it into type_property_masks_. + // - If type property filters of schema_type or wildcard (*) are + // specified, then create a mask according to the filters. + // - Otherwise, create a mask to match all properties. 
+ SectionIdMask new_section_id_mask = kSectionIdMaskAll; + if (const auto itr = type_property_filters_.find(schema_type); + itr != type_property_filters_.end()) { + // Property filters defined for given schema type + new_section_id_mask = GenerateSectionMask(schema_type, itr->second); + } else if (const auto wildcard_itr = type_property_filters_.find( + std::string(SchemaStore::kSchemaTypeWildcard)); + wildcard_itr != type_property_filters_.end()) { + // Property filters defined for wildcard entry + new_section_id_mask = + GenerateSectionMask(schema_type, wildcard_itr->second); + } else { + // Do not cache the section mask if no property filters apply to this schema + // type to avoid taking up unnecessary space. + return kSectionIdMaskAll; + } + + type_property_masks_[schema_type] = new_section_id_mask; + return new_section_id_mask; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/iterator/section-restrict-data.h b/icing/index/iterator/section-restrict-data.h new file mode 100644 index 0000000..26ca597 --- /dev/null +++ b/icing/index/iterator/section-restrict-data.h @@ -0,0 +1,98 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_ITERATOR_SECTION_RESTRICT_DATA_H_ +#define ICING_INDEX_ITERATOR_SECTION_RESTRICT_DATA_H_ + +#include <cstdint> +#include <set> +#include <string> +#include <unordered_map> +#include <utility> + +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-store.h" + +namespace icing { +namespace lib { + +class SectionRestrictData { + public: + // Does not take any ownership, and all pointers must refer to valid objects + // that outlive the one constructed. + SectionRestrictData(const DocumentStore* document_store, + const SchemaStore* schema_store, int64_t current_time_ms, + std::unordered_map<std::string, std::set<std::string>> + type_property_filters) + : document_store_(*document_store), + schema_store_(*schema_store), + current_time_ms_(current_time_ms), + type_property_filters_(std::move(type_property_filters)) {} + + // Calculates the section mask of allowed sections(determined by the + // property filters map) for the given schema type and caches the same for any + // future calls. + // + // Returns: + // - If type_property_filters_ has an entry for the given schema type or + // wildcard(*), return a bitwise or of section IDs in the schema type + // that that are also present in the relevant filter list. + // - Otherwise, return kSectionIdMaskAll. + SectionIdMask ComputeAllowedSectionsMask(const std::string& schema_type); + + const DocumentStore& document_store() const { return document_store_; } + + const SchemaStore& schema_store() const { return schema_store_; } + + int64_t current_time_ms() const { return current_time_ms_; } + + const std::unordered_map<std::string, std::set<std::string>>& + type_property_filters() const { + return type_property_filters_; + } + + private: + const DocumentStore& document_store_; + const SchemaStore& schema_store_; + int64_t current_time_ms_; + + // Map of property filters per schema type. 
Supports wildcard(*) for schema + // type that will apply to all schema types that are not specifically + // specified in the mapping otherwise. + std::unordered_map<std::string, std::set<std::string>> type_property_filters_; + // Mapping of schema type to the section mask of allowed sections for that + // schema type. This section mask is lazily calculated based on the + // specified property filters and cached for any future use. + std::unordered_map<std::string, SectionIdMask> type_property_masks_; + + // Generates a section mask for the given schema type and the target + // sections. + // + // Returns: + // - A bitwise or of section IDs in the schema_type that that are also + // present in the target_sections list. + // - If none of the sections in the schema_type are present in the + // target_sections list, return kSectionIdMaskNone. + // This is done by doing a bitwise or of the target section ids for the + // given schema type. + SectionIdMask GenerateSectionMask( + const std::string& schema_type, + const std::set<std::string>& target_sections) const; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_ITERATOR_SECTION_RESTRICT_DATA_H_ diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index acf3b33..21eecb6 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -65,12 +65,11 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() { // Nothing more for the iterator to return. Set these members to invalid // values. 
doc_hit_info_ = DocHitInfo(); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } + ++num_advance_calls_; doc_hit_info_ = cached_hits_.at(cached_hits_idx_); - hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask(); return libtextclassifier3::Status::OK; } diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h index 873ea89..7facd88 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.h +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h @@ -28,7 +28,7 @@ namespace icing { namespace lib { -class DocHitInfoIteratorTermLite : public DocHitInfoIterator { +class DocHitInfoIteratorTermLite : public DocHitInfoLeafIterator { public: explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec, LiteIndex* lite_index, @@ -51,8 +51,14 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override { return 0; } - int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } + CallStats GetCallStats() const override { + return CallStats( + /*num_leaf_advance_calls_lite_index_in=*/num_advance_calls_, + /*num_leaf_advance_calls_main_index_in=*/0, + /*num_leaf_advance_calls_integer_index_in=*/0, + /*num_leaf_advance_calls_no_index_in=*/0, + /*num_blocks_inspected_in=*/0); + } void PopulateMatchedTermsStats( std::vector<TermMatchInfo>* matched_terms_stats, diff --git a/icing/index/lite/lite-index-header.h b/icing/index/lite/lite-index-header.h index 58379d6..75de8fa 100644 --- a/icing/index/lite/lite-index-header.h +++ b/icing/index/lite/lite-index-header.h @@ -15,6 +15,9 @@ #ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_ #define ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_ +#include <cstddef> +#include <cstdint> + #include 
"icing/legacy/core/icing-string-util.h" #include "icing/store/document-id.h" @@ -50,7 +53,14 @@ class LiteIndex_Header { class LiteIndex_HeaderImpl : public LiteIndex_Header { public: struct HeaderData { - static const uint32_t kMagic = 0xb4fb8792; + static uint32_t GetCurrentMagic( + bool include_property_existence_metadata_hits) { + if (!include_property_existence_metadata_hits) { + return 0x01c61418; + } else { + return 0x56e07d5b; + } + } uint32_t lite_index_crc; uint32_t magic; @@ -66,10 +76,15 @@ class LiteIndex_HeaderImpl : public LiteIndex_Header { uint32_t searchable_end; }; - explicit LiteIndex_HeaderImpl(HeaderData *hdr) : hdr_(hdr) {} + explicit LiteIndex_HeaderImpl(HeaderData *hdr, + bool include_property_existence_metadata_hits) + : hdr_(hdr), + include_property_existence_metadata_hits_( + include_property_existence_metadata_hits) {} bool check_magic() const override { - return hdr_->magic == HeaderData::kMagic; + return hdr_->magic == HeaderData::GetCurrentMagic( + include_property_existence_metadata_hits_); } uint32_t lite_index_crc() const override { return hdr_->lite_index_crc; } @@ -96,7 +111,8 @@ class LiteIndex_HeaderImpl : public LiteIndex_Header { void Reset() override { hdr_->lite_index_crc = 0; - hdr_->magic = HeaderData::kMagic; + hdr_->magic = + HeaderData::GetCurrentMagic(include_property_existence_metadata_hits_); hdr_->last_added_docid = kInvalidDocumentId; hdr_->cur_size = 0; hdr_->searchable_end = 0; @@ -104,6 +120,7 @@ class LiteIndex_HeaderImpl : public LiteIndex_Header { private: HeaderData *hdr_; + bool include_property_existence_metadata_hits_; }; static_assert(24 == sizeof(LiteIndex_HeaderImpl::HeaderData), "sizeof(HeaderData) != 24"); diff --git a/icing/index/lite/lite-index-options.cc b/icing/index/lite/lite-index-options.cc index 29075f8..7e6c076 100644 --- a/icing/index/lite/lite-index-options.cc +++ b/icing/index/lite/lite-index-options.cc @@ -14,7 +14,13 @@ #include "icing/index/lite/lite-index-options.h" +#include 
<algorithm> +#include <cstddef> +#include <cstdint> +#include <string> + #include "icing/index/lite/term-id-hit-pair.h" +#include "icing/legacy/index/icing-dynamic-trie.h" namespace icing { namespace lib { @@ -63,10 +69,16 @@ IcingDynamicTrie::Options CalculateTrieOptions(uint32_t hit_buffer_size) { } // namespace -LiteIndexOptions::LiteIndexOptions(const std::string& filename_base, - uint32_t hit_buffer_want_merge_bytes) +LiteIndexOptions::LiteIndexOptions( + const std::string& filename_base, uint32_t hit_buffer_want_merge_bytes, + bool hit_buffer_sort_at_indexing, uint32_t hit_buffer_sort_threshold_bytes, + bool include_property_existence_metadata_hits) : filename_base(filename_base), - hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes) { + hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes), + hit_buffer_sort_at_indexing(hit_buffer_sort_at_indexing), + hit_buffer_sort_threshold_bytes(hit_buffer_sort_threshold_bytes), + include_property_existence_metadata_hits( + include_property_existence_metadata_hits) { hit_buffer_size = CalculateHitBufferSize(hit_buffer_want_merge_bytes); lexicon_options = CalculateTrieOptions(hit_buffer_size); display_mappings_options = CalculateTrieOptions(hit_buffer_size); diff --git a/icing/index/lite/lite-index-options.h b/icing/index/lite/lite-index-options.h index ae58802..8b03449 100644 --- a/icing/index/lite/lite-index-options.h +++ b/icing/index/lite/lite-index-options.h @@ -15,6 +15,9 @@ #ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_ #define ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_ +#include <cstdint> +#include <string> + #include "icing/legacy/index/icing-dynamic-trie.h" namespace icing { @@ -27,7 +30,10 @@ struct LiteIndexOptions { // hit_buffer_want_merge_bytes and the logic in CalculateHitBufferSize and // CalculateTrieOptions. 
LiteIndexOptions(const std::string& filename_base, - uint32_t hit_buffer_want_merge_bytes); + uint32_t hit_buffer_want_merge_bytes, + bool hit_buffer_sort_at_indexing, + uint32_t hit_buffer_sort_threshold_bytes, + bool include_property_existence_metadata_hits = false); IcingDynamicTrie::Options lexicon_options; IcingDynamicTrie::Options display_mappings_options; @@ -35,6 +41,9 @@ struct LiteIndexOptions { std::string filename_base; uint32_t hit_buffer_want_merge_bytes = 0; uint32_t hit_buffer_size = 0; + bool hit_buffer_sort_at_indexing = false; + uint32_t hit_buffer_sort_threshold_bytes = 0; + bool include_property_existence_metadata_hits = false; }; } // namespace lib diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index bf54dec..3f9cc93 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -36,6 +36,8 @@ #include "icing/index/hit/doc-hit-info.h" #include "icing/index/hit/hit.h" #include "icing/index/lite/lite-index-header.h" +#include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/term-id-codec.h" #include "icing/index/term-property-id.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/core/icing-timer.h" @@ -44,10 +46,13 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mmapper.h" #include "icing/proto/debug.pb.h" +#include "icing/proto/scoring.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-id.h" +#include "icing/store/suggestion-result-checker.h" #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -160,10 +165,11 @@ libtextclassifier3::Status LiteIndex::Initialize() { } // Set up header. 
- header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size()); + header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size()); header_ = std::make_unique<LiteIndex_HeaderImpl>( reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>( - header_mmap_.address())); + header_mmap_.address()), + options_.include_property_existence_metadata_hits); header_->Reset(); if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true, @@ -175,10 +181,11 @@ libtextclassifier3::Status LiteIndex::Initialize() { UpdateChecksum(); } else { - header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size()); + header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size()); header_ = std::make_unique<LiteIndex_HeaderImpl>( reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>( - header_mmap_.address())); + header_mmap_.address()), + options_.include_property_existence_metadata_hits); if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true, sizeof(TermIdHitPair::Value), header_->cur_size(), @@ -352,6 +359,73 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( return tvi; } +void LiteIndex::ScoreAndAppendFetchedHit( + const Hit& hit, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, + const SuggestionResultChecker* suggestion_result_checker, + DocumentId& last_document_id, bool& is_last_document_desired, + int& total_score_out, std::vector<DocHitInfo>* hits_out, + std::vector<Hit::TermFrequencyArray>* term_frequency_out) const { + // Check sections. + if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) { + return; + } + // Check prefix section only. + if (only_from_prefix_sections && !hit.is_in_prefix_section()) { + return; + } + // Check whether this Hit is desired. + // TODO(b/230553264) Move common logic into helper function once we support + // score term by prefix_hit in lite_index. 
+ DocumentId document_id = hit.document_id(); + bool is_new_document = document_id != last_document_id; + if (is_new_document) { + last_document_id = document_id; + is_last_document_desired = + suggestion_result_checker == nullptr || + suggestion_result_checker->BelongsToTargetResults(document_id, + hit.section_id()); + } + if (!is_last_document_desired) { + // The document is removed or expired or not desired. + return; + } + + // Score the hit by the strategy + switch (score_by) { + case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE: + total_score_out = 1; + break; + case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT: + if (is_new_document) { + ++total_score_out; + } + break; + case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY: + if (hit.has_term_frequency()) { + total_score_out += hit.term_frequency(); + } else { + ++total_score_out; + } + break; + } + + // Append the Hit or update hit section to the output vector. + if (is_new_document && hits_out != nullptr) { + hits_out->push_back(DocHitInfo(document_id)); + if (term_frequency_out != nullptr) { + term_frequency_out->push_back(Hit::TermFrequencyArray()); + } + } + if (hits_out != nullptr) { + hits_out->back().UpdateSection(hit.section_id()); + if (term_frequency_out != nullptr) { + term_frequency_out->back()[hit.section_id()] = hit.term_frequency(); + } + } +} + int LiteIndex::FetchHits( uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, @@ -359,19 +433,38 @@ int LiteIndex::FetchHits( const SuggestionResultChecker* suggestion_result_checker, std::vector<DocHitInfo>* hits_out, std::vector<Hit::TermFrequencyArray>* term_frequency_out) { - int score = 0; - DocumentId last_document_id = kInvalidDocumentId; - // Record whether the last document belongs to the given namespaces. 
- bool is_last_document_desired = false; - - if (NeedSort()) { - // Transition from shared_lock in NeedSort to unique_lock here is safe - // because it doesn't hurt to sort again if sorting was done already by - // another thread after NeedSort is evaluated. NeedSort is called before - // sorting to improve concurrency as threads can avoid acquiring the unique - // lock if no sorting is needed. + bool need_sort_at_querying = false; + { + absl_ports::shared_lock l(&mutex_); + + // We sort here when: + // 1. We don't enable sorting at indexing time (i.e. we sort at querying + // time), and there is an unsorted tail portion. OR + // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, + // regardless of whether or not hit_buffer_sort_at_indexing is enabled. + // This is more of a sanity check. We should not really be encountering + // this case. + need_sort_at_querying = NeedSortAtQuerying(); + } + if (need_sort_at_querying) { absl_ports::unique_lock l(&mutex_); - SortHits(); + IcingTimer timer; + + // Transition from shared_lock to unique_lock is safe here because it + // doesn't hurt to sort again if sorting was done already by another thread + // after need_sort_at_querying is evaluated. + // We check need_sort_at_querying to improve query concurrency as threads + // can avoid acquiring the unique lock if no sorting is needed. + SortHitsImpl(); + + if (options_.hit_buffer_sort_at_indexing) { + // This is the second case for sort. Log as this should be a very rare + // occasion. + ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when " + "hit_buffer_sort_at_indexing is enabled. Sort and " + "merge HitBuffer in " + << timer.Elapsed() * 1000 << " ms."; + } } // This downgrade from an unique_lock to a shared_lock is safe because we're @@ -379,75 +472,72 @@ int LiteIndex::FetchHits( // only in Seek(). 
// Any operations that might execute in between the transition of downgrading // the lock here are guaranteed not to alter the searchable section (or the - // LiteIndex due to a global lock in IcingSearchEngine). + // LiteIndex) due to a global lock in IcingSearchEngine. absl_ports::shared_lock l(&mutex_); - for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) { - TermIdHitPair term_id_hit_pair = - hit_buffer_.array_cast<TermIdHitPair>()[idx]; - if (term_id_hit_pair.term_id() != term_id) break; - - const Hit& hit = term_id_hit_pair.hit(); - // Check sections. - if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) { - continue; - } - // Check prefix section only. - if (only_from_prefix_sections && !hit.is_in_prefix_section()) { - continue; - } - // TODO(b/230553264) Move common logic into helper function once we support - // score term by prefix_hit in lite_index. - // Check whether this Hit is desired. - DocumentId document_id = hit.document_id(); - bool is_new_document = document_id != last_document_id; - if (is_new_document) { - last_document_id = document_id; - is_last_document_desired = - suggestion_result_checker == nullptr || - suggestion_result_checker->BelongsToTargetResults(document_id, - hit.section_id()); - } - if (!is_last_document_desired) { - // The document is removed or expired or not desired. - continue; - } - // Score the hit by the strategy - switch (score_by) { - case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE: - score = 1; - break; - case SuggestionScoringSpecProto::SuggestionRankingStrategy:: - DOCUMENT_COUNT: - if (is_new_document) { - ++score; - } - break; - case SuggestionScoringSpecProto::SuggestionRankingStrategy:: - TERM_FREQUENCY: - if (hit.has_term_frequency()) { - score += hit.term_frequency(); - } else { - ++score; - } - break; - } + // Search in the HitBuffer array for Hits with the corresponding term_id. 
+ // Hits are added in increasing order of doc ids, so hits that get appended + // later have larger docIds. This means that: + // 1. Hits in the unsorted tail will have larger docIds than hits in the + // sorted portion. + // 2. Hits at the end of the unsorted tail will have larger docIds than hits + // in the front of the tail. + // We want to retrieve hits in descending order of docIds. Therefore we should + // search by doing: + // 1. Linear search first in reverse iteration order over the unsorted tail + // portion. + // 2. Followed by binary search on the sorted portion. + const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>(); - // Append the Hit or update hit section to the output vector. - if (is_new_document && hits_out != nullptr) { - hits_out->push_back(DocHitInfo(document_id)); - if (term_frequency_out != nullptr) { - term_frequency_out->push_back(Hit::TermFrequencyArray()); + DocumentId last_document_id = kInvalidDocumentId; + // Record whether the last document belongs to the given namespaces. + bool is_last_document_desired = false; + int total_score = 0; + + // Linear search over unsorted tail in reverse iteration order. + // This should only be performed when hit_buffer_sort_at_indexing is enabled. + // When disabled, the entire HitBuffer should be sorted already and only + // binary search is needed. + if (options_.hit_buffer_sort_at_indexing) { + uint32_t unsorted_length = header_->cur_size() - header_->searchable_end(); + for (uint32_t i = 1; i <= unsorted_length; ++i) { + TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i]; + if (term_id_hit_pair.term_id() == term_id) { + // We've found a matched hit. + const Hit& matched_hit = term_id_hit_pair.hit(); + // Score the hit and add to total_score. Also add the hits and its term + // frequency info to hits_out and term_frequency_out if the two vectors + // are non-null. 
+ ScoreAndAppendFetchedHit(matched_hit, section_id_mask, + only_from_prefix_sections, score_by, + suggestion_result_checker, last_document_id, + is_last_document_desired, total_score, + hits_out, term_frequency_out); } } - if (hits_out != nullptr) { - hits_out->back().UpdateSection(hit.section_id()); - if (term_frequency_out != nullptr) { - term_frequency_out->back()[hit.section_id()] = hit.term_frequency(); - } + } + + // Do binary search over the sorted section and repeat the above steps. + TermIdHitPair target_term_id_hit_pair( + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); + for (const TermIdHitPair* ptr = std::lower_bound( + array, array + header_->searchable_end(), target_term_id_hit_pair); + ptr < array + header_->searchable_end(); ++ptr) { + if (ptr->term_id() != term_id) { + // We've processed all matches. Stop iterating further. + break; } + + const Hit& matched_hit = ptr->hit(); + // Score the hit and add to total_score. Also add the hits and its term + // frequency info to hits_out and term_frequency_out if the two vectors are + // non-null. 
+ ScoreAndAppendFetchedHit( + matched_hit, section_id_mask, only_from_prefix_sections, score_by, + suggestion_result_checker, last_document_id, is_last_document_desired, + total_score, hits_out, term_frequency_out); } - return score; + return total_score; } libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits( @@ -455,9 +545,9 @@ libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits( SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, const SuggestionResultChecker* suggestion_result_checker) { return FetchHits(term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, score_by, - suggestion_result_checker, - /*hits_out=*/nullptr); + /*only_from_prefix_sections=*/false, score_by, + suggestion_result_checker, + /*hits_out=*/nullptr); } bool LiteIndex::is_full() const { @@ -515,7 +605,7 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( return storage_info; } -void LiteIndex::SortHits() { +void LiteIndex::SortHitsImpl() { // Make searchable by sorting by hit buffer. uint32_t sort_len = header_->cur_size() - header_->searchable_end(); if (sort_len <= 0) { @@ -546,25 +636,6 @@ void LiteIndex::SortHits() { UpdateChecksum(); } -uint32_t LiteIndex::Seek(uint32_t term_id) const { - // Binary search for our term_id. Make sure we get the first - // element. Using kBeginSortValue ensures this for the hit value. 
- TermIdHitPair term_id_hit_pair( - term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); - - const TermIdHitPair::Value* array = - hit_buffer_.array_cast<TermIdHitPair::Value>(); - if (header_->searchable_end() != header_->cur_size()) { - ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current " - << "size during Seek(): " - << header_->searchable_end() << " vs " - << header_->cur_size(); - } - const TermIdHitPair::Value* ptr = std::lower_bound( - array, array + header_->searchable_end(), term_id_hit_pair.value()); - return ptr - array; -} - libtextclassifier3::Status LiteIndex::Optimize( const std::vector<DocumentId>& document_id_old_to_new, const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) { @@ -575,7 +646,7 @@ libtextclassifier3::Status LiteIndex::Optimize( } // Sort the hits so that hits with the same term id will be grouped together, // which helps later to determine which terms will be unused after compaction. - SortHits(); + SortHitsImpl(); uint32_t new_size = 0; uint32_t curr_term_id = 0; uint32_t curr_tvi = 0; diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 916a14b..288602a 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -20,6 +20,7 @@ #define ICING_INDEX_LITE_INDEX_H_ #include <cstdint> +#include <iterator> #include <limits> #include <memory> #include <string> @@ -48,7 +49,6 @@ #include "icing/store/document-id.h" #include "icing/store/namespace-id.h" #include "icing/store/suggestion-result-checker.h" -#include "icing/util/bit-util.h" #include "icing/util/crc32.h" namespace icing { @@ -63,6 +63,9 @@ class LiteIndex { // An entry in the hit buffer. using Options = LiteIndexOptions; + // Offset for the LiteIndex_Header in the hit buffer mmap. + static constexpr uint32_t kHeaderFileOffset = 0; + // Updates checksum of subcomponents. 
~LiteIndex(); @@ -152,8 +155,8 @@ class LiteIndex { // Add all hits with term_id from the sections specified in section_id_mask, // skipping hits in non-prefix sections if only_from_prefix_sections is true, // to hits_out. If hits_out is nullptr, no hits will be added. The - // corresponding hit term frequencies will also be added if term_frequency_out - // is nullptr. + // corresponding hit term frequencies will also not be added if + // term_frequency_out is nullptr. // // Only those hits which belongs to the given namespaces will be counted and // fetched. A nullptr namespace checker will disable this check. @@ -181,15 +184,29 @@ class LiteIndex { uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) { absl_ports::shared_lock l(&mutex_); - return sizeLocked(); + return size_impl(); } bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) { absl_ports::shared_lock l(&mutex_); - return is_full() || sizeLocked() >= (options_.hit_buffer_want_merge_bytes / - sizeof(TermIdHitPair::Value)); + return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes / + sizeof(TermIdHitPair::Value)); + } + + // Whether or not the HitBuffer's unsorted tail size exceeds the sort + // threshold. + bool HasUnsortedHitsExceedingSortThreshold() const + ICING_LOCKS_EXCLUDED(mutex_) { + absl_ports::shared_lock l(&mutex_); + return HasUnsortedHitsExceedingSortThresholdImpl(); } + // Sort hits stored in the index. + void SortHits() ICING_LOCKS_EXCLUDED(mutex_) { + absl_ports::unique_lock l(&mutex_); + SortHitsImpl(); + }; + class const_iterator { friend class LiteIndex; @@ -326,17 +343,13 @@ class LiteIndex { // Check if the hit buffer has reached its capacity. bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_); - uint32_t sizeLocked() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { - return header_->cur_size(); - } - // Non-locking implementation for empty(). 
bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { return size_impl() == 0; } // Non-locking implementation for size(). - bool size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { + uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { return header_->cur_size(); } @@ -352,18 +365,48 @@ class LiteIndex { NamespaceId namespace_id) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Whether or not the HitBuffer requires sorting. - bool NeedSort() ICING_LOCKS_EXCLUDED(mutex_) { - absl_ports::shared_lock l(&mutex_); - return header_->cur_size() - header_->searchable_end() > 0; + // We need to sort during querying time when: + // 1. Sorting at indexing time is not enabled and there is an unsorted tail + // section in the HitBuffer. + // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless + // of whether or not hit_buffer_sort_at_indexing is enabled. This is to + // prevent performing sequential search on a large unsorted tail section, + // which would result in bad query performance. + // This is more of a sanity check. We should not really be encountering + // this case. + bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { + return HasUnsortedHitsExceedingSortThresholdImpl() || + (!options_.hit_buffer_sort_at_indexing && + header_->cur_size() - header_->searchable_end() > 0); } - // Sort hits stored in the index. - void SortHits() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl(). + bool HasUnsortedHitsExceedingSortThresholdImpl() const + ICING_SHARED_LOCKS_REQUIRED(mutex_) { + return header_->cur_size() - header_->searchable_end() >= + (options_.hit_buffer_sort_threshold_bytes / + sizeof(TermIdHitPair::Value)); + } - // Returns the position of the first element with term_id, or the searchable - // end of the hit buffer if term_id is not present. 
- uint32_t Seek(uint32_t term_id) const ICING_SHARED_LOCKS_REQUIRED(mutex_); + // Non-locking implementation for SortHits(). + void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Calculates and adds the score for a fetched hit to total_score_out, while + // updating last_document_id (which keeps track of the last added docId so + // far), and is_last_document_desired (which keeps track of whether that last + // added docId belongs to the query's desired namespace.) + // + // Also appends the hit to hits_out and term_frequency_out if the vectors are + // not null. + void ScoreAndAppendFetchedHit( + const Hit& hit, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, + const SuggestionResultChecker* suggestion_result_checker, + DocumentId& last_document_id, bool& is_last_document_desired, + int& total_score_out, std::vector<DocHitInfo>* hits_out, + std::vector<Hit::TermFrequencyArray>* term_frequency_out) const + ICING_SHARED_LOCKS_REQUIRED(mutex_); // File descriptor that points to where the header and hit buffer are written // to. 
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc index 5f141ed..9811fa2 100644 --- a/icing/index/lite/lite-index_test.cc +++ b/icing/index/lite/lite-index_test.cc @@ -14,14 +14,27 @@ #include "icing/index/lite/lite-index.h" +#include <cstdint> +#include <memory> +#include <string> +#include <unordered_map> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" +#include "icing/index/lite/lite-index-header.h" #include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/scoring.pb.h" +#include "icing/proto/term.pb.h" #include "icing/schema/section.h" -#include "icing/store/suggestion-result-checker.h" +#include "icing/store/namespace-id.h" #include "icing/testing/always-false-suggestion-result-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -34,6 +47,8 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; using ::testing::SizeIs; class LiteIndexTest : public testing::Test { @@ -41,62 +56,518 @@ class LiteIndexTest : public testing::Test { void SetUp() override { index_dir_ = GetTestTempDir() + "/test_dir"; ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); - - std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); - ICING_ASSERT_OK_AND_ASSIGN(lite_index_, - LiteIndex::Create(options, &icing_filesystem_)); - - ICING_ASSERT_OK_AND_ASSIGN( - term_id_codec_, - TermIdCodec::Create( - 
IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), - IcingDynamicTrie::max_value_index(options.lexicon_options))); } void TearDown() override { term_id_codec_.reset(); - lite_index_.reset(); ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); } std::string index_dir_; Filesystem filesystem_; IcingFilesystem icing_filesystem_; - std::unique_ptr<LiteIndex> lite_index_; std::unique_ptr<TermIdCodec> term_id_codec_; }; constexpr NamespaceId kNamespace0 = 0; -TEST_F(LiteIndexTest, LiteIndexAppendHits) { +TEST_F(LiteIndexTest, + LiteIndexFetchHits_sortAtQuerying_unsortedHitsBelowSortThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - uint32_t tvi, - lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Add some hits + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, - term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); - Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + Hit 
foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); + + // Check that unsorted hits does not exceed the sort threshold. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + + // Check that hits are unsorted. Persist the data and pread from + // LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); + + // Query the LiteIndex std::vector<DocHitInfo> hits1; - lite_index_->FetchHits( + lite_index->FetchHits( foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, /*namespace_checker=*/nullptr, &hits1); EXPECT_THAT(hits1, SizeIs(1)); - EXPECT_THAT(hits1.back().document_id(), Eq(0)); + EXPECT_THAT(hits1.back().document_id(), Eq(1)); // Check that the hits are coming from section 0 and section 1. 
EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); std::vector<DocHitInfo> hits2; AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; - lite_index_->FetchHits( + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + // Check that hits are sorted after querying LiteIndex. Persist the data and + // pread from LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); +} + +TEST_F(LiteIndexTest, + LiteIndexFetchHits_sortAtIndexing_unsortedHitsBelowSortThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // However note that in these tests we're unable to sort hits after + // indexing, as sorting performed by the string-section-indexing-handler + // after indexing all hits in an entire document, rather than after each + // AddHits() operation. 
+ LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Add some hits + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); + + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); + + // Check that unsorted hits does not exceed the sort threshold. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + + // Check that hits are unsorted. Persist the data and pread from + // LiteIndexHeader. 
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); + + // Query the LiteIndex + std::vector<DocHitInfo> hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(1)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + // Check that hits are still unsorted after querying LiteIndex because the + // HitBuffer unsorted size is still below the sort threshold, and we've + // enabled sort_at_indexing. + // Persist the data and performing a pread on LiteIndexHeader. 
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); +} + +TEST_F( + LiteIndexTest, + LiteIndexFetchHits_sortAtQuerying_unsortedHitsExceedingSortAtIndexThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // However note that in these tests we're unable to sort hits after + // indexing, as sorting performed by the string-section-indexing-handler + // after indexing all hits in an entire document, rather than after each + // AddHits() operation. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total + // Doc 0 + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 1 + Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit 
doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 2 + Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 3 + Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + + // Create terms + // Foo + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + // Bar + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + // Baz + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t baz_tvi, + lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id, + term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE)); + // Qux + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t qux_tvi, + lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id, + 
term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE)); + + // Add 14 hits and make sure that termIds are added in unsorted order. + // Documents should be inserted in order as new incoming hits should have + // larger document ids. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1)); + // Verify that the HitBuffer has not been sorted. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + + // We now have the following in the hit buffer: + // <term>: {(docId, sectionId)...} + // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0)} + // bar: {(0, 0); (1, 0); (1, 2)} + // baz: {(0, 1); (2, 0); (3, 0)} + // quz: {(0, 2); (2, 1)} + + // Search over the HitBuffer. + std::vector<DocHitInfo> hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(4)); + // Check that hits are retrieved in descending order of docIds. 
+ EXPECT_THAT(hits1[0].document_id(), Eq(3)); + EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits1[1].document_id(), Eq(2)); + EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b101)); + EXPECT_THAT(hits1[2].document_id(), Eq(1)); + EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[3].document_id(), Eq(0)); + EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b1)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + std::vector<DocHitInfo> hits3; + lite_index->FetchHits( + bar_term_id, 0b1, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits3); + EXPECT_THAT(hits3, SizeIs(2)); + // Check fetching hits with SectionIdMask. + EXPECT_THAT(hits3[0].document_id(), Eq(1)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits3[1].document_id(), Eq(0)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + + // Check that the HitBuffer is sorted after the query call. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); +} + +TEST_F( + LiteIndexTest, + LiteIndexFetchHits_sortAtIndexing_unsortedHitsExceedingSortAtIndexThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. 
+ LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total + // Doc 0 + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 1 + Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 2 + Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + 
// Doc 3 + Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit2(/*section_id=*/1, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit3(/*section_id=*/2, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 4 + Hit doc4_hit0(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit1(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit2(/*section_id=*/1, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit3(/*section_id=*/2, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + + // Create terms + // Foo + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + // Bar + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + // Baz + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t baz_tvi, + lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id, + term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE)); + // Qux + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t qux_tvi, + lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id, + term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE)); + + // Add hits and make sure that termIds are added in unsorted order. 
+ // Documents should be inserted in order as new incoming hits should have + // larger document ids. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3)); + // Adding 8 hits exceeds the sort threshold. However when sort_at_indexing is + // enabled, sorting is done in the string-section-indexing-handler rather than + // AddHit() itself, we need to invoke SortHits() manually. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + lite_index->SortHits(); + // Check that the HitBuffer is sorted. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); + + // Add 12 more hits so that sort threshold is exceeded again. 
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc3_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc4_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc4_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc4_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc4_hit3)); + + // Adding these hits exceeds the sort threshold. However when sort_at_indexing + // is enabled, sorting is done in the string-section-indexing-handler rather + // than AddHit() itself. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + + // We now have the following in the hit buffer: + // <term>: {(docId, sectionId)...} + // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0); (3, 1); (4, 1)} + // bar: {(0, 0); (1, 0); (1, 2); (3, 2); (4, 2)} + // baz: {(0, 1); (2, 0); (3, 0); (4, 0)} + // quz: {(0, 2); (2, 1); (4, 0)} + + // Search over the HitBuffer. + std::vector<DocHitInfo> hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(5)); + // Check that hits are retrieved in descending order of docIds. 
+ EXPECT_THAT(hits1[0].document_id(), Eq(4)); + EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b10)); + EXPECT_THAT(hits1[1].document_id(), Eq(3)); + EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[2].document_id(), Eq(2)); + EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b101)); + EXPECT_THAT(hits1[3].document_id(), Eq(1)); + EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[4].document_id(), Eq(0)); + EXPECT_THAT(hits1[4].hit_section_ids_mask(), Eq(0b1)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, @@ -104,13 +575,119 @@ TEST_F(LiteIndexTest, LiteIndexAppendHits) { // Check that no hits are returned because they get skipped by the namespace // checker. EXPECT_THAT(hits2, IsEmpty()); + + std::vector<DocHitInfo> hits3; + lite_index->FetchHits( + bar_term_id, 0b1, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits3); + EXPECT_THAT(hits3, SizeIs(2)); + // Check fetching hits with SectionIdMask. + EXPECT_THAT(hits3[0].document_id(), Eq(1)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits3[1].document_id(), Eq(0)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + + // Check that the HitBuffer is sorted after the query call. FetchHits should + // sort before performing binary search if the HitBuffer unsorted size exceeds + // the sort threshold. Regardless of the sort_at_indexing config. 
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); } TEST_F(LiteIndexTest, LiteIndexIterator) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + const std::string term = "foo"; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, + /*is_in_prefix_section=*/false); + SectionIdMask doc0_section_id_mask = 0b11; + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); + + Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, 
/*term_frequency=*/11, + /*is_in_prefix_section=*/false); + SectionIdMask doc1_section_id_mask = 0b110; + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + + std::unique_ptr<DocHitInfoIteratorTermLiteExact> iter = + std::make_unique<DocHitInfoIteratorTermLiteExact>( + term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + /*need_hit_term_frequency=*/true); + + ASSERT_THAT(iter->Advance(), IsOk()); + EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(), + Eq(doc1_section_id_mask)); + + std::vector<TermMatchInfo> matched_terms_stats; + iter->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + term, expected_section_ids_tf_map1))); + + ASSERT_THAT(iter->Advance(), IsOk()); + EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(), + Eq(doc0_section_id_mask)); + matched_terms_stats.clear(); + iter->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + term, expected_section_ids_tf_map0))); +} + +TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. 
+ LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + const std::string term = "foo"; ICING_ASSERT_OK_AND_ASSIGN( uint32_t tvi, - lite_index_->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); + lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, @@ -120,8 +697,8 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { SectionIdMask doc0_section_id_mask = 0b11; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, /*is_in_prefix_section=*/false); @@ -130,12 +707,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { SectionIdMask doc1_section_id_mask = 0b110; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit1)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); std::unique_ptr<DocHitInfoIteratorTermLiteExact> iter = 
std::make_unique<DocHitInfoIteratorTermLiteExact>( - term_id_codec_.get(), lite_index_.get(), term, /*term_start_index=*/0, + term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, /*need_hit_term_frequency=*/true); diff --git a/icing/index/lite/lite-index_thread-safety_test.cc b/icing/index/lite/lite-index_thread-safety_test.cc index 7711f92..53aa6cd 100644 --- a/icing/index/lite/lite-index_thread-safety_test.cc +++ b/icing/index/lite/lite-index_thread-safety_test.cc @@ -19,12 +19,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include "icing/index/lite/lite-index.h" #include "icing/index/term-id-codec.h" #include "icing/schema/section.h" -#include "icing/store/suggestion-result-checker.h" -#include "icing/testing/always-false-suggestion-result-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -52,7 +49,9 @@ class LiteIndexThreadSafetyTest : public testing::Test { std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx-thread-safety.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h index 61ec502..82bd010 100644 --- a/icing/index/lite/term-id-hit-pair.h +++ b/icing/index/lite/term-id-hit-pair.h @@ -73,6 +73,8 @@ class TermIdHitPair { return value_ == rhs.value_; } + bool operator<(const TermIdHitPair& rhs) const { return value_ < rhs.value_; } + private: Value value_; }; diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc index 8f0d3f5..3e66858 
100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.cc +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -14,16 +14,20 @@ #include "icing/index/main/doc-hit-info-iterator-term-main.h" -#include <cstdint> #include <memory> +#include <optional> +#include <string> +#include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" -#include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/main/main-index.h" #include "icing/index/main/posting-list-hit-accessor.h" -#include "icing/legacy/core/icing-string-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/util/logging.h" @@ -44,6 +48,30 @@ std::string SectionIdMaskToString(SectionIdMask section_id_mask) { return mask; } +void MergeNewHitIntoCachedDocHitInfos( + const Hit& hit, bool need_hit_term_frequency, + std::vector<DocHitInfoIteratorTermMain::DocHitInfoAndTermFrequencyArray>& + cached_doc_hit_infos_out) { + if (cached_doc_hit_infos_out.empty() || + hit.document_id() != + cached_doc_hit_infos_out.back().doc_hit_info.document_id()) { + std::optional<Hit::TermFrequencyArray> tf_arr; + if (need_hit_term_frequency) { + tf_arr = std::make_optional<Hit::TermFrequencyArray>(); + } + + cached_doc_hit_infos_out.push_back( + DocHitInfoIteratorTermMain::DocHitInfoAndTermFrequencyArray( + DocHitInfo(hit.document_id()), std::move(tf_arr))); + } + + cached_doc_hit_infos_out.back().doc_hit_info.UpdateSection(hit.section_id()); + if (need_hit_term_frequency) { + (*cached_doc_hit_infos_out.back().term_frequency_array)[hit.section_id()] = + hit.term_frequency(); + } +} + } // namespace libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { @@ -72,12 +100,12 @@ libtextclassifier3::Status 
DocHitInfoIteratorTermMain::Advance() { // Nothing more for the iterator to return. Set these members to invalid // values. doc_hit_info_ = DocHitInfo(); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } - doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_); - hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask(); + ++num_advance_calls_; + doc_hit_info_ = + cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_).doc_hit_info; return libtextclassifier3::Status::OK; } @@ -90,16 +118,16 @@ DocHitInfoIteratorTermMain::TrimRightMostNode() && { } libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { - DocHitInfo last_doc_hit_info; + DocHitInfoAndTermFrequencyArray last_doc_hit_info; if (!cached_doc_hit_infos_.empty()) { - last_doc_hit_info = cached_doc_hit_infos_.back(); + last_doc_hit_info = std::move(cached_doc_hit_infos_.back()); } cached_doc_hit_infos_idx_ = 0; cached_doc_hit_infos_.clear(); - if (last_doc_hit_info.document_id() != kInvalidDocumentId) { + if (last_doc_hit_info.doc_hit_info.document_id() != kInvalidDocumentId) { // Carry over the last hit. It might need to be merged with the first hit of // of the next posting list in the chain. 
- cached_doc_hit_infos_.push_back(last_doc_hit_info); + cached_doc_hit_infos_.push_back(std::move(last_doc_hit_info)); } if (posting_list_accessor_ == nullptr) { ICING_ASSIGN_OR_RETURN(posting_list_accessor_, @@ -110,10 +138,11 @@ libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { posting_list_accessor_->GetNextHitsBatch()); if (hits.empty()) { all_pages_consumed_ = true; + return libtextclassifier3::Status::OK; } + ++num_blocks_inspected_; - cached_doc_hit_infos_.reserve(hits.size() + 1); - cached_hit_term_frequency_.reserve(hits.size() + 1); + cached_doc_hit_infos_.reserve(cached_doc_hit_infos_.size() + hits.size()); for (const Hit& hit : hits) { // Check sections. if (((UINT64_C(1) << hit.section_id()) & section_restrict_mask_) == 0) { @@ -123,13 +152,9 @@ libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { if (hit.is_prefix_hit()) { continue; } - if (cached_doc_hit_infos_.empty() || - hit.document_id() != cached_doc_hit_infos_.back().document_id()) { - cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id())); - cached_hit_term_frequency_.push_back(Hit::TermFrequencyArray()); - } - cached_doc_hit_infos_.back().UpdateSection(hit.section_id()); - cached_hit_term_frequency_.back()[hit.section_id()] = hit.term_frequency(); + + MergeNewHitIntoCachedDocHitInfos(hit, need_hit_term_frequency_, + cached_doc_hit_infos_); } return libtextclassifier3::Status::OK; } @@ -141,19 +166,18 @@ std::string DocHitInfoIteratorTermMainExact::ToString() const { libtextclassifier3::Status DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { - DocHitInfo last_doc_hit_info; + DocHitInfoAndTermFrequencyArray last_doc_hit_info; if (!cached_doc_hit_infos_.empty()) { - last_doc_hit_info = cached_doc_hit_infos_.back(); + last_doc_hit_info = std::move(cached_doc_hit_infos_.back()); } cached_doc_hit_infos_idx_ = 0; cached_doc_hit_infos_.clear(); - if (last_doc_hit_info.document_id() != kInvalidDocumentId) { + if 
(last_doc_hit_info.doc_hit_info.document_id() != kInvalidDocumentId) { // Carry over the last hit. It might need to be merged with the first hit of // of the next posting list in the chain. - cached_doc_hit_infos_.push_back(last_doc_hit_info); + cached_doc_hit_infos_.push_back(std::move(last_doc_hit_info)); } - ++num_blocks_inspected_; if (posting_list_accessor_ == nullptr) { ICING_ASSIGN_OR_RETURN(MainIndex::GetPrefixAccessorResult result, main_index_->GetAccessorForPrefixTerm(term_)); @@ -164,11 +188,11 @@ DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { posting_list_accessor_->GetNextHitsBatch()); if (hits.empty()) { all_pages_consumed_ = true; + return libtextclassifier3::Status::OK; } - cached_doc_hit_infos_.reserve(hits.size()); - if (need_hit_term_frequency_) { - cached_hit_term_frequency_.reserve(hits.size()); - } + + ++num_blocks_inspected_; + cached_doc_hit_infos_.reserve(cached_doc_hit_infos_.size() + hits.size()); for (const Hit& hit : hits) { // Check sections. if (((UINT64_C(1) << hit.section_id()) & section_restrict_mask_) == 0) { @@ -178,18 +202,9 @@ DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { if (!exact_ && !hit.is_in_prefix_section()) { continue; } - if (cached_doc_hit_infos_.empty() || - hit.document_id() != cached_doc_hit_infos_.back().document_id()) { - cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id())); - if (need_hit_term_frequency_) { - cached_hit_term_frequency_.push_back(Hit::TermFrequencyArray()); - } - } - cached_doc_hit_infos_.back().UpdateSection(hit.section_id()); - if (need_hit_term_frequency_) { - cached_hit_term_frequency_.back()[hit.section_id()] = - hit.term_frequency(); - } + + MergeNewHitIntoCachedDocHitInfos(hit, need_hit_term_frequency_, + cached_doc_hit_infos_); } return libtextclassifier3::Status::OK; } diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h index 08a385c..e32db2a 100644 --- 
a/icing/index/main/doc-hit-info-iterator-term-main.h +++ b/icing/index/main/doc-hit-info-iterator-term-main.h @@ -17,10 +17,14 @@ #include <cstdint> #include <memory> +#include <optional> +#include <string> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/main/main-index.h" #include "icing/index/main/posting-list-hit-accessor.h" @@ -29,8 +33,21 @@ namespace icing { namespace lib { -class DocHitInfoIteratorTermMain : public DocHitInfoIterator { +class DocHitInfoIteratorTermMain : public DocHitInfoLeafIterator { public: + struct DocHitInfoAndTermFrequencyArray { + DocHitInfo doc_hit_info; + std::optional<Hit::TermFrequencyArray> term_frequency_array; + + explicit DocHitInfoAndTermFrequencyArray() = default; + + explicit DocHitInfoAndTermFrequencyArray( + DocHitInfo doc_hit_info_in, + std::optional<Hit::TermFrequencyArray> term_frequency_array_in) + : doc_hit_info(std::move(doc_hit_info_in)), + term_frequency_array(std::move(term_frequency_array_in)) {} + }; + explicit DocHitInfoIteratorTermMain(MainIndex* main_index, const std::string& term, int term_start_index, @@ -53,10 +70,14 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; - int32_t GetNumBlocksInspected() const override { - return num_blocks_inspected_; + CallStats GetCallStats() const override { + return CallStats( + /*num_leaf_advance_calls_lite_index_in=*/0, + /*num_leaf_advance_calls_main_index_in=*/num_advance_calls_, + /*num_leaf_advance_calls_integer_index_in=*/0, + /*num_leaf_advance_calls_no_index_in=*/0, + /*num_blocks_inspected_in=*/num_blocks_inspected_); } - int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } void PopulateMatchedTermsStats( std::vector<TermMatchInfo>* 
matched_terms_stats, @@ -74,8 +95,9 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { while (section_mask_copy) { SectionId section_id = __builtin_ctzll(section_mask_copy); if (need_hit_term_frequency_) { - section_term_frequencies.at(section_id) = cached_hit_term_frequency_.at( - cached_doc_hit_infos_idx_)[section_id]; + section_term_frequencies.at(section_id) = + (*cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_) + .term_frequency_array)[section_id]; } section_mask_copy &= ~(UINT64_C(1) << section_id); } @@ -106,12 +128,13 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { std::unique_ptr<PostingListHitAccessor> posting_list_accessor_; MainIndex* main_index_; - // Stores hits retrieved from the index. This may only be a subset of the hits - // that are present in the index. Current value pointed to by the Iterator is - // tracked by cached_doc_hit_infos_idx_. - std::vector<DocHitInfo> cached_doc_hit_infos_; - std::vector<Hit::TermFrequencyArray> cached_hit_term_frequency_; + // Stores hits and optional term frequency arrays retrieved from the index. + // This may only be a subset of the hits that are present in the index. + // Current value pointed to by the Iterator is tracked by + // cached_doc_hit_infos_idx_. + std::vector<DocHitInfoAndTermFrequencyArray> cached_doc_hit_infos_; int cached_doc_hit_infos_idx_; + int num_advance_calls_; int num_blocks_inspected_; bool all_pages_consumed_; @@ -168,10 +191,6 @@ class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain { libtextclassifier3::Status RetrieveMoreHits() override; private: - // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and - // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be - // merged. - void SortAndDedupeDocumentIds(); // Whether or not posting_list_accessor_ holds a posting list chain for // 'term' or for a term for which 'term' is a prefix. 
This is necessary to // determine whether to return hits that are not from a prefix section (hits diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc index 8a2f691..37e14fc 100644 --- a/icing/index/main/main-index-merger_test.cc +++ b/icing/index/main/main-index-merger_test.cc @@ -45,7 +45,9 @@ class MainIndexMergerTest : public testing::Test { std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index d5e9d57..aae60c6 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -751,6 +751,13 @@ libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits( old_pl_accessor.GetNextHitsBatch()); while (!tmp.empty()) { for (const Hit& hit : tmp) { + // A safety check to add robustness to the codebase, so to make sure that + // we never access invalid memory, in case that hit from the posting list + // is corrupted. + if (hit.document_id() < 0 || + hit.document_id() >= document_id_old_to_new.size()) { + continue; + } DocumentId new_document_id = document_id_old_to_new[hit.document_id()]; // Transfer the document id of the hit, if the document is not deleted // or outdated. 
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index ac724b0..fa96e6c 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -38,6 +38,7 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::Eq; using ::testing::IsEmpty; using ::testing::NiceMock; using ::testing::Return; @@ -90,7 +91,9 @@ class MainIndexTest : public testing::Test { std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); @@ -361,7 +364,9 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) { // - Doc4 {"four", "foul" is_in_prefix_section=true} std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2"; LiteIndex::Options options(lite_index_file_name2, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -531,30 +536,35 @@ TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) { std::vector<SectionId>{doc1_hit.section_id()}))); } -TEST_F(MainIndexTest, SearchChainedPostingLists) { +TEST_F(MainIndexTest, + SearchChainedPostingListsShouldMergeSectionsAndTermFrequency) { // Index 2048 document with 3 hits in each document. When merged into the main // index, this will 1) lead to a chained posting list and 2) split at least // one document's hits across multiple posting lists. 
+ const std::string term = "foot"; + ICING_ASSERT_OK_AND_ASSIGN( uint32_t tvi, - lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); + lite_index_->InsertTerm(term, TermMatchType::EXACT_ONLY, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); for (DocumentId document_id = 0; document_id < 2048; ++document_id) { - Hit doc_hit0(/*section_id=*/0, /*document_id=*/document_id, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + Hit::TermFrequency term_frequency = static_cast<Hit::TermFrequency>( + document_id % Hit::kMaxTermFrequency + 1); + Hit doc_hit0( + /*section_id=*/0, /*document_id=*/document_id, term_frequency, + /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0)); - Hit doc_hit1(/*section_id=*/1, /*document_id=*/document_id, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + Hit doc_hit1( + /*section_id=*/1, /*document_id=*/document_id, term_frequency, + /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1)); - Hit doc_hit2(/*section_id=*/2, /*document_id=*/document_id, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + Hit doc_hit2( + /*section_id=*/2, /*document_id=*/document_id, term_frequency, + /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2)); } @@ -568,15 +578,35 @@ TEST_F(MainIndexTest, SearchChainedPostingLists) { // 3. Merge the lite index. ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get())); // Get hits for all documents containing "foot" - which should be all of them. 
- std::vector<DocHitInfo> hits = - GetExactHits(main_index.get(), /*term_start_index=*/0, - /*unnormalized_term_length=*/0, "foot"); - EXPECT_THAT(hits, SizeIs(2048)); - EXPECT_THAT(hits.front(), - EqualsDocHitInfo(2047, std::vector<SectionId>{0, 1, 2})); - EXPECT_THAT(hits.back(), - EqualsDocHitInfo(0, std::vector<SectionId>{0, 1, 2})); + auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>( + main_index.get(), term, /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + /*need_hit_term_frequency=*/true); + + DocumentId expected_document_id = 2047; + while (iterator->Advance().ok()) { + EXPECT_THAT(iterator->doc_hit_info(), + EqualsDocHitInfo(expected_document_id, + std::vector<SectionId>{0, 1, 2})); + + std::vector<TermMatchInfo> matched_terms_stats; + iterator->PopulateMatchedTermsStats(&matched_terms_stats); + + Hit::TermFrequency expected_term_frequency = + static_cast<Hit::TermFrequency>( + expected_document_id % Hit::kMaxTermFrequency + 1); + ASSERT_THAT(matched_terms_stats, SizeIs(1)); + EXPECT_THAT(matched_terms_stats[0].term, Eq(term)); + EXPECT_THAT(matched_terms_stats[0].term_frequencies[0], + Eq(expected_term_frequency)); + EXPECT_THAT(matched_terms_stats[0].term_frequencies[1], + Eq(expected_term_frequency)); + EXPECT_THAT(matched_terms_stats[0].term_frequencies[2], + Eq(expected_term_frequency)); + --expected_document_id; + } + EXPECT_THAT(expected_document_id, Eq(-1)); } TEST_F(MainIndexTest, MergeIndexBackfilling) { @@ -606,7 +636,9 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) { // - Doc1 {"foot" is_in_prefix_section=false} std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2"; LiteIndex::Options options(lite_index_file_name2, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, 
&icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/index/main/posting-list-hit-serializer.cc b/icing/index/main/posting-list-hit-serializer.cc index 00c70e9..e14a0c0 100644 --- a/icing/index/main/posting-list-hit-serializer.cc +++ b/icing/index/main/posting-list-hit-serializer.cc @@ -300,7 +300,8 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( // Therefore, offset must be less than kSpecialHitSize + 5. Since posting // list size must be divisible by sizeof(Hit) (5), it is guaranteed that // offset < size_in_bytes, so it is safe to ignore the return value here. - ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset); + ICING_RETURN_IF_ERROR( + ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset)); // Safe to ignore the return value of PadToEnd because offset must be less // than posting_list_used->size_in_bytes(). Otherwise, this function // already would have returned FAILED_PRECONDITION. @@ -419,7 +420,7 @@ libtextclassifier3::Status PostingListHitSerializer::PopFrontHits( // previous hits in the posting list and because there's no way that the // posting list could run out of room because it previously stored this hit // AND another hit. 
- PrependHit(posting_list_used, out[1]); + ICING_RETURN_IF_ERROR(PrependHit(posting_list_used, out[1])); } else if (num_hits > 0) { return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true, nullptr); diff --git a/icing/index/main/posting-list-hit-serializer.h b/icing/index/main/posting-list-hit-serializer.h index 975b05a..2986d9c 100644 --- a/icing/index/main/posting-list-hit-serializer.h +++ b/icing/index/main/posting-list-hit-serializer.h @@ -23,6 +23,7 @@ #include "icing/file/posting_list/posting-list-common.h" #include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { @@ -72,8 +73,9 @@ class PostingListHitSerializer : public PostingListSerializer { // keep_prepended is true, whatever could be prepended is kept, otherwise the // posting list is left in its original state. template <class T, Hit (*GetHit)(const T&)> - uint32_t PrependHitArray(PostingListUsed* posting_list_used, const T* array, - uint32_t num_hits, bool keep_prepended) const; + libtextclassifier3::StatusOr<uint32_t> PrependHitArray( + PostingListUsed* posting_list_used, const T* array, uint32_t num_hits, + bool keep_prepended) const; // Retrieves the hits stored in the posting list. // @@ -312,9 +314,10 @@ class PostingListHitSerializer : public PostingListSerializer { // Inlined functions. Implementation details below. Avert eyes! template <class T, Hit (*GetHit)(const T&)> -uint32_t PostingListHitSerializer::PrependHitArray( - PostingListUsed* posting_list_used, const T* array, uint32_t num_hits, - bool keep_prepended) const { +libtextclassifier3::StatusOr<uint32_t> +PostingListHitSerializer::PrependHitArray(PostingListUsed* posting_list_used, + const T* array, uint32_t num_hits, + bool keep_prepended) const { if (!IsPostingListValid(posting_list_used)) { return 0; } @@ -331,7 +334,7 @@ uint32_t PostingListHitSerializer::PrependHitArray( // before. 
PopFrontHits guarantees that it will remove all 'i' hits so long // as there are at least 'i' hits in the posting list, which we know there // are. - PopFrontHits(posting_list_used, /*num_hits=*/i); + ICING_RETURN_IF_ERROR(PopFrontHits(posting_list_used, /*num_hits=*/i)); } return i; } diff --git a/icing/index/main/posting-list-hit-serializer_test.cc b/icing/index/main/posting-list-hit-serializer_test.cc index ffd8166..7f0b945 100644 --- a/icing/index/main/posting-list-hit-serializer_test.cc +++ b/icing/index/main/posting-list-hit-serializer_test.cc @@ -59,14 +59,14 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitNotFull) { // Make used. Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/56); - serializer.PrependHit(&pl_used, hit0); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0)); // Size = sizeof(uncompressed hit0) int expected_size = sizeof(Hit); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0))); Hit hit1(/*section_id=*/0, 1, Hit::kDefaultTermFrequency); - serializer.PrependHit(&pl_used, hit1); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1)); // Size = sizeof(uncompressed hit1) // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) expected_size += 2 + sizeof(Hit::TermFrequency); @@ -75,7 +75,7 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitNotFull) { IsOkAndHolds(ElementsAre(hit1, hit0))); Hit hit2(/*section_id=*/0, 2, /*term_frequency=*/56); - serializer.PrependHit(&pl_used, hit2); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2)); // Size = sizeof(uncompressed hit2) // + sizeof(hit1-hit2) // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) @@ -85,7 +85,7 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitNotFull) { IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); Hit hit3(/*section_id=*/0, 3, Hit::kDefaultTermFrequency); - serializer.PrependHit(&pl_used, hit3); + 
ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3)); // Size = sizeof(uncompressed hit3) // + sizeof(hit2-hit3) + sizeof(hit2::term_frequency) // + sizeof(hit1-hit2) @@ -232,17 +232,19 @@ TEST(PostingListHitSerializerTest, // Add five hits. The PL is in the empty state and an empty min size PL can // only fit two hits. So PrependHitArray should fail. - uint32_t num_can_prepend = - serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t num_can_prepend, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false))); EXPECT_THAT(num_can_prepend, Eq(2)); int can_fit_hits = num_can_prepend; // The PL has room for 2 hits. We should be able to add them without any // problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2); - num_can_prepend = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, hits_in_ptr, can_fit_hits, false); + ICING_ASSERT_OK_AND_ASSIGN( + num_can_prepend, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, hits_in_ptr, can_fit_hits, false))); EXPECT_THAT(num_can_prepend, Eq(can_fit_hits)); EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used))); std::deque<Hit> hits_pushed; @@ -289,8 +291,10 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { // Add five hits. The PL is in the empty state and should be able to fit all // five hits without issue, transitioning the PL from EMPTY -> NOT_FULL. 
- uint32_t num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t num_could_fit, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); std::deque<Hit> hits_pushed; @@ -334,8 +338,9 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { // Add these 6 hits. The PL is currently in the NOT_FULL state and should // remain in the NOT_FULL state. - num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. @@ -368,8 +373,9 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { // Add this 1 hit. The PL is currently in the NOT_FULL state and should // transition to the ALMOST_FULL state - even though there is still some // unused space. - num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. @@ -408,8 +414,9 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { // second hit should tranisition to the FULL state because the delta between // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area // (1 byte). 
- num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. @@ -442,8 +449,11 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) { // PrependHitArray should fail because hit_elts_in_too_many is far too large // for the minimum size pl. - uint32_t num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t num_could_fit, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), + false))); ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size())); ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); @@ -453,8 +463,10 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) { PostingListUsed::CreateFromUnitializedRegion(&serializer, kHitsSize)); // PrependHitArray should fail because hit_elts_in_too_many is one hit too // large for this pl. 
- num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false); + ICING_ASSERT_OK_AND_ASSIGN( + num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hit_elts_in_too_many[0], + hit_elts_in_too_many.size(), false))); ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size())); ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); @@ -476,7 +488,7 @@ TEST(PostingListHitSerializerTest, ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue >> 2, 0))); // Status should jump to full directly. ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(pl_size)); - serializer.PopFrontHits(&pl, 1); + ICING_ASSERT_OK(serializer.PopFrontHits(&pl, 1)); // Status should return to not full as before. ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(bytes_used)); } diff --git a/icing/index/numeric/doc-hit-info-iterator-numeric.h b/icing/index/numeric/doc-hit-info-iterator-numeric.h index fc66a1d..7cdb230 100644 --- a/icing/index/numeric/doc-hit-info-iterator-numeric.h +++ b/icing/index/numeric/doc-hit-info-iterator-numeric.h @@ -29,7 +29,7 @@ namespace icing { namespace lib { template <typename T> -class DocHitInfoIteratorNumeric : public DocHitInfoIterator { +class DocHitInfoIteratorNumeric : public DocHitInfoLeafIterator { public: explicit DocHitInfoIteratorNumeric( std::unique_ptr<typename NumericIndex<T>::Iterator> numeric_index_iter) @@ -53,9 +53,19 @@ class DocHitInfoIteratorNumeric : public DocHitInfoIterator { "Cannot generate suggestion if the last term is numeric operator."); } - int32_t GetNumBlocksInspected() const override { return 0; } + CallStats GetCallStats() const override { + if (numeric_index_iter_ == nullptr) { + return CallStats(); + } - int32_t GetNumLeafAdvanceCalls() const override { return 0; } + return CallStats(/*num_leaf_advance_calls_lite_index_in=*/0, + 
/*num_leaf_advance_calls_main_index_in=*/0, + /*num_leaf_advance_calls_integer_index_in=*/ + numeric_index_iter_->GetNumAdvanceCalls(), + /*num_leaf_advance_calls_no_index_in=*/0, + /*num_blocks_inspected_in=*/ + numeric_index_iter_->GetNumBlocksInspected()); + } std::string ToString() const override { return "test"; } diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h index 2c077a2..d18f2aa 100644 --- a/icing/index/numeric/dummy-numeric-index.h +++ b/icing/index/numeric/dummy-numeric-index.h @@ -15,6 +15,7 @@ #ifndef ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_ #define ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_ +#include <cstdint> #include <functional> #include <map> #include <memory> @@ -166,7 +167,8 @@ class DummyNumericIndex : public NumericIndex<T> { explicit Iterator(T key_lower, T key_upper, std::vector<BucketInfo>&& bucket_info_vec) : NumericIndex<T>::Iterator(key_lower, key_upper), - pq_(std::less<BucketInfo>(), std::move(bucket_info_vec)) {} + pq_(std::less<BucketInfo>(), std::move(bucket_info_vec)), + num_advance_calls_(0) {} ~Iterator() override = default; @@ -174,38 +176,55 @@ class DummyNumericIndex : public NumericIndex<T> { DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; } + int32_t GetNumAdvanceCalls() const override { return num_advance_calls_; } + + int32_t GetNumBlocksInspected() const override { return 0; } + private: std::priority_queue<BucketInfo> pq_; DocHitInfo doc_hit_info_; + + int32_t num_advance_calls_; }; explicit DummyNumericIndex(const Filesystem& filesystem, std::string&& working_path) : NumericIndex<T>(filesystem, std::move(working_path), PersistentStorage::WorkingPathType::kDummy), - last_added_document_id_(kInvalidDocumentId) {} + dummy_crcs_buffer_( + std::make_unique<uint8_t[]>(sizeof(PersistentStorage::Crcs))), + last_added_document_id_(kInvalidDocumentId) { + memset(dummy_crcs_buffer_.get(), 0, sizeof(PersistentStorage::Crcs)); + } - libtextclassifier3::Status 
PersistStoragesToDisk() override { + libtextclassifier3::Status PersistStoragesToDisk(bool force) override { return libtextclassifier3::Status::OK; } - libtextclassifier3::Status PersistMetadataToDisk() override { + libtextclassifier3::Status PersistMetadataToDisk(bool force) override { return libtextclassifier3::Status::OK; } - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override { + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override { return Crc32(0); } - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override { + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override { return Crc32(0); } - PersistentStorage::Crcs& crcs() override { return dummy_crcs_; } - const PersistentStorage::Crcs& crcs() const override { return dummy_crcs_; } + PersistentStorage::Crcs& crcs() override { + return *reinterpret_cast<PersistentStorage::Crcs*>( + dummy_crcs_buffer_.get()); + } + const PersistentStorage::Crcs& crcs() const override { + return *reinterpret_cast<const PersistentStorage::Crcs*>( + dummy_crcs_buffer_.get()); + } std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>> storage_; - PersistentStorage::Crcs dummy_crcs_; + std::unique_ptr<uint8_t[]> dummy_crcs_buffer_; DocumentId last_added_document_id_; }; @@ -251,6 +270,7 @@ libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() { // Merge sections with same document_id into a single DocHitInfo while (!pq_.empty() && pq_.top().GetCurrentBasicHit().document_id() == document_id) { + ++num_advance_calls_; doc_hit_info_.UpdateSection(pq_.top().GetCurrentBasicHit().section_id()); BucketInfo info = pq_.top(); diff --git a/icing/index/numeric/integer-index-bucket-util.h b/icing/index/numeric/integer-index-bucket-util.h index 863bd01..d6fc245 100644 --- a/icing/index/numeric/integer-index-bucket-util.h +++ b/icing/index/numeric/integer-index-bucket-util.h @@ -61,7 +61,7 @@ struct DataRangeAndBucketInfo { // - Data slice 
(i.e. [start, end)) can be empty. // // REQUIRES: -// - original_key_lower <= original_key_upper +// - original_key_lower < original_key_upper // - num_data_threshold > 0 // - Keys of all data are in range [original_key_lower, original_key_upper] // diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc index fa62b19..72e0266 100644 --- a/icing/index/numeric/integer-index-storage.cc +++ b/icing/index/numeric/integer-index-storage.cc @@ -45,6 +45,7 @@ #include "icing/index/numeric/posting-list-integer-index-serializer.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/util/crc32.h" #include "icing/util/status-macros.h" namespace icing { @@ -151,18 +152,25 @@ class BucketPostingListIterator { : pl_accessor_(std::move(pl_accessor)), should_retrieve_next_batch_(true) {} + struct AdvanceAndFilterResult { + libtextclassifier3::Status status = libtextclassifier3::Status::OK; + int32_t num_advance_calls = 0; + int32_t num_blocks_inspected = 0; + }; // Advances to the next relevant data. The posting list of a bucket contains // keys within range [bucket.key_lower, bucket.key_upper], but some of them // may be out of [query_key_lower, query_key_upper], so when advancing we have // to filter out those non-relevant keys. // // Returns: + // AdvanceAndFilterResult. status will be: // - OK on success // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant // data) // - Any other PostingListIntegerIndexAccessor errors - libtextclassifier3::Status AdvanceAndFilter(int64_t query_key_lower, - int64_t query_key_upper) { + AdvanceAndFilterResult AdvanceAndFilter(int64_t query_key_lower, + int64_t query_key_upper) { + AdvanceAndFilterResult result; // Move curr_ until reaching a relevant data (i.e. 
key in range // [query_key_lower, query_key_upper]) do { @@ -172,12 +180,18 @@ class BucketPostingListIterator { curr_ >= cached_batch_integer_index_data_.cend(); } if (should_retrieve_next_batch_) { - ICING_RETURN_IF_ERROR(GetNextDataBatch()); + auto status = GetNextDataBatch(); + if (!status.ok()) { + result.status = std::move(status); + return result; + } + ++result.num_blocks_inspected; should_retrieve_next_batch_ = false; } + ++result.num_advance_calls; } while (curr_->key() < query_key_lower || curr_->key() > query_key_upper); - return libtextclassifier3::Status::OK; + return result; } const BasicHit& GetCurrentBasicHit() const { return curr_->basic_hit(); } @@ -222,7 +236,9 @@ class IntegerIndexStorageIterator : public NumericIndex<int64_t>::Iterator { explicit IntegerIndexStorageIterator( int64_t query_key_lower, int64_t query_key_upper, std::vector<std::unique_ptr<BucketPostingListIterator>>&& bucket_pl_iters) - : NumericIndex<int64_t>::Iterator(query_key_lower, query_key_upper) { + : NumericIndex<int64_t>::Iterator(query_key_lower, query_key_upper), + num_advance_calls_(0), + num_blocks_inspected_(0) { std::vector<BucketPostingListIterator*> bucket_pl_iters_raw_ptrs; for (std::unique_ptr<BucketPostingListIterator>& bucket_pl_itr : bucket_pl_iters) { @@ -232,11 +248,15 @@ class IntegerIndexStorageIterator : public NumericIndex<int64_t>::Iterator { // Note: it is possible that the bucket iterator fails to advance for the // first round, because data could be filtered out by [query_key_lower, // query_key_upper]. In this case, just discard the iterator. 
- if (bucket_pl_itr->AdvanceAndFilter(query_key_lower, query_key_upper) - .ok()) { + BucketPostingListIterator::AdvanceAndFilterResult + advance_and_filter_result = + bucket_pl_itr->AdvanceAndFilter(query_key_lower, query_key_upper); + if (advance_and_filter_result.status.ok()) { bucket_pl_iters_raw_ptrs.push_back(bucket_pl_itr.get()); bucket_pl_iters_.push_back(std::move(bucket_pl_itr)); } + num_advance_calls_ += advance_and_filter_result.num_advance_calls; + num_blocks_inspected_ += advance_and_filter_result.num_blocks_inspected; } pq_ = std::priority_queue<BucketPostingListIterator*, @@ -259,6 +279,12 @@ class IntegerIndexStorageIterator : public NumericIndex<int64_t>::Iterator { DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; } + int32_t GetNumAdvanceCalls() const override { return num_advance_calls_; } + + int32_t GetNumBlocksInspected() const override { + return num_blocks_inspected_; + } + private: BucketPostingListIterator::Comparator comparator_; @@ -280,6 +306,9 @@ class IntegerIndexStorageIterator : public NumericIndex<int64_t>::Iterator { pq_; DocHitInfo doc_hit_info_; + + int32_t num_advance_calls_; + int32_t num_blocks_inspected_; }; libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { @@ -299,7 +328,12 @@ libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { do { doc_hit_info_.UpdateSection( bucket_itr->GetCurrentBasicHit().section_id()); - advance_status = bucket_itr->AdvanceAndFilter(key_lower_, key_upper_); + BucketPostingListIterator::AdvanceAndFilterResult + advance_and_filter_result = + bucket_itr->AdvanceAndFilter(key_lower_, key_upper_); + advance_status = std::move(advance_and_filter_result.status); + num_advance_calls_ += advance_and_filter_result.num_advance_calls; + num_blocks_inspected_ += advance_and_filter_result.num_blocks_inspected; } while (advance_status.ok() && bucket_itr->GetCurrentBasicHit().document_id() == document_id); if (advance_status.ok()) { @@ -311,6 +345,11 @@ 
libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { } bool IntegerIndexStorage::Options::IsValid() const { + if (num_data_threshold_for_bucket_split <= + kMinNumDataThresholdForBucketSplit) { + return false; + } + if (!HasCustomInitBuckets()) { return true; } @@ -403,12 +442,20 @@ libtextclassifier3::Status IntegerIndexStorage::AddKeys( return libtextclassifier3::Status::OK; } + SetDirty(); + std::sort(new_keys.begin(), new_keys.end()); // Dedupe auto last = std::unique(new_keys.begin(), new_keys.end()); new_keys.erase(last, new_keys.end()); + if (static_cast<int32_t>(new_keys.size()) > + std::numeric_limits<int32_t>::max() - info().num_data) { + return absl_ports::ResourceExhaustedError( + "# of keys in this integer index storage exceed the limit"); + } + // When adding keys into a bucket, we potentially split it into 2 new buckets // and one of them will be added into the unsorted bucket array. // When handling keys belonging to buckets in the unsorted bucket array, we @@ -649,6 +696,9 @@ libtextclassifier3::Status IntegerIndexStorage::TransferIndex( return lhs.get() < rhs.get(); }); + const int32_t num_data_threshold_for_bucket_merge = + kNumDataThresholdRatioForBucketMerge * + new_storage->options_.num_data_threshold_for_bucket_split; int64_t curr_key_lower = std::numeric_limits<int64_t>::min(); int64_t curr_key_upper = std::numeric_limits<int64_t>::min(); std::vector<IntegerIndexData> accumulated_data; @@ -687,7 +737,7 @@ libtextclassifier3::Status IntegerIndexStorage::TransferIndex( // - Flush accumulated_data and create a new bucket for them. // - OR merge new_data into accumulated_data and go to the next round. if (!accumulated_data.empty() && accumulated_data.size() + new_data.size() > - kNumDataThresholdForBucketMerge) { + num_data_threshold_for_bucket_merge) { // TODO(b/259743562): [Optimization 3] adjust upper bound to fit more data // from new_data to accumulated_data. 
ICING_RETURN_IF_ERROR(FlushDataIntoNewSortedBucket( @@ -879,9 +929,11 @@ IntegerIndexStorage::InitializeExistingFiles( IntegerIndexStorage::FlushDataIntoNewSortedBucket( int64_t key_lower, int64_t key_upper, std::vector<IntegerIndexData>&& data, IntegerIndexStorage* storage) { + storage->SetDirty(); + if (data.empty()) { - return storage->sorted_buckets_->Append( - Bucket(key_lower, key_upper, PostingListIdentifier::kInvalid)); + return storage->sorted_buckets_->Append(Bucket( + key_lower, key_upper, PostingListIdentifier::kInvalid, /*num_data=*/0)); } ICING_ASSIGN_OR_RETURN( @@ -891,10 +943,16 @@ IntegerIndexStorage::FlushDataIntoNewSortedBucket( data.end())); storage->info().num_data += data.size(); - return storage->sorted_buckets_->Append(Bucket(key_lower, key_upper, pl_id)); + return storage->sorted_buckets_->Append( + Bucket(key_lower, key_upper, pl_id, data.size())); } -libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() { +libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + ICING_RETURN_IF_ERROR(sorted_buckets_->PersistToDisk()); ICING_RETURN_IF_ERROR(unsorted_buckets_->PersistToDisk()); if (!flash_index_storage_->PersistToDisk()) { @@ -904,19 +962,35 @@ libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IntegerIndexStorage::PersistMetadataToDisk() { +libtextclassifier3::Status IntegerIndexStorage::PersistMetadataToDisk( + bool force) { + // We can skip persisting metadata to disk only if both info and storage are + // clean. + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + // Changes should have been applied to the underlying file when using // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an // extra safety step to ensure they are written out. 
return metadata_mmapped_file_->PersistToDisk(); } -libtextclassifier3::StatusOr<Crc32> IntegerIndexStorage::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> IntegerIndexStorage::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } libtextclassifier3::StatusOr<Crc32> -IntegerIndexStorage::ComputeStoragesChecksum() { +IntegerIndexStorage::ComputeStoragesChecksum(bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + // Compute crcs ICING_ASSIGN_OR_RETURN(Crc32 sorted_buckets_crc, sorted_buckets_->ComputeChecksum()); @@ -933,6 +1007,89 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( const std::vector<int64_t>::const_iterator& it_start, const std::vector<int64_t>::const_iterator& it_end, FileBackedVector<Bucket>::MutableView& mutable_bucket) { + int32_t num_data_in_bucket = mutable_bucket.Get().num_data(); + int32_t num_new_data = std::distance(it_start, it_end); + if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() && + num_new_data + num_data_in_bucket > + options_.num_data_threshold_for_bucket_split) { + // Split bucket. + + // 1. Read all data and free all posting lists. + std::vector<IntegerIndexData> all_data; + if (mutable_bucket.Get().posting_list_identifier().is_valid()) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, + mutable_bucket.Get().posting_list_identifier())); + ICING_ASSIGN_OR_RETURN(all_data, pl_accessor->GetAllDataAndFree()); + } + + // 2. Append all new data. + all_data.reserve(all_data.size() + num_new_data); + for (auto it = it_start; it != it_end; ++it) { + all_data.push_back(IntegerIndexData(section_id, document_id, *it)); + } + + // 3. 
Run bucket splitting algorithm to decide new buckets and dispatch + // data. + // - # of data in a full bucket = + // options_.num_data_threshold_for_bucket_split. + // - Bucket splitting logic will be invoked if adding new data + // (num_new_data >= 1) into a full bucket. + // - In order to achieve good (amortized) time complexity, we want # of + // data in new buckets to be around half_of_threshold (i.e. + // options_.num_data_threshold_for_bucket_split / 2). + // - Using half_of_threshold as the cutoff threshold will cause splitting + // buckets with [half_of_threshold, half_of_threshold, num_new_data] + // data, which is not ideal because num_new_data is usually small. + // - Thus, we pick (half_of_threshold + kNumDataAfterSplitAdjustment) as + // the cutoff threshold to avoid over-splitting. It can tolerate + // num_new_data up to (2 * kNumDataAfterSplitAdjustment) and + // split only 2 buckets (instead of 3) with + // [half_of_threshold + kNumDataAfterSplitAdjustment, + // half_of_threshold + (kNumDataAfterSplitAdjustment - num_new_data)]. + int32_t cutoff_threshold = + options_.num_data_threshold_for_bucket_split / 2 + + kNumDataAfterSplitAdjustment; + std::vector<integer_index_bucket_util::DataRangeAndBucketInfo> + new_bucket_infos = integer_index_bucket_util::Split( + all_data, mutable_bucket.Get().key_lower(), + mutable_bucket.Get().key_upper(), cutoff_threshold); + if (new_bucket_infos.empty()) { + ICING_LOG(WARNING) + << "No buckets after splitting. This should not happen."; + return absl_ports::InternalError("Split error"); + } + + // 4. Flush data and create new buckets. 
+ std::vector<Bucket> new_buckets; + for (int i = 0; i < new_bucket_infos.size(); ++i) { + int32_t num_data_in_new_bucket = + std::distance(new_bucket_infos[i].start, new_bucket_infos[i].end); + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier pl_id, + FlushDataIntoPostingLists( + flash_index_storage_.get(), posting_list_serializer_, + new_bucket_infos[i].start, new_bucket_infos[i].end)); + if (i == 0) { + // Reuse mutable_bucket + mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower); + mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper); + mutable_bucket.Get().set_posting_list_identifier(pl_id); + mutable_bucket.Get().set_num_data(num_data_in_new_bucket); + } else { + new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower, + new_bucket_infos[i].key_upper, pl_id, + num_data_in_new_bucket)); + } + } + + return new_buckets; + } + + // Otherwise, we don't need to split bucket. Just simply add all new data into + // the bucket. std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor; if (mutable_bucket.Get().posting_list_identifier().is_valid()) { ICING_ASSIGN_OR_RETURN( @@ -946,68 +1103,6 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( } for (auto it = it_start; it != it_end; ++it) { - if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() && - pl_accessor->WantsSplit()) { - // If the bucket needs split (max size and full) and is splittable, then - // we perform bucket splitting. - - // 1. Finalize the current posting list accessor. - PostingListAccessor::FinalizeResult result = - std::move(*pl_accessor).Finalize(); - if (!result.status.ok()) { - return result.status; - } - - // 2. Create another posting list accessor instance. Read all data and - // free all posting lists. 
- ICING_ASSIGN_OR_RETURN( - pl_accessor, - PostingListIntegerIndexAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_serializer_, result.id)); - ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> all_data, - pl_accessor->GetAllDataAndFree()); - - // 3. Append all remaining new data. - all_data.reserve(all_data.size() + std::distance(it, it_end)); - for (; it != it_end; ++it) { - all_data.push_back(IntegerIndexData(section_id, document_id, *it)); - } - - // 4. Run bucket splitting algorithm to decide new buckets and dispatch - // data. - std::vector<integer_index_bucket_util::DataRangeAndBucketInfo> - new_bucket_infos = integer_index_bucket_util::Split( - all_data, mutable_bucket.Get().key_lower(), - mutable_bucket.Get().key_upper(), - kNumDataThresholdForBucketSplit); - if (new_bucket_infos.empty()) { - ICING_LOG(WARNING) - << "No buckets after splitting. This should not happen."; - return absl_ports::InternalError("Split error"); - } - - // 5. Flush data. - std::vector<Bucket> new_buckets; - for (int i = 0; i < new_bucket_infos.size(); ++i) { - ICING_ASSIGN_OR_RETURN( - PostingListIdentifier pl_id, - FlushDataIntoPostingLists( - flash_index_storage_.get(), posting_list_serializer_, - new_bucket_infos[i].start, new_bucket_infos[i].end)); - if (i == 0) { - // Reuse mutable_bucket - mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower); - mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper); - mutable_bucket.Get().set_posting_list_identifier(pl_id); - } else { - new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower, - new_bucket_infos[i].key_upper, pl_id)); - } - } - - return new_buckets; - } - ICING_RETURN_IF_ERROR(pl_accessor->PrependData( IntegerIndexData(section_id, document_id, *it))); } @@ -1022,6 +1117,9 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( } mutable_bucket.Get().set_posting_list_identifier(result.id); + // We've already verified num_new_data won't exceed the limit of the entire + 
// storage, so it is safe to add to the counter of the bucket. + mutable_bucket.Get().set_num_data(num_data_in_bucket + num_new_data); return std::vector<Bucket>(); } diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h index 9f2e58c..0c1afbb 100644 --- a/icing/index/numeric/integer-index-storage.h +++ b/icing/index/numeric/integer-index-storage.h @@ -75,7 +75,7 @@ namespace lib { class IntegerIndexStorage : public PersistentStorage { public: struct Info { - static constexpr int32_t kMagic = 0xc4bf0ccc; + static constexpr int32_t kMagic = 0x6470e547; int32_t magic; int32_t num_data; @@ -99,10 +99,12 @@ class IntegerIndexStorage : public PersistentStorage { explicit Bucket(int64_t key_lower, int64_t key_upper, PostingListIdentifier posting_list_identifier = - PostingListIdentifier::kInvalid) + PostingListIdentifier::kInvalid, + int32_t num_data = 0) : key_lower_(key_lower), key_upper_(key_upper), - posting_list_identifier_(posting_list_identifier) {} + posting_list_identifier_(posting_list_identifier), + num_data_(num_data) {} bool operator<(const Bucket& other) const { return key_lower_ < other.key_lower_; @@ -130,12 +132,16 @@ class IntegerIndexStorage : public PersistentStorage { posting_list_identifier_ = posting_list_identifier; } + int32_t num_data() const { return num_data_; } + void set_num_data(int32_t num_data) { num_data_ = num_data; } + private: int64_t key_lower_; int64_t key_upper_; PostingListIdentifier posting_list_identifier_; + int32_t num_data_; } __attribute__((packed)); - static_assert(sizeof(Bucket) == 20, ""); + static_assert(sizeof(Bucket) == 24, ""); static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize, "Bucket type size is inconsistent with FileBackedVector " "element type size"); @@ -146,15 +152,31 @@ class IntegerIndexStorage : public PersistentStorage { "Max # of buckets cannot fit into FileBackedVector"); struct Options { - explicit Options(bool pre_mapping_fbv_in) 
- : pre_mapping_fbv(pre_mapping_fbv_in) {} + // - According to the benchmark result, the more # of buckets, the higher + // latency for range query. Therefore, this number cannot be too small to + // avoid splitting bucket too aggressively. + // - We use `num_data_threshold_for_bucket_split / 2 + 5` as the cutoff + // threshold after splitting. This number cannot be too small (e.g. 10) + // because in this case we will have similar # of data in a single bucket + // before and after splitting, which contradicts the purpose of splitting. + // - For convenience, let's set 64 as the minimum value. + static constexpr int32_t kMinNumDataThresholdForBucketSplit = 64; + + explicit Options(int32_t num_data_threshold_for_bucket_split_in, + bool pre_mapping_fbv_in) + : num_data_threshold_for_bucket_split( + num_data_threshold_for_bucket_split_in), + pre_mapping_fbv(pre_mapping_fbv_in) {} explicit Options(std::vector<Bucket> custom_init_sorted_buckets_in, std::vector<Bucket> custom_init_unsorted_buckets_in, + int32_t num_data_threshold_for_bucket_split_in, bool pre_mapping_fbv_in) : custom_init_sorted_buckets(std::move(custom_init_sorted_buckets_in)), custom_init_unsorted_buckets( std::move(custom_init_unsorted_buckets_in)), + num_data_threshold_for_bucket_split( + num_data_threshold_for_bucket_split_in), pre_mapping_fbv(pre_mapping_fbv_in) {} bool IsValid() const; @@ -172,6 +194,14 @@ class IntegerIndexStorage : public PersistentStorage { std::vector<Bucket> custom_init_sorted_buckets; std::vector<Bucket> custom_init_unsorted_buckets; + // Threshold for invoking bucket splitting. If # of data in a bucket exceeds + // this number after adding new data, then it will invoke bucket splitting + // logic. + // + // Note: num_data_threshold_for_bucket_split should be >= + // kMinNumDataThresholdForBucketSplit. 
+ int32_t num_data_threshold_for_bucket_split; + // Flag indicating whether memory map max possible file size for underlying // FileBackedVector before growing the actual file size. bool pre_mapping_fbv; @@ -188,28 +218,25 @@ class IntegerIndexStorage : public PersistentStorage { WorkingPathType::kDirectory; static constexpr std::string_view kFilePrefix = "integer_index_storage"; - // # of data threshold for bucket merging during optimization (TransferIndex). - // If total # data of adjacent buckets exceed this value, then flush the - // accumulated data. Otherwise merge buckets and their data. - // - // Calculated by: 0.7 * (kMaxPostingListSize / sizeof(IntegerIndexData)), - // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). - static constexpr int32_t kNumDataThresholdForBucketMerge = 240; - - // # of data threshold for bucket splitting during indexing (AddKeys). - // When the posting list of a bucket is full, we will try to split data into - // multiple buckets according to their keys. In order to achieve good - // (amortized) time complexity, we want # of data in new buckets to be at most - // half # of elements in a full posting list. + // Default # of data threshold for bucket splitting during indexing (AddKeys). + // When # of data in a bucket reaches this number, we will try to split data + // into multiple buckets according to their keys. + static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = 65536; + + // # of data threshold for bucket merging during optimization (TransferIndex) + // = kNumDataThresholdRatioForBucketMerge * + // options.num_data_threshold_for_bucket_split // - // Calculated by: 0.5 * (kMaxPostingListSize / sizeof(IntegerIndexData)), - // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). - static constexpr int32_t kNumDataThresholdForBucketSplit = 170; + // If total # data of adjacent buckets exceed this threshold, then flush the + // accumulated data. 
Otherwise merge buckets and their data. + static constexpr double kNumDataThresholdRatioForBucketMerge = 0.7; // Length threshold to sort and merge unsorted buckets into sorted buckets. If // the length of unsorted_buckets exceed the threshold, then call // SortBuckets(). - static constexpr int32_t kUnsortedBucketsLengthThreshold = 50; + // TODO(b/259743562): decide if removing unsorted buckets given that we + // changed bucket splitting threshold and # of buckets are small now. + static constexpr int32_t kUnsortedBucketsLengthThreshold = 5; // Creates a new IntegerIndexStorage instance to index integers (for a single // property). If any of the underlying file is missing, then delete the whole @@ -272,6 +299,8 @@ class IntegerIndexStorage : public PersistentStorage { // // Returns: // - OK on success + // - RESOURCE_EXHAUSTED_ERROR if # of integers in this storage exceed + // INT_MAX after adding new_keys // - Any FileBackedVector or PostingList errors libtextclassifier3::Status AddKeys(DocumentId document_id, SectionId section_id, @@ -314,6 +343,8 @@ class IntegerIndexStorage : public PersistentStorage { int32_t num_data() const { return info().num_data; } private: + static constexpr int8_t kNumDataAfterSplitAdjustment = 5; + explicit IntegerIndexStorage( const Filesystem& filesystem, std::string&& working_path, Options&& options, @@ -329,7 +360,9 @@ class IntegerIndexStorage : public PersistentStorage { metadata_mmapped_file_(std::move(metadata_mmapped_file)), sorted_buckets_(std::move(sorted_buckets)), unsorted_buckets_(std::move(unsorted_buckets)), - flash_index_storage_(std::move(flash_index_storage)) {} + flash_index_storage_(std::move(flash_index_storage)), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> InitializeNewFiles( @@ -360,20 +393,20 @@ class IntegerIndexStorage : public PersistentStorage { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - 
libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Flushes contents of metadata file. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. Checksums of sorted_buckets_, // unsorted_buckets_ will be combined together by XOR. @@ -382,7 +415,8 @@ class IntegerIndexStorage : public PersistentStorage { // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; // Helper function to add keys in range [it_start, it_end) into the given // bucket. It handles the bucket and its corresponding posting list(s) to make @@ -442,6 +476,17 @@ class IntegerIndexStorage : public PersistentStorage { kInfoMetadataFileOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + Options options_; PostingListIntegerIndexSerializer* posting_list_serializer_; // Does not own. 
@@ -450,6 +495,9 @@ class IntegerIndexStorage : public PersistentStorage { std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets_; std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets_; std::unique_ptr<FlashIndexStorage> flash_index_storage_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc index bf5f134..85d381d 100644 --- a/icing/index/numeric/integer-index-storage_benchmark.cc +++ b/icing/index/numeric/integer-index-storage_benchmark.cc @@ -68,6 +68,8 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; +static constexpr int32_t kNumDataThresholdForBucketSplit = + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit; static constexpr bool kPreMappingFbv = true; static constexpr SectionId kDefaultSectionId = 12; @@ -150,11 +152,13 @@ void BM_Index(benchmark::State& state) { state.PauseTiming(); benchmark.filesystem.DeleteDirectoryRecursively( benchmark.working_path.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create( - benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); state.ResumeTiming(); for (int i = 0; i < num_keys; ++i) { @@ -210,11 +214,13 @@ void BM_BatchIndex(benchmark::State& state) { state.PauseTiming(); benchmark.filesystem.DeleteDirectoryRecursively( benchmark.working_path.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create( - benchmark.filesystem, benchmark.working_path, - 
IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); std::vector<int64_t> keys_copy(keys); state.ResumeTiming(); @@ -263,9 +269,11 @@ void BM_ExactQuery(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumberGenerator<int64_t>> generator, CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys)); @@ -340,9 +348,11 @@ void BM_RangeQueryAll(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumberGenerator<int64_t>> generator, CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys)); diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc index 4d4e665..a632bc8 100644 --- a/icing/index/numeric/integer-index-storage_test.cc +++ b/icing/index/numeric/integer-index-storage_test.cc 
@@ -30,8 +30,6 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" -#include "icing/file/posting_list/flash-index-storage.h" -#include "icing/file/posting_list/index-block.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -59,6 +57,7 @@ using ::testing::IsFalse; using ::testing::IsTrue; using ::testing::Key; using ::testing::Le; +using ::testing::Lt; using ::testing::Ne; using ::testing::Not; @@ -106,7 +105,32 @@ libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( } TEST_P(IntegerIndexStorageTest, OptionsEmptyCustomInitBucketsShouldBeValid) { - EXPECT_THAT(Options(/*pre_mapping_fbv_in=*/GetParam()).IsValid(), IsTrue()); + EXPECT_THAT( + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsTrue()); +} + +TEST_P(IntegerIndexStorageTest, OptionsInvalidNumDataThresholdForBucketSplit) { + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + /*num_data_threshold_for_bucket_split=*/-1, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + /*num_data_threshold_for_bucket_split=*/0, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + /*num_data_threshold_for_bucket_split=*/63, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { @@ -116,6 +140,7 @@ TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)}, /*custom_init_unsorted_buckets_in=*/ {Bucket(10, 
std::numeric_limits<int64_t>::max())}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()) .IsValid(), IsFalse()); @@ -126,6 +151,7 @@ TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { {Bucket(10, std::numeric_limits<int64_t>::max())}, /*custom_init_unsorted_buckets_in=*/ {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()) .IsValid(), IsFalse()); @@ -138,91 +164,109 @@ TEST_P(IntegerIndexStorageTest, ASSERT_THAT(valid_posting_list_identifier.is_valid(), IsTrue()); // Invalid custom init sorted bucket - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), - std::numeric_limits<int64_t>::max(), - valid_posting_list_identifier)}, - /*custom_init_unsorted_buckets_in=*/{}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max(), + valid_posting_list_identifier)}, + /*custom_init_unsorted_buckets_in=*/{}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Invalid custom init unsorted bucket - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), - std::numeric_limits<int64_t>::max(), - valid_posting_list_identifier)}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max(), + valid_posting_list_identifier)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } 
TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsOverlapping) { // sorted buckets overlap - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100), - Bucket(-100, std::numeric_limits<int64_t>::max())}, - /*custom_init_unsorted_buckets_in=*/{}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-100, std::numeric_limits<int64_t>::max())}, + /*custom_init_unsorted_buckets_in=*/{}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // unsorted buckets overlap - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(-100, std::numeric_limits<int64_t>::max()), - Bucket(std::numeric_limits<int64_t>::min(), -100)}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(-100, std::numeric_limits<int64_t>::max()), + Bucket(std::numeric_limits<int64_t>::min(), -100)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Cross buckets overlap - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100), - Bucket(-99, 0)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(200, std::numeric_limits<int64_t>::max()), - Bucket(0, 50), Bucket(51, 199)}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(200, std::numeric_limits<int64_t>::max()), Bucket(0, 50), + Bucket(51, 199)}, + 
IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsUnion) { // Missing INT64_MAX - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100), - Bucket(-99, 0)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(1, 1000)}, /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/{Bucket(1, 1000)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Missing INT64_MIN - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(-200, -100), Bucket(-99, 0)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(1, std::numeric_limits<int64_t>::max())}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(-200, -100), Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, std::numeric_limits<int64_t>::max())}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Missing some intermediate ranges - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(1, std::numeric_limits<int64_t>::max())}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, std::numeric_limits<int64_t>::max())}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + 
.IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, InvalidWorkingPath) { EXPECT_THAT( IntegerIndexStorage::Create( filesystem_, "/dev/null/integer_index_storage_test", - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()), + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } @@ -232,6 +276,7 @@ TEST_P(IntegerIndexStorageTest, CreateWithInvalidOptionsShouldFail) { /*custom_init_unsorted_buckets_in=*/ {Bucket(-100, std::numeric_limits<int64_t>::max()), Bucket(std::numeric_limits<int64_t>::min(), -100)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()); ASSERT_THAT(invalid_options.IsValid(), IsFalse()); @@ -246,9 +291,11 @@ TEST_P(IntegerIndexStorageTest, InitializeNewFiles) { ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->PersistToDisk()); } @@ -290,9 +337,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Insert some data. 
ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20, @@ -305,9 +354,11 @@ TEST_P(IntegerIndexStorageTest, // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. EXPECT_THAT( - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get()), + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } @@ -315,9 +366,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedWithPersistToDisk) { // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage1, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Insert some data. 
ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/0, /*section_id=*/20, @@ -339,9 +392,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage2, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( Query(storage2.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), /*key_upper=*/std::numeric_limits<int64_t>::max()), @@ -355,9 +410,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedAfterDestruction) { // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Insert some data. ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20, @@ -380,9 +437,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedAfterDestruction) { // we should be able to get the same contents. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), /*key_upper=*/std::numeric_limits<int64_t>::max()), @@ -397,9 +456,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -428,7 +489,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -442,9 +505,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + 
Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -474,7 +539,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -488,9 +555,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -522,7 +591,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -536,9 +607,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( 
std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -572,7 +645,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -586,14 +661,119 @@ TEST_P(IntegerIndexStorageTest, InvalidQuery) { // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->GetIterator(/*query_key_lower=*/0, /*query_key_upper=*/-1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_P(IntegerIndexStorageTest, AddKeysShouldUpdateNumData) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. 
+ std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); + + // Add some keys into buckets [(-1000,-100), (200,300), (-99,-1)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-51, -500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{201, 209, -149}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{208}), + IsOk()); + EXPECT_THAT(storage->num_data(), Eq(6)); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + // Check sorted_buckets manually. 
+ const std::string sorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, sorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(sorted_buckets->num_elements(), Eq(5)); + + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk1, + sorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(sbk1->key_lower(), Eq(-1000)); + EXPECT_THAT(sbk1->key_upper(), Eq(-100)); + EXPECT_THAT(sbk1->num_data(), Eq(2)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk2, + sorted_buckets->Get(/*idx=*/1)); + EXPECT_THAT(sbk2->key_lower(), Eq(0)); + EXPECT_THAT(sbk2->key_upper(), Eq(100)); + EXPECT_THAT(sbk2->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk3, + sorted_buckets->Get(/*idx=*/2)); + EXPECT_THAT(sbk3->key_lower(), Eq(150)); + EXPECT_THAT(sbk3->key_upper(), Eq(199)); + EXPECT_THAT(sbk3->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk4, + sorted_buckets->Get(/*idx=*/3)); + EXPECT_THAT(sbk4->key_lower(), Eq(200)); + EXPECT_THAT(sbk4->key_upper(), Eq(300)); + EXPECT_THAT(sbk4->num_data(), Eq(3)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk5, + sorted_buckets->Get(/*idx=*/4)); + EXPECT_THAT(sbk5->key_lower(), Eq(301)); + EXPECT_THAT(sbk5->key_upper(), Eq(999)); + EXPECT_THAT(sbk5->num_data(), Eq(0)); + + // Check unsorted_buckets and unsorted buckets manually. 
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, unsorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(unsorted_buckets->num_elements(), Eq(4)); + + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk1, + unsorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(ubk1->key_lower(), Eq(1000)); + EXPECT_THAT(ubk1->key_upper(), Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(ubk1->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk2, + unsorted_buckets->Get(/*idx=*/1)); + EXPECT_THAT(ubk2->key_lower(), Eq(-99)); + EXPECT_THAT(ubk2->key_upper(), Eq(-1)); + EXPECT_THAT(ubk2->num_data(), Eq(1)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk3, + unsorted_buckets->Get(/*idx=*/2)); + EXPECT_THAT(ubk3->key_lower(), Eq(101)); + EXPECT_THAT(ubk3->key_upper(), Eq(149)); + EXPECT_THAT(ubk3->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk4, + unsorted_buckets->Get(/*idx=*/3)); + EXPECT_THAT(ubk4->key_lower(), Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(ubk4->key_upper(), Eq(-1001)); + EXPECT_THAT(ubk4->num_data(), Eq(0)); +} + TEST_P(IntegerIndexStorageTest, ExactQuerySortedBuckets) { // We use predefined custom buckets to initialize new integer index storage // and create some test keys accordingly. 
@@ -609,6 +789,7 @@ TEST_P(IntegerIndexStorageTest, ExactQuerySortedBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -664,6 +845,7 @@ TEST_P(IntegerIndexStorageTest, ExactQueryUnsortedBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -725,6 +907,7 @@ TEST_P(IntegerIndexStorageTest, ExactQueryIdenticalKeys) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -768,6 +951,7 @@ TEST_P(IntegerIndexStorageTest, RangeQueryEmptyIntegerIndexStorage) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -792,6 +976,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySingleEntireSortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -847,6 +1032,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySingleEntireUnsortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -905,6 +1091,7 @@ TEST_P(IntegerIndexStorageTest, 
RangeQuerySinglePartialSortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -957,6 +1144,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySinglePartialUnsortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1009,6 +1197,7 @@ TEST_P(IntegerIndexStorageTest, RangeQueryMultipleBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1098,6 +1287,7 @@ TEST_P(IntegerIndexStorageTest, BatchAdd) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1126,9 +1316,11 @@ TEST_P(IntegerIndexStorageTest, BatchAdd) { TEST_P(IntegerIndexStorageTest, BatchAddShouldDedupeKeys) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); std::vector<int64_t> keys = {2, 3, 1, 2, 4, -1, -1, 100, 3}; EXPECT_THAT( @@ -1152,6 +1344,7 @@ TEST_P(IntegerIndexStorageTest, MultipleKeysShouldMergeAndDedupeDocHitInfo) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), 
std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1188,6 +1381,7 @@ TEST_P(IntegerIndexStorageTest, filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1235,26 +1429,151 @@ TEST_P(IntegerIndexStorageTest, EqualsDocHitInfo(kDefaultDocumentId, expected_sections)))); } +TEST_P(IntegerIndexStorageTest, IteratorCallStatsMultipleBuckets) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); + + // Add some keys into sorted buckets [(-1000,-100), (200,300)]. 
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-500})); + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{208})); + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{-200})); + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{-1000})); + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/4, kDefaultSectionId, + /*new_keys=*/{300})); + ASSERT_THAT(storage->num_data(), Eq(5)); + + // GetIterator for range [INT_MIN, INT_MAX] and Advance all. Those 5 keys are + // in 2 buckets, so we will be inspecting 2 posting lists in 2 blocks. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iter1, + storage->GetIterator(/*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max())); + while (iter1->Advance().ok()) { + // Advance all hits. + } + EXPECT_THAT( + iter1->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/5, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/2)); + + // GetIterator for range [-1000, -100] and Advance all. Since we only have to + // read bucket (-1000,-100), there will be 3 advance calls and 1 block + // inspected. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iter2, + storage->GetIterator(/*key_lower=*/-1000, /*key_upper=*/-100)); + while (iter2->Advance().ok()) { + // Advance all hits. 
+ } + EXPECT_THAT( + iter2->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/3, + /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1)); +} + +TEST_P(IntegerIndexStorageTest, IteratorCallStatsSingleBucketChainedBlocks) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); + + int32_t num_keys_to_add = 800; + ASSERT_THAT(num_keys_to_add, + Lt(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit)); + for (int i = 0; i < num_keys_to_add; ++i) { + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/i, kDefaultSectionId, + /*new_keys=*/{i})); + } + + // Those 800 keys are in 1 single bucket with 3 chained posting lists, so we + // will be inspecting 3 blocks. + int32_t expected_num_blocks_inspected = 3; + + // GetIterator for range [INT_MIN, INT_MAX] and Advance all. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iter1, + storage->GetIterator(/*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max())); + while (iter1->Advance().ok()) { + // Advance all hits. + } + EXPECT_THAT(iter1->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/num_keys_to_add, + /*num_leaf_advance_calls_no_index=*/0, + expected_num_blocks_inspected)); + + // GetIterator for range [1, 1] and Advance all. 
Although there is only 1 + // relevant data, we still have to inspect the entire bucket and its posting + // lists chain (which contain 3 blocks and 800 data). + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iter2, + storage->GetIterator(/*key_lower=*/1, /*key_upper=*/1)); + while (iter2->Advance().ok()) { + // Advance all hits. + } + EXPECT_THAT(iter2->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/num_keys_to_add, + /*num_leaf_advance_calls_no_index=*/0, + expected_num_blocks_inspected)); +} + TEST_P(IntegerIndexStorageTest, SplitBuckets) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); - - uint32_t block_size = FlashIndexStorage::SelectBlockSize(); - uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( - block_size, serializer_->GetDataTypeBytes()); - uint32_t max_num_data_before_split = - max_posting_list_bytes / serializer_->GetDataTypeBytes(); - - // Add max_num_data_before_split + 1 keys to invoke bucket splitting. - // Keys: max_num_data_before_split to 0 - // Document ids: 0 to max_num_data_before_split + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); + + // Add custom_num_data_threshold_for_bucket_split + 1 keys to invoke bucket + // splitting. 
+ // - Keys: custom_num_data_threshold_for_bucket_split to 0 + // - Document ids: 0 to custom_num_data_threshold_for_bucket_split std::unordered_map<int64_t, DocumentId> data; - int64_t key = max_num_data_before_split; + int64_t key = custom_num_data_threshold_for_bucket_split; DocumentId document_id = 0; - for (int i = 0; i < max_num_data_before_split + 1; ++i) { + for (int i = 0; i < custom_num_data_threshold_for_bucket_split + 1; ++i) { data[key] = document_id; ICING_ASSERT_OK( storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); @@ -1299,7 +1618,8 @@ TEST_P(IntegerIndexStorageTest, SplitBuckets) { // Ensure that search works normally. std::vector<SectionId> expected_sections = {kDefaultSectionId}; - for (int64_t key = max_num_data_before_split; key >= 0; key--) { + for (int64_t key = custom_num_data_threshold_for_bucket_split; key >= 0; + key--) { ASSERT_THAT(data, Contains(Key(key))); DocumentId expected_document_id = data[key]; EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), @@ -1309,20 +1629,21 @@ TEST_P(IntegerIndexStorageTest, SplitBuckets) { } TEST_P(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); - - uint32_t block_size = FlashIndexStorage::SelectBlockSize(); - uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( - block_size, serializer_->GetDataTypeBytes()); - uint32_t max_num_data_before_split = - max_posting_list_bytes / serializer_->GetDataTypeBytes(); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Add
IntegerIndexStorage::kUnsortedBucketsLengthThreshold keys. For each - // key, add max_num_data_before_split + 1 data. Then we will get: + // key, add custom_num_data_threshold_for_bucket_split + 1 data. Then we will + // get: // - Bucket splitting will create kUnsortedBucketsLengthThreshold + 1 unsorted // buckets [[50, 50], [49, 49], ..., [1, 1], [51, INT64_MAX]]. // - Since there are kUnsortedBucketsLengthThreshold + 1 unsorted buckets, we @@ -1332,7 +1653,7 @@ TEST_P(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) { DocumentId document_id = 0; for (int i = 0; i < IntegerIndexStorage::kUnsortedBucketsLengthThreshold; ++i) { - for (int j = 0; j < max_num_data_before_split + 1; ++j) { + for (int j = 0; j < custom_num_data_threshold_for_bucket_split + 1; ++j) { data[key].push_back(document_id); ICING_ASSERT_OK( storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); @@ -1396,6 +1717,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1433,9 +1755,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1445,9 +1769,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { // Verify after transferring and reinitializing the instance. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); std::vector<SectionId> expected_sections = {kDefaultSectionId}; EXPECT_THAT(new_storage->num_data(), Eq(7)); @@ -1493,9 +1819,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { TEST_P(IntegerIndexStorageTest, TransferIndexOutOfRangeDocumentId) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, /*new_keys=*/{120})); @@ -1510,9 +1838,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexOutOfRangeDocumentId) { // Transfer to new storage. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT(storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1542,6 +1872,7 @@ TEST_P(IntegerIndexStorageTest, TransferEmptyIndex) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); ASSERT_THAT(storage->num_data(), Eq(0)); @@ -1552,9 +1883,11 @@ TEST_P(IntegerIndexStorageTest, TransferEmptyIndex) { // Transfer to new storage. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT(storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1581,6 +1914,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1605,9 +1939,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, 
working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1617,9 +1953,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { // Verify after transferring and reinitializing the instance. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); std::vector<SectionId> expected_sections = {kDefaultSectionId}; EXPECT_THAT(new_storage->num_data(), Eq(0)); @@ -1630,6 +1968,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { } TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + int32_t custom_num_data_threshold_for_bucket_merge = + IntegerIndexStorage::kNumDataThresholdRatioForBucketMerge * + custom_num_data_threshold_for_bucket_split; + // This test verifies that TransferIndex invokes bucket merging logic to make + // sure we're able to avoid having mostly empty buckets after inserting + // and deleting data for many rounds.
@@ -1648,6 +1991,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + custom_num_data_threshold_for_bucket_split, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1671,7 +2015,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { /*new_keys=*/{20})); ASSERT_THAT(storage->num_data(), Eq(9)); ASSERT_THAT(storage->num_data(), - Le(IntegerIndexStorage::kNumDataThresholdForBucketMerge)); + Le(custom_num_data_threshold_for_bucket_merge)); // Create document_id_old_to_new that keeps all existing documents. std::vector<DocumentId> document_id_old_to_new(9); @@ -1683,12 +2027,17 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, new_storage_working_path, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, new_storage_working_path, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); + EXPECT_THAT(new_storage->num_data(), Eq(9)); } // Check new_storage->sorted_bucket_ manually. 
@@ -1704,9 +2053,15 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk1, sorted_buckets->Get(/*idx=*/0)); EXPECT_THAT(bk1->key_lower(), Eq(std::numeric_limits<int64_t>::min())); EXPECT_THAT(bk1->key_upper(), Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(bk1->num_data(), Eq(9)); } TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + int32_t custom_num_data_threshold_for_bucket_merge = + IntegerIndexStorage::kNumDataThresholdRatioForBucketMerge * + custom_num_data_threshold_for_bucket_split; + // This test verifies that if TransferIndex invokes bucket merging logic and // doesn't merge buckets too aggressively to ensure we won't get a bucket with // too many data. @@ -1725,15 +2080,16 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + custom_num_data_threshold_for_bucket_split, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); // Insert data into 2 buckets so that total # of these 2 buckets exceed - // kNumDataThresholdForBucketMerge. + // custom_num_data_threshold_for_bucket_merge. 
// - Bucket 1: [-1000, -100] // - Bucket 2: [101, 149] DocumentId document_id = 0; - int num_data_for_bucket1 = 200; + int num_data_for_bucket1 = custom_num_data_threshold_for_bucket_merge - 50; for (int i = 0; i < num_data_for_bucket1; ++i) { ICING_ASSERT_OK(storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{-200})); @@ -1747,8 +2103,10 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { ++document_id; } + ASSERT_THAT(storage->num_data(), + Eq(num_data_for_bucket1 + num_data_for_bucket2)); ASSERT_THAT(num_data_for_bucket1 + num_data_for_bucket2, - Gt(IntegerIndexStorage::kNumDataThresholdForBucketMerge)); + Gt(custom_num_data_threshold_for_bucket_merge)); // Create document_id_old_to_new that keeps all existing documents. std::vector<DocumentId> document_id_old_to_new(document_id); @@ -1760,12 +2118,18 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, new_storage_working_path, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, new_storage_working_path, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); + EXPECT_THAT(new_storage->num_data(), + Eq(num_data_for_bucket1 + num_data_for_bucket2)); } // Check new_storage->sorted_bucket_ manually. 
@@ -1781,9 +2145,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk1, sorted_buckets->Get(/*idx=*/0)); EXPECT_THAT(bk1->key_lower(), Eq(std::numeric_limits<int64_t>::min())); EXPECT_THAT(bk1->key_upper(), Eq(100)); + EXPECT_THAT(bk1->num_data(), Eq(num_data_for_bucket1)); ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk2, sorted_buckets->Get(/*idx=*/1)); EXPECT_THAT(bk2->key_lower(), Eq(101)); EXPECT_THAT(bk2->key_upper(), Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(bk2->num_data(), Eq(num_data_for_bucket2)); } INSTANTIATE_TEST_SUITE_P(IntegerIndexStorageTest, IntegerIndexStorageTest, diff --git a/icing/index/numeric/integer-index.cc b/icing/index/numeric/integer-index.cc index 5fa82a5..8c80698 100644 --- a/icing/index/numeric/integer-index.cc +++ b/icing/index/numeric/integer-index.cc @@ -91,7 +91,7 @@ libtextclassifier3::StatusOr<IntegerIndex::PropertyToStorageMapType> GetPropertyIntegerIndexStorageMap( const Filesystem& filesystem, const std::string& working_path, PostingListIntegerIndexSerializer* posting_list_serializer, - bool pre_mapping_fbv) { + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { ICING_ASSIGN_OR_RETURN(std::vector<std::string> property_paths, GetAllExistingPropertyPaths(filesystem, working_path)); @@ -102,11 +102,13 @@ GetPropertyIntegerIndexStorageMap( } std::string storage_working_path = GetPropertyIndexStoragePath(working_path, property_path); - ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create( - filesystem, storage_working_path, - IntegerIndexStorage::Options(pre_mapping_fbv), - posting_list_serializer)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem, storage_working_path, + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split, + pre_mapping_fbv), + posting_list_serializer)); property_to_storage_map.insert( 
std::make_pair(property_path, std::move(storage))); } @@ -141,6 +143,8 @@ libtextclassifier3::StatusOr<std::unordered_set<std::string>> CreatePropertySet( } // namespace libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { + integer_index_.SetDirty(); + auto iter = integer_index_.property_to_storage_map_.find(property_path_); IntegerIndexStorage* target_storage = nullptr; // 1. Check if this property already has its own individual index. @@ -161,7 +165,8 @@ libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { integer_index_.filesystem_, GetPropertyIndexStoragePath(integer_index_.working_path_, kWildcardPropertyIndexFileName), - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), integer_index_.posting_list_serializer_.get())); } ICING_RETURN_IF_ERROR( @@ -175,7 +180,8 @@ libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { integer_index_.filesystem_, GetPropertyIndexStoragePath(integer_index_.working_path_, property_path_), - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), integer_index_.posting_list_serializer_.get())); target_storage = new_storage.get(); integer_index_.property_to_storage_map_.insert( @@ -188,6 +194,7 @@ libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> IntegerIndex::Create(const Filesystem& filesystem, std::string working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str())) { // Discard working_path if metadata file is missing, and reinitialize. 
@@ -195,9 +202,11 @@ IntegerIndex::Create(const Filesystem& filesystem, std::string working_path, ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); } return InitializeNewFiles(filesystem, std::move(working_path), + num_data_threshold_for_bucket_split, pre_mapping_fbv); } return InitializeExistingFiles(filesystem, std::move(working_path), + num_data_threshold_for_bucket_split, pre_mapping_fbv); } @@ -227,7 +236,7 @@ IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower, std::unique_ptr<DocHitInfoIterator> delegate, wildcard_index_storage_->GetIterator(key_lower, key_upper)); std::set<std::string> property_paths = {std::move(property_path_str)}; - return std::make_unique<DocHitInfoIteratorSectionRestrict>( + return DocHitInfoIteratorSectionRestrict::ApplyRestrictions( std::move(delegate), &document_store, &schema_store, std::move(property_paths), current_time_ms); } @@ -239,6 +248,8 @@ IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower, libtextclassifier3::Status IntegerIndex::AddPropertyToWildcardStorage( const std::string& property_path) { + SetDirty(); + WildcardPropertyStorage wildcard_properties; wildcard_properties.mutable_property_entries()->Reserve( wildcard_properties_set_.size()); @@ -272,7 +283,8 @@ libtextclassifier3::Status IntegerIndex::Optimize( // we can safely swap directories later. 
ICING_ASSIGN_OR_RETURN( std::unique_ptr<IntegerIndex> new_integer_index, - Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_)); + Create(filesystem_, temp_working_path_ddir.dir(), + num_data_threshold_for_bucket_split_, pre_mapping_fbv_)); ICING_RETURN_IF_ERROR( TransferIndex(document_id_old_to_new, new_integer_index.get())); new_integer_index->set_last_added_document_id(new_last_added_document_id); @@ -322,20 +334,24 @@ libtextclassifier3::Status IntegerIndex::Optimize( filesystem_, GetPropertyIndexStoragePath(working_path_, kWildcardPropertyIndexFileName), - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), posting_list_serializer_.get())); } // Initialize all existing integer index storages. - ICING_ASSIGN_OR_RETURN(property_to_storage_map_, - GetPropertyIntegerIndexStorageMap( - filesystem_, working_path_, - posting_list_serializer_.get(), pre_mapping_fbv_)); + ICING_ASSIGN_OR_RETURN( + property_to_storage_map_, + GetPropertyIntegerIndexStorageMap( + filesystem_, working_path_, posting_list_serializer_.get(), + num_data_threshold_for_bucket_split_, pre_mapping_fbv_)); return libtextclassifier3::Status::OK; } libtextclassifier3::Status IntegerIndex::Clear() { + SetDirty(); + // Step 1: clear property_to_storage_map_. property_to_storage_map_.clear(); wildcard_index_storage_.reset(); @@ -367,6 +383,7 @@ libtextclassifier3::Status IntegerIndex::Clear() { /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { // Create working directory. 
if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) { @@ -399,12 +416,14 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), /*property_to_storage_map=*/{}, std::move(wildcard_property_storage), /*wildcard_properties_set=*/{}, /*wildcard_index_storage=*/nullptr, - pre_mapping_fbv)); + num_data_threshold_for_bucket_split, pre_mapping_fbv)); // Initialize info content by writing mapped memory directly. Info& info_ref = new_integer_index->info(); info_ref.magic = Info::kMagic; info_ref.last_added_document_id = kInvalidDocumentId; + info_ref.num_data_threshold_for_bucket_split = + num_data_threshold_for_bucket_split; // Initialize new PersistentStorage. The initial checksums will be computed // and set via InitializeNewStorage. ICING_RETURN_IF_ERROR(new_integer_index->InitializeNewStorage()); @@ -413,9 +432,9 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, } /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> -IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, - std::string&& working_path, - bool pre_mapping_fbv) { +IntegerIndex::InitializeExistingFiles( + const Filesystem& filesystem, std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { // Mmap the content of the crcs and info. ICING_ASSIGN_OR_RETURN( MemoryMappedFile metadata_mmapped_file, @@ -432,10 +451,11 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, std::make_unique<PostingListIntegerIndexSerializer>(); // Initialize all existing integer index storages. 
- ICING_ASSIGN_OR_RETURN(PropertyToStorageMapType property_to_storage_map, - GetPropertyIntegerIndexStorageMap( - filesystem, working_path, - posting_list_serializer.get(), pre_mapping_fbv)); + ICING_ASSIGN_OR_RETURN( + PropertyToStorageMapType property_to_storage_map, + GetPropertyIntegerIndexStorageMap( + filesystem, working_path, posting_list_serializer.get(), + num_data_threshold_for_bucket_split, pre_mapping_fbv)); std::string wildcard_property_path = GetWildcardPropertyStorageFilePath(working_path); @@ -455,7 +475,8 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, filesystem, GetPropertyIndexStoragePath(working_path, kWildcardPropertyIndexFileName), - IntegerIndexStorage::Options(pre_mapping_fbv), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split, + pre_mapping_fbv), posting_list_serializer.get())); } @@ -465,7 +486,7 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), std::move(property_to_storage_map), std::move(wildcard_property_storage), std::move(wildcard_properties_set), std::move(wildcard_index_storage), - pre_mapping_fbv)); + num_data_threshold_for_bucket_split, pre_mapping_fbv)); // Initialize existing PersistentStorage. Checksums will be validated. ICING_RETURN_IF_ERROR(integer_index->InitializeExistingStorage()); @@ -474,6 +495,14 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, return absl_ports::FailedPreconditionError("Incorrect magic value"); } + // If num_data_threshold_for_bucket_split mismatches, then return error to let + // caller rebuild. 
+ if (integer_index->info().num_data_threshold_for_bucket_split != + num_data_threshold_for_bucket_split) { + return absl_ports::FailedPreconditionError( + "Mismatch num_data_threshold_for_bucket_split"); + } + return integer_index; } @@ -488,7 +517,8 @@ IntegerIndex::TransferIntegerIndexStorage( std::unique_ptr<IntegerIndexStorage> new_storage, IntegerIndexStorage::Create( new_integer_index->filesystem_, new_storage_working_path, - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), new_integer_index->posting_list_serializer_.get())); ICING_RETURN_IF_ERROR( @@ -552,7 +582,11 @@ libtextclassifier3::Status IntegerIndex::TransferIndex( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() { +libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk(bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + for (auto& [_, storage] : property_to_storage_map_) { ICING_RETURN_IF_ERROR(storage->PersistToDisk()); } @@ -564,18 +598,32 @@ libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IntegerIndex::PersistMetadataToDisk() { +libtextclassifier3::Status IntegerIndex::PersistMetadataToDisk(bool force) { + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + // Changes should have been applied to the underlying file when using // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an // extra safety step to ensure they are written out. 
return metadata_mmapped_file_->PersistToDisk(); } -libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } -libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() { +libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum( + bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + // XOR all crcs of all storages. Since XOR is commutative and associative, // the order doesn't matter. uint32_t storages_checksum = 0; diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h index 30f9852..e7a3127 100644 --- a/icing/index/numeric/integer-index.h +++ b/icing/index/numeric/integer-index.h @@ -55,25 +55,29 @@ class IntegerIndex : public NumericIndex<int64_t> { // 'wildcard' storage. 
static constexpr int kMaxPropertyStorages = 32; + static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit; + struct Info { - static constexpr int32_t kMagic = 0x238a3dcb; + static constexpr int32_t kMagic = 0x5d8a1e8a; int32_t magic; DocumentId last_added_document_id; + int32_t num_data_threshold_for_bucket_split; Crc32 ComputeChecksum() const { return Crc32( std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); } } __attribute__((packed)); - static_assert(sizeof(Info) == 8, ""); + static_assert(sizeof(Info) == 12, ""); // Metadata file layout: <Crcs><Info> static constexpr int32_t kCrcsMetadataFileOffset = 0; static constexpr int32_t kInfoMetadataFileOffset = static_cast<int32_t>(sizeof(Crcs)); static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); - static_assert(kMetadataFileSize == 20, ""); + static_assert(kMetadataFileSize == 24, ""); static constexpr WorkingPathType kWorkingPathType = WorkingPathType::kDirectory; @@ -90,6 +94,8 @@ class IntegerIndex : public NumericIndex<int64_t> { // related files will be stored under this directory. See // PersistentStorage for more details about the concept of // working_path. + // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for + // more details. // pre_mapping_fbv: flag indicating whether memory map max possible file size // for underlying FileBackedVector before growing the actual // file size. @@ -101,7 +107,7 @@ class IntegerIndex : public NumericIndex<int64_t> { // - Any FileBackedVector/MemoryMappedFile errors. static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create( const Filesystem& filesystem, std::string working_path, - bool pre_mapping_fbv); + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv); // Deletes IntegerIndex under working_path. 
// @@ -122,7 +128,8 @@ class IntegerIndex : public NumericIndex<int64_t> { std::string_view property_path, DocumentId document_id, SectionId section_id) override { return std::make_unique<Editor>(property_path, document_id, section_id, - *this, pre_mapping_fbv_); + *this, num_data_threshold_for_bucket_split_, + pre_mapping_fbv_); } // Returns a DocHitInfoIterator for iterating through all docs which have the @@ -172,6 +179,8 @@ class IntegerIndex : public NumericIndex<int64_t> { } void set_last_added_document_id(DocumentId document_id) override { + SetInfoDirty(); + Info& info_ref = info(); if (info_ref.last_added_document_id == kInvalidDocumentId || document_id > info_ref.last_added_document_id) { @@ -189,9 +198,12 @@ class IntegerIndex : public NumericIndex<int64_t> { public: explicit Editor(std::string_view property_path, DocumentId document_id, SectionId section_id, IntegerIndex& integer_index, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) : NumericIndex<int64_t>::Editor(property_path, document_id, section_id), integer_index_(integer_index), + num_data_threshold_for_bucket_split_( + num_data_threshold_for_bucket_split), pre_mapping_fbv_(pre_mapping_fbv) {} ~Editor() override = default; @@ -211,6 +223,8 @@ class IntegerIndex : public NumericIndex<int64_t> { IntegerIndex& integer_index_; // Does not own. + int32_t num_data_threshold_for_bucket_split_; + // Flag indicating whether memory map max possible file size for underlying // FileBackedVector before growing the actual file size. 
bool pre_mapping_fbv_; @@ -226,7 +240,7 @@ class IntegerIndex : public NumericIndex<int64_t> { wildcard_property_storage, std::unordered_set<std::string> wildcard_properties_set, std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage, - bool pre_mapping_fbv) + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) : NumericIndex<int64_t>(filesystem, std::move(working_path), kWorkingPathType), posting_list_serializer_(std::move(posting_list_serializer)), @@ -235,15 +249,22 @@ class IntegerIndex : public NumericIndex<int64_t> { wildcard_property_storage_(std::move(wildcard_property_storage)), wildcard_properties_set_(std::move(wildcard_properties_set)), wildcard_index_storage_(std::move(wildcard_index_storage)), - pre_mapping_fbv_(pre_mapping_fbv) {} + num_data_threshold_for_bucket_split_( + num_data_threshold_for_bucket_split), + pre_mapping_fbv_(pre_mapping_fbv), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv); static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> InitializeExistingFiles(const Filesystem& filesystem, - std::string&& working_path, bool pre_mapping_fbv); + std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, + bool pre_mapping_fbv); // Adds the property path to the list of properties using wildcard storage. // This will both update the in-memory list (wildcard_properties_set_) and @@ -296,20 +317,20 @@ class IntegerIndex : public NumericIndex<int64_t> { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Flushes contents of metadata file. 
// // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. Checksums of (storage_crc, // property_path) for all existing property paths will be combined together by @@ -318,7 +339,8 @@ class IntegerIndex : public NumericIndex<int64_t> { // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; Crcs& crcs() override { return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + @@ -340,6 +362,17 @@ class IntegerIndex : public NumericIndex<int64_t> { kInfoMetadataFileOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_; std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; @@ -360,9 +393,14 @@ class IntegerIndex : public NumericIndex<int64_t> { // kMaxPropertyStorages in property_to_storage_map. std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_; + int32_t num_data_threshold_for_bucket_split_; + // Flag indicating whether memory map max possible file size for underlying // FileBackedVector before growing the actual file size. 
bool pre_mapping_fbv_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc index 8a7acb9..3b60001 100644 --- a/icing/index/numeric/integer-index_test.cc +++ b/icing/index/numeric/integer-index_test.cc @@ -14,6 +14,7 @@ #include "icing/index/numeric/integer-index.h" +#include <cstdint> #include <limits> #include <memory> #include <string> @@ -83,13 +84,14 @@ class NumericIndexIntegerTest : public ::testing::Test { filesystem_.CreateDirectoryRecursively(document_store_dir.c_str())); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult doc_store_create_result, - DocumentStore::Create(&filesystem_, document_store_dir, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, document_store_dir, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(doc_store_create_result.document_store); } @@ -114,8 +116,10 @@ class NumericIndexIntegerTest : public ::testing::Test { template <> libtextclassifier3::StatusOr<std::unique_ptr<NumericIndex<int64_t>>> CreateIntegerIndex<IntegerIndex>() { - return IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/false); + return IntegerIndex::Create( + filesystem_, working_path_, /*num_data_threshold_for_bucket_split=*/ + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv=*/false); } template <typename NotIntegerIndexType> @@ -137,9 +141,8 @@ class NumericIndexIntegerTest : public ::testing::Test { 
return absl_ports::InternalError("Unable to create compact directory"); } ICING_ASSIGN_OR_RETURN( - std::vector<DocumentId> docid_map, - doc_store_->OptimizeInto(document_store_compact_dir, nullptr, - /*namespace_id_fingerprint=*/false)); + DocumentStore::OptimizeResult doc_store_optimize_result, + doc_store_->OptimizeInto(document_store_compact_dir, nullptr)); doc_store_.reset(); if (!filesystem_.SwapFiles(document_store_dir.c_str(), @@ -153,15 +156,16 @@ class NumericIndexIntegerTest : public ::testing::Test { ICING_ASSIGN_OR_RETURN( DocumentStore::CreateResult doc_store_create_result, - DocumentStore::Create(&filesystem_, document_store_dir, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, document_store_dir, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(doc_store_create_result.document_store); - return docid_map; + return std::move(doc_store_optimize_result.document_id_old_to_new); } libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( @@ -1126,14 +1130,28 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { /*document_id=*/4, std::vector<SectionId>{kDefaultSectionId})))); } +struct IntegerIndexTestParam { + int32_t num_data_threshold_for_bucket_split; + bool pre_mapping_fbv; + + explicit IntegerIndexTestParam(int32_t num_data_threshold_for_bucket_split_in, + bool pre_mapping_fbv_in) + : num_data_threshold_for_bucket_split( + num_data_threshold_for_bucket_split_in), + pre_mapping_fbv(pre_mapping_fbv_in) {} +}; + // Tests for persistent integer index only -class 
IntegerIndexTest : public NumericIndexIntegerTest<IntegerIndex>, - public ::testing::WithParamInterface<bool> {}; +class IntegerIndexTest + : public NumericIndexIntegerTest<IntegerIndex>, + public ::testing::WithParamInterface<IntegerIndexTestParam> {}; TEST_P(IntegerIndexTest, InvalidWorkingPath) { - EXPECT_THAT(IntegerIndex::Create(filesystem_, "/dev/null/integer_index_test", - /*pre_mapping_fbv=*/GetParam()), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + EXPECT_THAT( + IntegerIndex::Create(filesystem_, "/dev/null/integer_index_test", + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } TEST_P(IntegerIndexTest, InitializeNewFiles) { @@ -1142,7 +1160,8 @@ TEST_P(IntegerIndexTest, InitializeNewFiles) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); ICING_ASSERT_OK(integer_index->PersistToDisk()); } @@ -1160,6 +1179,8 @@ TEST_P(IntegerIndexTest, InitializeNewFiles) { IntegerIndex::kInfoMetadataFileOffset)); EXPECT_THAT(info.magic, Eq(Info::kMagic)); EXPECT_THAT(info.last_added_document_id, Eq(kInvalidDocumentId)); + EXPECT_THAT(info.num_data_threshold_for_bucket_split, + Eq(GetParam().num_data_threshold_for_bucket_split)); // Check crcs section Crcs crcs; @@ -1183,7 +1204,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. 
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, @@ -1195,16 +1217,19 @@ TEST_P(IntegerIndexTest, // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. - EXPECT_THAT(IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_P(IntegerIndexTest, InitializationShouldSucceedWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index1, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index1.get(), kDefaultTestPropertyPath, /*document_id=*/0, @@ -1228,7 +1253,8 @@ TEST_P(IntegerIndexTest, InitializationShouldSucceedWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index2, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index2->last_added_document_id(), Eq(2)); EXPECT_THAT(Query(integer_index2.get(), kDefaultTestPropertyPath, /*key_lower=*/std::numeric_limits<int64_t>::min(), @@ -1243,7 +1269,8 @@ TEST_P(IntegerIndexTest, InitializationShouldSucceedAfterDestruction) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. 
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, @@ -1268,7 +1295,8 @@ TEST_P(IntegerIndexTest, InitializationShouldSucceedAfterDestruction) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->last_added_document_id(), Eq(2)); EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, /*key_lower=*/std::numeric_limits<int64_t>::min(), @@ -1283,7 +1311,8 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithWrongAllCrcShouldFail) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, /*section_id=*/20, /*keys=*/{0, 100, -100}); @@ -1315,8 +1344,10 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithWrongAllCrcShouldFail) { // Attempt to create the integer index with metadata containing corrupted // all_crc. This should fail. 
libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> - integer_index_or = IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()); + integer_index_or = + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv); EXPECT_THAT(integer_index_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(integer_index_or.status().error_message(), @@ -1329,7 +1360,8 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithCorruptedInfoShouldFail) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, /*section_id=*/20, /*keys=*/{0, 100, -100}); @@ -1362,8 +1394,10 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithCorruptedInfoShouldFail) { // Attempt to create the integer index with info that doesn't match its // checksum and confirm that it fails. libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> - integer_index_or = IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()); + integer_index_or = + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv); EXPECT_THAT(integer_index_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(integer_index_or.status().error_message(), @@ -1377,7 +1411,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. 
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, /*section_id=*/20, /*keys=*/{0, 100, -100}); @@ -1400,7 +1435,9 @@ TEST_P(IntegerIndexTest, std::unique_ptr<IntegerIndexStorage> storage, IntegerIndexStorage::Create( filesystem_, std::move(storage_working_path), - IntegerIndexStorage::Options(/*pre_mapping_fbv=*/GetParam()), + IntegerIndexStorage::Options( + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv), &posting_list_integer_index_serializer)); ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, /*section_id=*/4, /*new_keys=*/{3, 4, 5})); @@ -1412,8 +1449,10 @@ TEST_P(IntegerIndexTest, // Attempt to create the integer index with corrupted storages. This should // fail. libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> - integer_index_or = IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()); + integer_index_or = + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv); EXPECT_THAT(integer_index_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(integer_index_or.status().error_message(), @@ -1421,6 +1460,41 @@ TEST_P(IntegerIndexTest, } } +TEST_P( + IntegerIndexTest, + InitializeExistingFilesWithMismatchNumDataThresholdForBucketSplitShouldFail) { + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); + // Insert some data. 
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, + /*section_id=*/20, /*keys=*/{0, 100, -100}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1, + /*section_id=*/2, /*keys=*/{3, -1000, 500}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2, + /*section_id=*/15, /*keys=*/{-6, 321, 98}); + + ICING_ASSERT_OK(integer_index->PersistToDisk()); + } + + { + // Attempt to create the integer index with different + // num_data_threshold_for_bucket_split. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> + integer_index_or = IntegerIndex::Create( + filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split + 1, + GetParam().pre_mapping_fbv); + EXPECT_THAT(integer_index_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(integer_index_or.status().error_message(), + HasSubstr("Mismatch num_data_threshold_for_bucket_split")); + } +} + TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) { // This test sets its schema assuming that max property storages == 32. ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); @@ -1586,7 +1660,8 @@ TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Index numeric content for other properties to force our property into the // wildcard storage. 
@@ -1651,7 +1726,8 @@ TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); @@ -1691,7 +1767,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Doc id = 1: insert 2 data for "prop1", "prop2" Index(integer_index.get(), kPropertyPath2, /*document_id=*/1, kSectionId2, @@ -1742,7 +1819,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Key = 1 EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1, @@ -1968,7 +2046,8 @@ TEST_P(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Index numeric content for other properties to force our property into the // wildcard storage. 
@@ -2067,7 +2146,8 @@ TEST_P(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); @@ -2236,7 +2316,8 @@ TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Index numeric content for other properties to force our property into the // wildcard storage. @@ -2317,7 +2398,8 @@ TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); @@ -2363,8 +2445,152 @@ TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { /*document_id=*/7, expected_sections_typea)))); } -INSTANTIATE_TEST_SUITE_P(IntegerIndexTest, IntegerIndexTest, - testing::Values(true, false)); +TEST_P(IntegerIndexTest, IteratorCallStats) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); + + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, + kDefaultSectionId, /*keys=*/{1}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1, + kDefaultSectionId, /*keys=*/{3}); + Index(integer_index.get(), kDefaultTestPropertyPath, 
/*document_id=*/2, + kDefaultSectionId, /*keys=*/{2}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3, + kDefaultSectionId, /*keys=*/{0}); + + // GetIterator for range [INT_MIN, INT_MAX] and Advance all. Those 4 keys are + // in 1 single bucket, so there will be only 1 posting list (and 1 block). + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iter, + integer_index->GetIterator( + kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max(), *doc_store_, + *schema_store_, clock_.GetSystemTimeMilliseconds())); + + // 1 block should be read even without calling Advance(), since we read the + // posting list and put bucket into the priority queue in ctor. + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/1, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/1)); + + // 1st Advance(). + ICING_ASSERT_OK(iter->Advance()); + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/2, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/1)); + + // 2nd Advance(). + ICING_ASSERT_OK(iter->Advance()); + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/3, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/1)); + + // 3rd Advance(). 
+ ICING_ASSERT_OK(iter->Advance()); + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/4, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/1)); + + // 4th Advance(). + ICING_ASSERT_OK(iter->Advance()); + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/4, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/1)); + + // 5th Advance(). + ASSERT_THAT(iter->Advance(), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/4, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/1)); +} + +TEST_P(IntegerIndexTest, IteratorCallStatsNonExistingProperty) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); + + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, + kDefaultSectionId, /*keys=*/{1}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1, + kDefaultSectionId, /*keys=*/{3}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2, + kDefaultSectionId, /*keys=*/{2}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3, + kDefaultSectionId, /*keys=*/{0}); + + // GetIterator for property "otherProperty1". 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iter, + integer_index->GetIterator( + "otherProperty1", /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max(), *doc_store_, + *schema_store_, clock_.GetSystemTimeMilliseconds())); + + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/0)); + + // 1st Advance(). + ASSERT_THAT(iter->Advance(), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + EXPECT_THAT(iter->GetCallStats(), + EqualsDocHitInfoIteratorCallStats( + /*num_leaf_advance_calls_lite_index=*/0, + /*num_leaf_advance_calls_main_index=*/0, + /*num_leaf_advance_calls_integer_index=*/0, + /*num_leaf_advance_calls_no_index=*/0, + /*num_blocks_inspected=*/0)); +} + +INSTANTIATE_TEST_SUITE_P( + IntegerIndexTest, IntegerIndexTest, + testing::Values( + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/341, + /*pre_mapping_fbv_in=*/false), + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/341, + /*pre_mapping_fbv_in=*/true), + + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/16384, + /*pre_mapping_fbv_in=*/false), + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/32768, + /*pre_mapping_fbv_in=*/false), + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/65536, + /*pre_mapping_fbv_in=*/false))); } // namespace diff --git a/icing/index/numeric/numeric-index.h b/icing/index/numeric/numeric-index.h index 24b81e7..d094d3d 100644 --- a/icing/index/numeric/numeric-index.h +++ b/icing/index/numeric/numeric-index.h @@ -15,6 +15,7 @@ #ifndef ICING_INDEX_NUMERIC_NUMERIC_INDEX_H_ #define ICING_INDEX_NUMERIC_NUMERIC_INDEX_H_ +#include <cstdint> #include <memory> #include <string> #include <string_view> 
@@ -100,6 +101,10 @@ class NumericIndex : public PersistentStorage { virtual DocHitInfo GetDocHitInfo() const = 0; + virtual int32_t GetNumAdvanceCalls() const = 0; + + virtual int32_t GetNumBlocksInspected() const = 0; + protected: T key_lower_; T key_upper_; @@ -177,15 +182,17 @@ class NumericIndex : public PersistentStorage { : PersistentStorage(filesystem, std::move(working_path), working_path_type) {} - virtual libtextclassifier3::Status PersistStoragesToDisk() override = 0; + virtual libtextclassifier3::Status PersistStoragesToDisk( + bool force) override = 0; - virtual libtextclassifier3::Status PersistMetadataToDisk() override = 0; + virtual libtextclassifier3::Status PersistMetadataToDisk( + bool force) override = 0; - virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() - override = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum( + bool force) override = 0; - virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() - override = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override = 0; virtual Crcs& crcs() override = 0; virtual const Crcs& crcs() const override = 0; diff --git a/icing/index/numeric/posting-list-integer-index-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h index f0d3d25..4f667a0 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor.h +++ b/icing/index/numeric/posting-list-integer-index-accessor.h @@ -100,16 +100,6 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { // posting list. libtextclassifier3::Status PrependData(const IntegerIndexData& data); - bool WantsSplit() const { - const PostingListUsed* current_pl = - preexisting_posting_list_ != nullptr - ? &preexisting_posting_list_->posting_list - : &in_memory_posting_list_; - // Only max-sized PLs get split. Smaller PLs just get copied to larger PLs. 
- return current_pl->size_in_bytes() == storage_->max_posting_list_bytes() && - serializer_->IsFull(current_pl); - } - private: explicit PostingListIntegerIndexAccessor( FlashIndexStorage* storage, PostingListUsed in_memory_posting_list, diff --git a/icing/index/numeric/posting-list-integer-index-serializer.cc b/icing/index/numeric/posting-list-integer-index-serializer.cc index 6556451..99f14f9 100644 --- a/icing/index/numeric/posting-list-integer-index-serializer.cc +++ b/icing/index/numeric/posting-list-integer-index-serializer.cc @@ -222,7 +222,8 @@ libtextclassifier3::Status PostingListIntegerIndexSerializer::PrependData( } } -uint32_t PostingListIntegerIndexSerializer::PrependDataArray( +libtextclassifier3::StatusOr<uint32_t> +PostingListIntegerIndexSerializer::PrependDataArray( PostingListUsed* posting_list_used, const IntegerIndexData* array, uint32_t num_data, bool keep_prepended) const { if (!IsPostingListValid(posting_list_used)) { @@ -240,7 +241,7 @@ uint32_t PostingListIntegerIndexSerializer::PrependDataArray( // before. PopFrontData guarantees that it will remove all 'i' data so long // as there are at least 'i' data in the posting list, which we know there // are. - PopFrontData(posting_list_used, /*num_data=*/i); + ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i)); return 0; } return i; @@ -335,7 +336,7 @@ libtextclassifier3::Status PostingListIntegerIndexSerializer::PopFrontData( // - out[1] is a valid data less than all previous data in the posting list. // - There's no way that the posting list could run out of room because it // previously stored these 2 data. 
- PrependData(posting_list_used, out[1]); + ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1])); } else if (num_data > 0) { return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true, /*out=*/nullptr); diff --git a/icing/index/numeric/posting-list-integer-index-serializer.h b/icing/index/numeric/posting-list-integer-index-serializer.h index ea2f2da..cbaed33 100644 --- a/icing/index/numeric/posting-list-integer-index-serializer.h +++ b/icing/index/numeric/posting-list-integer-index-serializer.h @@ -79,9 +79,9 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer { // RETURNS: // The number of data that have been prepended to the posting list. If // keep_prepended is false and reverted, then it returns 0. - uint32_t PrependDataArray(PostingListUsed* posting_list_used, - const IntegerIndexData* array, uint32_t num_data, - bool keep_prepended) const; + libtextclassifier3::StatusOr<uint32_t> PrependDataArray( + PostingListUsed* posting_list_used, const IntegerIndexData* array, + uint32_t num_data, bool keep_prepended) const; // Retrieves all data stored in the posting list. 
// diff --git a/icing/index/numeric/posting-list-integer-index-serializer_test.cc b/icing/index/numeric/posting-list-integer-index-serializer_test.cc index bfb4e71..716d1aa 100644 --- a/icing/index/numeric/posting-list-integer-index-serializer_test.cc +++ b/icing/index/numeric/posting-list-integer-index-serializer_test.cc @@ -241,7 +241,7 @@ TEST(PostingListIntegerIndexSerializerTest, EXPECT_THAT( serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), /*keep_prepended=*/false), - Eq(data_in.size())); + IsOkAndHolds(data_in.size())); std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(data_pushed.size() * sizeof(IntegerIndexData))); @@ -258,7 +258,7 @@ TEST(PostingListIntegerIndexSerializerTest, EXPECT_THAT( serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), /*keep_prepended=*/false), - Eq(data_in.size())); + IsOkAndHolds(data_in.size())); std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(data_pushed.size() * sizeof(IntegerIndexData))); @@ -276,7 +276,7 @@ TEST(PostingListIntegerIndexSerializerTest, EXPECT_THAT( serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), /*keep_prepended=*/false), - Eq(0)); + IsOkAndHolds(0)); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(data_pushed.size() * sizeof(IntegerIndexData))); EXPECT_THAT( @@ -288,7 +288,7 @@ TEST(PostingListIntegerIndexSerializerTest, EXPECT_THAT( serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), /*keep_prepended=*/false), - Eq(data_in.size())); + IsOkAndHolds(data_in.size())); std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(data_pushed.size() * sizeof(IntegerIndexData))); @@ -319,7 +319,7 @@ TEST(PostingListIntegerIndexSerializerTest, PrependDataArrayKeepPrepended) { EXPECT_THAT( 
serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), /*keep_prepended=*/true), - Eq(data_in.size())); + IsOkAndHolds(data_in.size())); std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(data_pushed.size() * sizeof(IntegerIndexData))); @@ -341,7 +341,7 @@ TEST(PostingListIntegerIndexSerializerTest, PrependDataArrayKeepPrepended) { EXPECT_THAT( serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), /*keep_prepended=*/true), - Eq(3)); + IsOkAndHolds(3)); data_in.resize(3); std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); EXPECT_THAT(serializer.GetBytesUsed(&pl_used), @@ -365,7 +365,7 @@ TEST(PostingListIntegerIndexSerializerTest, MoveFrom) { ASSERT_THAT( serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(), /*keep_prepended=*/false), - Eq(data_arr1.size())); + IsOkAndHolds(data_arr1.size())); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used2, @@ -378,7 +378,7 @@ TEST(PostingListIntegerIndexSerializerTest, MoveFrom) { ASSERT_THAT( serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(), /*keep_prepended=*/false), - Eq(data_arr2.size())); + IsOkAndHolds(data_arr2.size())); EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1), IsOk()); @@ -402,7 +402,7 @@ TEST(PostingListIntegerIndexSerializerTest, ASSERT_THAT( serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(), /*keep_prepended=*/false), - Eq(data_arr.size())); + IsOkAndHolds(data_arr.size())); EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used, /*src=*/nullptr), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); @@ -433,7 +433,7 @@ TEST(PostingListIntegerIndexSerializerTest, MoveToPostingListTooSmall) { ASSERT_THAT( serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(), /*keep_prepended=*/false), - Eq(data_arr1.size())); + IsOkAndHolds(data_arr1.size())); int size2 = 
serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( @@ -444,7 +444,7 @@ TEST(PostingListIntegerIndexSerializerTest, MoveToPostingListTooSmall) { ASSERT_THAT( serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(), /*keep_prepended=*/false), - Eq(data_arr2.size())); + IsOkAndHolds(data_arr2.size())); EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); @@ -471,7 +471,7 @@ TEST(PostingListIntegerIndexSerializerTest, PopFrontData) { ASSERT_THAT( serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(), /*keep_prepended=*/false), - Eq(data_arr.size())); + IsOkAndHolds(data_arr.size())); ASSERT_THAT( serializer.GetData(&pl_used), IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend()))); diff --git a/icing/index/property-existence-indexing-handler.cc b/icing/index/property-existence-indexing-handler.cc new file mode 100644 index 0000000..504f380 --- /dev/null +++ b/icing/index/property-existence-indexing-handler.cc @@ -0,0 +1,127 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/property-existence-indexing-handler.h" + +#include <memory> +#include <string> +#include <unordered_set> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/index/index.h" +#include "icing/proto/logging.pb.h" +#include "icing/store/document-id.h" +#include "icing/util/clock.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" +#include "icing/util/tokenized-document.h" + +namespace icing { +namespace lib { + +namespace { + +void ConstructPropertyExistenceMetaToken( + const std::string& current_path, const DocumentProto& document, + std::unordered_set<std::string>& meta_tokens) { + for (const PropertyProto& property : document.properties()) { + std::string new_path = current_path; + if (!new_path.empty()) { + new_path.append("."); + } + new_path.append(property.name()); + for (const DocumentProto& nested_document : property.document_values()) { + ConstructPropertyExistenceMetaToken(new_path, nested_document, + meta_tokens); + } + // A string property exists if and only if there is at least one non-empty + // string in the property. 
+ bool has_string_value = false; + for (const std::string& string_value : property.string_values()) { + if (!string_value.empty()) { + has_string_value = true; + break; + } + } + if (has_string_value || property.int64_values_size() > 0 || + property.double_values_size() > 0 || + property.boolean_values_size() > 0 || + property.bytes_values_size() > 0 || + property.document_values_size() > 0) { + meta_tokens.insert( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, new_path)); + } + } +} + +} // namespace + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<PropertyExistenceIndexingHandler>> +PropertyExistenceIndexingHandler::Create(const Clock* clock, Index* index) { + ICING_RETURN_ERROR_IF_NULL(clock); + ICING_RETURN_ERROR_IF_NULL(index); + + return std::unique_ptr<PropertyExistenceIndexingHandler>( + new PropertyExistenceIndexingHandler(*clock, index)); +} + +libtextclassifier3::Status PropertyExistenceIndexingHandler::Handle( + const TokenizedDocument& tokenized_document, DocumentId document_id, + PutDocumentStatsProto* put_document_stats) { + std::unique_ptr<Timer> index_timer = clock_.GetNewTimer(); + + libtextclassifier3::Status status; + // Section id is irrelevant to metadata tokens that is used to support + // property existence check. + Index::Editor editor = + index_.Edit(document_id, /*section_id=*/0, TermMatchType::EXACT_ONLY, + /*namespace_id=*/0); + std::unordered_set<std::string> meta_tokens; + ConstructPropertyExistenceMetaToken( + /*current_path=*/"", tokenized_document.document(), meta_tokens); + for (const std::string& meta_token : meta_tokens) { + status = editor.BufferTerm(meta_token.c_str()); + if (!status.ok()) { + // We've encountered a failure. Bail out. We'll mark this doc as deleted + // and signal a failure to the client. 
+ ICING_LOG(WARNING) << "Failed to buffer term in lite lexicon due to: " + << status.error_message(); + break; + } + } + + if (status.ok()) { + // Add all the metadata tokens to support property existence check. + status = editor.IndexAllBufferedTerms(); + if (!status.ok()) { + ICING_LOG(WARNING) << "Failed to add hits in lite index due to: " + << status.error_message(); + } + } + + if (put_document_stats != nullptr) { + put_document_stats->set_metadata_term_index_latency_ms( + index_timer->GetElapsedMilliseconds()); + put_document_stats->mutable_tokenization_stats() + ->set_num_metadata_tokens_indexed(meta_tokens.size()); + } + + return status; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/property-existence-indexing-handler.h b/icing/index/property-existence-indexing-handler.h new file mode 100644 index 0000000..55c0bb4 --- /dev/null +++ b/icing/index/property-existence-indexing-handler.h @@ -0,0 +1,86 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_PROPERTY_EXISTENCE_INDEXING_HANDLER_H_ +#define ICING_INDEX_PROPERTY_EXISTENCE_INDEXING_HANDLER_H_ + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/index.h" +#include "icing/proto/logging.pb.h" +#include "icing/store/document-id.h" +#include "icing/util/clock.h" +#include "icing/util/tokenized-document.h" + +namespace icing { +namespace lib { + +inline constexpr std::string_view kPropertyExistenceTokenPrefix = + "\xFF_HAS_\xFF"; + +// This class is meant to be owned by TermIndexingHandler. Instead of using this +// handler directly, callers should use TermIndexingHandler to index documents. +// +// This handler will not check or set last_added_document_id of the index, and +// it will not merge or sort the lite index either. +class PropertyExistenceIndexingHandler { + public: + // Creates a PropertyExistenceIndexingHandler instance which does not take + // ownership of any input components. All pointers must refer to valid objects + // that outlive the created PropertyExistenceIndexingHandler instance. + // + // Returns: + // - A PropertyExistenceIndexingHandler instance on success + // - FAILED_PRECONDITION_ERROR if any of the input pointer is null + static libtextclassifier3::StatusOr< + std::unique_ptr<PropertyExistenceIndexingHandler>> + Create(const Clock* clock, Index* index); + + ~PropertyExistenceIndexingHandler() = default; + + // Handles the property existence indexing process: add hits for metadata + // tokens used to index property existence. + // + // For example, if the passed in document has string properties "propA", + // "propB" and "propC.propD", and document property "propC", this handler will + // add the following metadata token to the index. 
+ // - kPropertyExistenceTokenPrefix + "propA" + // - kPropertyExistenceTokenPrefix + "propB" + // - kPropertyExistenceTokenPrefix + "propC" + // - kPropertyExistenceTokenPrefix + "propC.propD" + // + /// Returns: + // - OK on success + // - RESOURCE_EXHAUSTED_ERROR if the index is full and can't add anymore + // content. + // - INTERNAL_ERROR if any other errors occur. + libtextclassifier3::Status Handle(const TokenizedDocument& tokenized_document, + DocumentId document_id, + PutDocumentStatsProto* put_document_stats); + + private: + explicit PropertyExistenceIndexingHandler(const Clock& clock, Index* index) + : clock_(clock), index_(*index) {} + + const Clock& clock_; // Does not own. + Index& index_; // Does not own. +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_PROPERTY_EXISTENCE_INDEXING_HANDLER_H_ diff --git a/icing/index/property-existence-indexing-handler_test.cc b/icing/index/property-existence-indexing-handler_test.cc new file mode 100644 index 0000000..e42fbc3 --- /dev/null +++ b/icing/index/property-existence-indexing-handler_test.cc @@ -0,0 +1,524 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/property-existence-indexing-handler.h" + +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/index.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/portable/platform.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/document_wrapper.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/test-data.h" +#include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" +#include "icing/util/tokenized-document.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::IsTrue; +using ::testing::Test; + +static constexpr std::string_view kTreeType = "TreeNode"; +static constexpr std::string_view kPropertyName = "name"; +static constexpr std::string_view kPropertyValue = "value"; +static constexpr std::string_view kPropertySubtrees = "subtrees"; + +static constexpr 
std::string_view kValueType = "Value"; +static constexpr std::string_view kPropertyBody = "body"; +static constexpr std::string_view kPropertyTimestamp = "timestamp"; +static constexpr std::string_view kPropertyScore = "score"; + +class PropertyExistenceIndexingHandlerTest : public Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. + icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + base_dir_ = GetTestTempDir() + "/icing_test"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + index_dir_ = base_dir_ + "/index"; + schema_store_dir_ = base_dir_ + "/schema_store"; + document_store_dir_ = base_dir_ + "/document_store"; + + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); + + ICING_ASSERT_OK_AND_ASSIGN( + normalizer_, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int32_t>::max())); + + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kTreeType) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyValue) + .SetDataTypeDocument( + kValueType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertySubtrees) + .SetDataTypeDocument( + kTreeType, /*index_nested_properties=*/false) + 
.SetCardinality(CARDINALITY_REPEATED))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kValueType) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyBody) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyTimestamp) + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyScore) + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/true)); + + ASSERT_TRUE( + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + document_store_ = std::move(doc_store_create_result.document_store); + } + + void TearDown() override { + document_store_.reset(); + schema_store_.reset(); + normalizer_.reset(); + lang_segmenter_.reset(); + + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + FakeClock fake_clock_; + std::string base_dir_; + std::string index_dir_; + std::string schema_store_dir_; + std::string document_store_dir_; + + std::unique_ptr<LanguageSegmenter> lang_segmenter_; + std::unique_ptr<Normalizer> normalizer_; + std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> document_store_; +}; + +libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 
+QueryExistence(Index* index, std::string_view property_path) { + return index->GetIterator( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path), + /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY, + /*need_hit_term_frequency=*/false); +} + +std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { + std::vector<DocHitInfo> infos; + while (iterator->Advance().ok()) { + infos.push_back(iterator->doc_hit_info()); + } + return infos; +} + +TEST_F(PropertyExistenceIndexingHandlerTest, HandlePropertyExistence) { + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Create a document with every property. + DocumentProto document0 = + DocumentBuilder() + .SetKey("icing", "uri0") + .SetSchema(std::string(kValueType)) + .AddStringProperty(std::string(kPropertyBody), "foo") + .AddInt64Property(std::string(kPropertyTimestamp), 123) + .AddDoubleProperty(std::string(kPropertyScore), 456.789) + .Build(); + // Create a document with missing body. + DocumentProto document1 = + DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema(std::string(kValueType)) + .AddInt64Property(std::string(kPropertyTimestamp), 123) + .AddDoubleProperty(std::string(kPropertyScore), 456.789) + .Build(); + // Create a document with missing timestamp. 
+ DocumentProto document2 = + DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema(std::string(kValueType)) + .AddStringProperty(std::string(kPropertyBody), "foo") + .AddDoubleProperty(std::string(kPropertyScore), 456.789) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document0, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document0))); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document1, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document1))); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document2, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id0, + document_store_->Put(tokenized_document0.document())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store_->Put(tokenized_document1.document())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + document_store_->Put(tokenized_document2.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PropertyExistenceIndexingHandler> handler, + PropertyExistenceIndexingHandler::Create(&fake_clock_, index.get())); + + // Handle all docs + EXPECT_THAT(handler->Handle(tokenized_document0, document_id0, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document1, document_id1, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document2, document_id2, + /*put_document_stats=*/nullptr), + IsOk()); + + // Get all documents that have "body". + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + QueryExistence(index.get(), kPropertyBody)); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{0}), + EqualsDocHitInfo(document_id0, std::vector<SectionId>{0}))); + + // Get all documents that have "timestamp". 
+ ICING_ASSERT_OK_AND_ASSIGN(itr, + QueryExistence(index.get(), kPropertyTimestamp)); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id1, std::vector<SectionId>{0}), + EqualsDocHitInfo(document_id0, std::vector<SectionId>{0}))); + + // Get all documents that have "score". + ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyScore)); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{0}), + EqualsDocHitInfo(document_id1, std::vector<SectionId>{0}), + EqualsDocHitInfo(document_id0, std::vector<SectionId>{0}))); +} + +TEST_F(PropertyExistenceIndexingHandlerTest, HandleNestedPropertyExistence) { + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Create a complex nested root_document with the following property paths. 
+ // - name + // - subtrees + // - subtrees.name + // - subtrees.value + // - subtrees.value.timestamp + // - subtrees.subtrees + // - subtrees.subtrees.name + // - subtrees.subtrees.value + // - subtrees.subtrees.value.body + // - subtrees.subtrees.value.score + DocumentProto leaf_document = + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema(std::string(kTreeType)) + .AddStringProperty(std::string(kPropertyName), "leaf") + .AddDocumentProperty( + std::string(kPropertyValue), + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema(std::string(kValueType)) + .AddStringProperty(std::string(kPropertyBody), "foo") + .AddDoubleProperty(std::string(kPropertyScore), 456.789) + .Build()) + .Build(); + DocumentProto intermediate_document1 = + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema(std::string(kTreeType)) + .AddStringProperty(std::string(kPropertyName), "intermediate1") + .AddDocumentProperty( + std::string(kPropertyValue), + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema(std::string(kValueType)) + .AddInt64Property(std::string(kPropertyTimestamp), 123) + .Build()) + .AddDocumentProperty(std::string(kPropertySubtrees), leaf_document) + .Build(); + DocumentProto intermediate_document2 = + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema(std::string(kTreeType)) + .AddStringProperty(std::string(kPropertyName), "intermediate2") + .Build(); + DocumentProto root_document = + DocumentBuilder() + .SetKey("icing", "uri") + .SetSchema(std::string(kTreeType)) + .AddStringProperty(std::string(kPropertyName), "root") + .AddDocumentProperty(std::string(kPropertySubtrees), + intermediate_document1, intermediate_document2) + .Build(); + + // Handle root_document + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_root_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(root_document))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id, + 
document_store_->Put(tokenized_root_document.document())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PropertyExistenceIndexingHandler> handler, + PropertyExistenceIndexingHandler::Create(&fake_clock_, index.get())); + EXPECT_THAT(handler->Handle(tokenized_root_document, document_id, + /*put_document_stats=*/nullptr), + IsOk()); + + // Check that the above property paths can be found by query. + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + QueryExistence(index.get(), "name")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), "subtrees")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), "subtrees.name")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN(itr, + QueryExistence(index.get(), "subtrees.value")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, QueryExistence(index.get(), "subtrees.value.timestamp")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN(itr, + QueryExistence(index.get(), "subtrees.subtrees")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, QueryExistence(index.get(), "subtrees.subtrees.name")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, QueryExistence(index.get(), "subtrees.subtrees.value")); + EXPECT_THAT( + GetHits(std::move(itr)), + 
ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, QueryExistence(index.get(), "subtrees.subtrees.value.body")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, QueryExistence(index.get(), "subtrees.subtrees.value.score")); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); +} + +TEST_F(PropertyExistenceIndexingHandlerTest, SingleEmptyStringIsNonExisting) { + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Create a document with one empty body. + DocumentProto document0 = + DocumentBuilder() + .SetKey("icing", "uri0") + .SetSchema(std::string(kValueType)) + .AddStringProperty(std::string(kPropertyBody), "") + .Build(); + // Create a document with two empty body. + DocumentProto document1 = + DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema(std::string(kValueType)) + .AddStringProperty(std::string(kPropertyBody), "", "") + .Build(); + // Create a document with one non-empty body. 
+ DocumentProto document2 = + DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema(std::string(kValueType)) + .AddStringProperty(std::string(kPropertyBody), "foo") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document0, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document0))); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document1, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document1))); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document2, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id0, + document_store_->Put(tokenized_document0.document())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store_->Put(tokenized_document1.document())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + document_store_->Put(tokenized_document2.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PropertyExistenceIndexingHandler> handler, + PropertyExistenceIndexingHandler::Create(&fake_clock_, index.get())); + + // Handle all docs + EXPECT_THAT(handler->Handle(tokenized_document0, document_id0, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document1, document_id1, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document2, document_id2, + /*put_document_stats=*/nullptr), + IsOk()); + + // Check that the documents that have one or two empty bodies will not be + // considered as having a body property. 
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + QueryExistence(index.get(), kPropertyBody)); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{0}))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/string-section-indexing-handler.cc b/icing/index/string-section-indexing-handler.cc index 69b8889..8b20d04 100644 --- a/icing/index/string-section-indexing-handler.cc +++ b/icing/index/string-section-indexing-handler.cc @@ -21,15 +21,12 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/absl_ports/canonical_errors.h" #include "icing/index/index.h" -#include "icing/legacy/core/icing-string-util.h" #include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/transform/normalizer.h" -#include "icing/util/clock.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" @@ -39,35 +36,18 @@ namespace lib { /* static */ libtextclassifier3::StatusOr< std::unique_ptr<StringSectionIndexingHandler>> -StringSectionIndexingHandler::Create(const Clock* clock, - const Normalizer* normalizer, +StringSectionIndexingHandler::Create(const Normalizer* normalizer, Index* index) { - ICING_RETURN_ERROR_IF_NULL(clock); ICING_RETURN_ERROR_IF_NULL(normalizer); ICING_RETURN_ERROR_IF_NULL(index); return std::unique_ptr<StringSectionIndexingHandler>( - new StringSectionIndexingHandler(clock, normalizer, index)); + new StringSectionIndexingHandler(normalizer, index)); } libtextclassifier3::Status StringSectionIndexingHandler::Handle( const TokenizedDocument& tokenized_document, DocumentId document_id, - bool recovery_mode, PutDocumentStatsProto* put_document_stats) { - std::unique_ptr<Timer> index_timer = clock_.GetNewTimer(); - - if 
(index_.last_added_document_id() != kInvalidDocumentId && - document_id <= index_.last_added_document_id()) { - if (recovery_mode) { - // Skip the document if document_id <= last_added_document_id in recovery - // mode without returning an error. - return libtextclassifier3::Status::OK; - } - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "DocumentId %d must be greater than last added document_id %d", - document_id, index_.last_added_document_id())); - } - index_.set_last_added_document_id(document_id); - + PutDocumentStatsProto* put_document_stats) { uint32_t num_tokens = 0; libtextclassifier3::Status status; for (const TokenizedSection& section : @@ -123,41 +103,10 @@ libtextclassifier3::Status StringSectionIndexingHandler::Handle( } if (put_document_stats != nullptr) { - put_document_stats->set_term_index_latency_ms( - index_timer->GetElapsedMilliseconds()); put_document_stats->mutable_tokenization_stats()->set_num_tokens_indexed( num_tokens); } - // If we're either successful or we've hit resource exhausted, then attempt a - // merge. - if ((status.ok() || absl_ports::IsResourceExhausted(status)) && - index_.WantsMerge()) { - ICING_LOG(INFO) << "Merging the index at docid " << document_id << "."; - - std::unique_ptr<Timer> merge_timer = clock_.GetNewTimer(); - libtextclassifier3::Status merge_status = index_.Merge(); - - if (!merge_status.ok()) { - ICING_LOG(ERROR) << "Index merging failed. Clearing index."; - if (!index_.Reset().ok()) { - return absl_ports::InternalError(IcingStringUtil::StringPrintf( - "Unable to reset to clear index after merge failure. Merge " - "failure=%d:%s", - merge_status.error_code(), merge_status.error_message().c_str())); - } else { - return absl_ports::DataLossError(IcingStringUtil::StringPrintf( - "Forced to reset index after merge failure. 
Merge failure=%d:%s", - merge_status.error_code(), merge_status.error_message().c_str())); - } - } - - if (put_document_stats != nullptr) { - put_document_stats->set_index_merge_latency_ms( - merge_timer->GetElapsedMilliseconds()); - } - } - return status; } diff --git a/icing/index/string-section-indexing-handler.h b/icing/index/string-section-indexing-handler.h index 6abfba5..8452e9f 100644 --- a/icing/index/string-section-indexing-handler.h +++ b/icing/index/string-section-indexing-handler.h @@ -19,18 +19,21 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/index/data-indexing-handler.h" #include "icing/index/index.h" #include "icing/proto/logging.pb.h" #include "icing/store/document-id.h" #include "icing/transform/normalizer.h" -#include "icing/util/clock.h" #include "icing/util/tokenized-document.h" namespace icing { namespace lib { -class StringSectionIndexingHandler : public DataIndexingHandler { +// This class is meant to be owned by TermIndexingHandler. Instead of using this +// handler directly, callers should use TermIndexingHandler to index documents. +// +// This handler will not check or set last_added_document_id of the index, and +// it will not merge or sort the lite index either. +class StringSectionIndexingHandler { public: // Creates a StringSectionIndexingHandler instance which does not take // ownership of any input components. 
All pointers must refer to valid objects @@ -41,9 +44,9 @@ class StringSectionIndexingHandler : public DataIndexingHandler { // - FAILED_PRECONDITION_ERROR if any of the input pointer is null static libtextclassifier3::StatusOr< std::unique_ptr<StringSectionIndexingHandler>> - Create(const Clock* clock, const Normalizer* normalizer, Index* index); + Create(const Normalizer* normalizer, Index* index); - ~StringSectionIndexingHandler() override = default; + ~StringSectionIndexingHandler() = default; // Handles the string term indexing process: add hits into the lite index for // all contents in tokenized_document.tokenized_string_sections and merge lite @@ -51,23 +54,18 @@ class StringSectionIndexingHandler : public DataIndexingHandler { // /// Returns: // - OK on success - // - INVALID_ARGUMENT_ERROR if document_id is less than or equal to the - // document_id of a previously indexed document in non recovery mode. // - RESOURCE_EXHAUSTED_ERROR if the index is full and can't add anymore // content. - // - DATA_LOSS_ERROR if an attempt to merge the index fails and both indices - // are cleared as a result. // - INTERNAL_ERROR if any other errors occur. // - Any main/lite index errors. - libtextclassifier3::Status Handle( - const TokenizedDocument& tokenized_document, DocumentId document_id, - bool recovery_mode, PutDocumentStatsProto* put_document_stats) override; + libtextclassifier3::Status Handle(const TokenizedDocument& tokenized_document, + DocumentId document_id, + PutDocumentStatsProto* put_document_stats); private: - explicit StringSectionIndexingHandler(const Clock* clock, - const Normalizer* normalizer, + explicit StringSectionIndexingHandler(const Normalizer* normalizer, Index* index) - : DataIndexingHandler(clock), normalizer_(*normalizer), index_(*index) {} + : normalizer_(*normalizer), index_(*index) {} const Normalizer& normalizer_; // Does not own. Index& index_; // Does not own. 
diff --git a/icing/index/term-indexing-handler.cc b/icing/index/term-indexing-handler.cc new file mode 100644 index 0000000..7eb9dda --- /dev/null +++ b/icing/index/term-indexing-handler.cc @@ -0,0 +1,146 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/term-indexing-handler.h" + +#include <memory> +#include <string> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/index/index.h" +#include "icing/index/property-existence-indexing-handler.h" +#include "icing/index/string-section-indexing-handler.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/proto/logging.pb.h" +#include "icing/store/document-id.h" +#include "icing/transform/normalizer.h" +#include "icing/util/clock.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" +#include "icing/util/tokenized-document.h" + +namespace icing { +namespace lib { + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<TermIndexingHandler>> +TermIndexingHandler::Create(const Clock* clock, const Normalizer* normalizer, + Index* index, + bool build_property_existence_metadata_hits) { + ICING_RETURN_ERROR_IF_NULL(clock); + ICING_RETURN_ERROR_IF_NULL(normalizer); + ICING_RETURN_ERROR_IF_NULL(index); + + // Property existence index handler + 
std::unique_ptr<PropertyExistenceIndexingHandler> + property_existence_indexing_handler = nullptr; + if (build_property_existence_metadata_hits) { + ICING_ASSIGN_OR_RETURN( + property_existence_indexing_handler, + PropertyExistenceIndexingHandler::Create(clock, index)); + } + // String section index handler + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create(normalizer, index)); + + return std::unique_ptr<TermIndexingHandler>(new TermIndexingHandler( + clock, index, std::move(property_existence_indexing_handler), + std::move(string_section_indexing_handler))); +} + +libtextclassifier3::Status TermIndexingHandler::Handle( + const TokenizedDocument& tokenized_document, DocumentId document_id, + bool recovery_mode, PutDocumentStatsProto* put_document_stats) { + std::unique_ptr<Timer> index_timer = clock_.GetNewTimer(); + + if (index_.last_added_document_id() != kInvalidDocumentId && + document_id <= index_.last_added_document_id()) { + if (recovery_mode) { + // Skip the document if document_id <= last_added_document_id in recovery + // mode without returning an error. 
+ return libtextclassifier3::Status::OK; + } + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "DocumentId %d must be greater than last added document_id %d", + document_id, index_.last_added_document_id())); + } + index_.set_last_added_document_id(document_id); + + libtextclassifier3::Status status = libtextclassifier3::Status::OK; + if (property_existence_indexing_handler_ != nullptr) { + status = property_existence_indexing_handler_->Handle( + tokenized_document, document_id, put_document_stats); + } + if (status.ok()) { + status = string_section_indexing_handler_->Handle( + tokenized_document, document_id, put_document_stats); + } + + if (put_document_stats != nullptr) { + put_document_stats->set_term_index_latency_ms( + index_timer->GetElapsedMilliseconds()); + } + + // Check if we should merge when we're either successful or we've hit resource + // exhausted. + bool should_merge = + (status.ok() || absl_ports::IsResourceExhausted(status)) && + index_.WantsMerge(); + + // Check and sort the LiteIndex HitBuffer if we don't need to merge. + if (!should_merge && index_.LiteIndexNeedSort()) { + std::unique_ptr<Timer> sort_timer = clock_.GetNewTimer(); + index_.SortLiteIndex(); + + if (put_document_stats != nullptr) { + put_document_stats->set_lite_index_sort_latency_ms( + sort_timer->GetElapsedMilliseconds()); + } + } + + // Attempt index merge if needed. + if (should_merge) { + ICING_LOG(INFO) << "Merging the index at docid " << document_id << "."; + + std::unique_ptr<Timer> merge_timer = clock_.GetNewTimer(); + libtextclassifier3::Status merge_status = index_.Merge(); + + if (!merge_status.ok()) { + ICING_LOG(ERROR) << "Index merging failed. Clearing index."; + if (!index_.Reset().ok()) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Unable to reset to clear index after merge failure. 
Merge " + "failure=%d:%s", + merge_status.error_code(), merge_status.error_message().c_str())); + } else { + return absl_ports::DataLossError(IcingStringUtil::StringPrintf( + "Forced to reset index after merge failure. Merge failure=%d:%s", + merge_status.error_code(), merge_status.error_message().c_str())); + } + } + + if (put_document_stats != nullptr) { + put_document_stats->set_index_merge_latency_ms( + merge_timer->GetElapsedMilliseconds()); + } + } + return status; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/term-indexing-handler.h b/icing/index/term-indexing-handler.h new file mode 100644 index 0000000..c055bbf --- /dev/null +++ b/icing/index/term-indexing-handler.h @@ -0,0 +1,97 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_TERM_INDEXING_HANDLER_H_ +#define ICING_INDEX_TERM_INDEXING_HANDLER_H_ + +#include <memory> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/data-indexing-handler.h" +#include "icing/index/index.h" +#include "icing/index/property-existence-indexing-handler.h" +#include "icing/index/string-section-indexing-handler.h" +#include "icing/proto/logging.pb.h" +#include "icing/store/document-id.h" +#include "icing/transform/normalizer.h" +#include "icing/util/clock.h" +#include "icing/util/tokenized-document.h" + +namespace icing { +namespace lib { + +class TermIndexingHandler : public DataIndexingHandler { + public: + // Creates a TermIndexingHandler instance which does not take + // ownership of any input components. All pointers must refer to valid objects + // that outlive the created TermIndexingHandler instance. + // + // Returns: + // - A TermIndexingHandler instance on success + // - FAILED_PRECONDITION_ERROR if any of the input pointer is null + static libtextclassifier3::StatusOr<std::unique_ptr<TermIndexingHandler>> + Create(const Clock* clock, const Normalizer* normalizer, Index* index, + bool build_property_existence_metadata_hits); + + ~TermIndexingHandler() override = default; + + // Handles term indexing process: + // - Checks if document_id > last_added_document_id. + // - Updates last_added_document_id to document_id. + // - Handles PropertyExistenceIndexingHandler. + // - Handles StringSectionIndexingHandler. + // - Sorts the lite index if necessary. + // - Merges the lite index into the main index if necessary. + // + /// Returns: + // - OK on success + // - INVALID_ARGUMENT_ERROR if document_id is less than or equal to the + // document_id of a previously indexed document in non recovery mode. + // - RESOURCE_EXHAUSTED_ERROR if the index is full and can't add anymore + // content. 
+ // - DATA_LOSS_ERROR if an attempt to merge the index fails and both indices + // are cleared as a result. + // - INTERNAL_ERROR if any other errors occur. + // - Any main/lite index errors. + libtextclassifier3::Status Handle( + const TokenizedDocument& tokenized_document, DocumentId document_id, + bool recovery_mode, PutDocumentStatsProto* put_document_stats) override; + + private: + explicit TermIndexingHandler(const Clock* clock, Index* index, + std::unique_ptr<PropertyExistenceIndexingHandler> + property_existence_indexing_handler, + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler) + : DataIndexingHandler(clock), + index_(*index), + property_existence_indexing_handler_( + std::move(property_existence_indexing_handler)), + string_section_indexing_handler_( + std::move(string_section_indexing_handler)) {} + + Index& index_; // Does not own. + + std::unique_ptr<PropertyExistenceIndexingHandler> + property_existence_indexing_handler_; // Nullable + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_TERM_INDEXING_HANDLER_H_ diff --git a/icing/index/term-indexing-handler_test.cc b/icing/index/term-indexing-handler_test.cc new file mode 100644 index 0000000..1b03865 --- /dev/null +++ b/icing/index/term-indexing-handler_test.cc @@ -0,0 +1,664 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/term-indexing-handler.h" + +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <string_view> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/index.h" +#include "icing/index/iterator/doc-hit-info-iterator-test-util.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/property-existence-indexing-handler.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/portable/platform.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/document_wrapper.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/test-data.h" +#include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" +#include "icing/util/tokenized-document.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using 
::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; +using ::testing::Test; + +// Schema type with indexable properties and section Id. +// Section Id is determined by the lexicographical order of indexable property +// path. +// Section id = 0: body +// Section id = 1: title +constexpr std::string_view kFakeType = "FakeType"; +constexpr std::string_view kPropertyBody = "body"; +constexpr std::string_view kPropertyTitle = "title"; + +constexpr SectionId kSectionIdBody = 0; +constexpr SectionId kSectionIdTitle = 1; + +// Schema type with nested indexable properties and section Id. +// Section id = 0: "name" +// Section id = 1: "nested.body" +// Section id = 3: "nested.title" +// Section id = 4: "subject" +constexpr std::string_view kNestedType = "NestedType"; +constexpr std::string_view kPropertyName = "name"; +constexpr std::string_view kPropertyNestedDoc = "nested"; +constexpr std::string_view kPropertySubject = "subject"; + +constexpr SectionId kSectionIdNestedBody = 1; + +class TermIndexingHandlerTest : public Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. 
+ icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + base_dir_ = GetTestTempDir() + "/icing_test"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + index_dir_ = base_dir_ + "/index"; + schema_store_dir_ = base_dir_ + "/schema_store"; + document_store_dir_ = base_dir_ + "/document_store"; + + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); + + ICING_ASSERT_OK_AND_ASSIGN( + normalizer_, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int32_t>::max())); + + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kFakeType) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyTitle) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyBody) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kNestedType) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyNestedDoc) + .SetDataTypeDocument( + kFakeType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertySubject) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, 
/*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + ASSERT_TRUE( + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + document_store_ = std::move(doc_store_create_result.document_store); + } + + void TearDown() override { + document_store_.reset(); + schema_store_.reset(); + normalizer_.reset(); + lang_segmenter_.reset(); + + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + FakeClock fake_clock_; + std::string base_dir_; + std::string index_dir_; + std::string schema_store_dir_; + std::string document_store_dir_; + + std::unique_ptr<LanguageSegmenter> lang_segmenter_; + std::unique_ptr<Normalizer> normalizer_; + std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> document_store_; +}; + +libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> +QueryExistence(Index* index, std::string_view property_path) { + return index->GetIterator( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path), + /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY, + /*need_hit_term_frequency=*/false); +} + +std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { + std::vector<DocHitInfo> infos; + while (iterator->Advance().ok()) { + infos.push_back(iterator->doc_hit_info()); + } + return infos; +} + +std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency( + 
std::unique_ptr<DocHitInfoIterator> iterator) { + std::vector<DocHitInfoTermFrequencyPair> infos; + while (iterator->Advance().ok()) { + std::vector<TermMatchInfo> matched_terms_stats; + iterator->PopulateMatchedTermsStats(&matched_terms_stats); + for (const TermMatchInfo& term_match_info : matched_terms_stats) { + infos.push_back(DocHitInfoTermFrequencyPair( + iterator->doc_hit_info(), term_match_info.term_frequencies)); + } + } + return infos; +} + +TEST_F(TermIndexingHandlerTest, HandleBothStringSectionAndPropertyExistence) { + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo") + .AddStringProperty(std::string(kPropertyBody), "") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document))); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id, + document_store_->Put(tokenized_document.document())); + + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<TermIndexingHandler> handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index.get(), + /*build_property_existence_metadata_hits=*/true)); + EXPECT_THAT( + handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + + EXPECT_THAT(index->last_added_document_id(), Eq(document_id)); + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, 
kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfoTermFrequencyPair> hits = + GetHitsWithTermFrequency(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expected_map{ + {kSectionIdTitle, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + document_id, expected_map))); + + // Query for "title" property existence. + ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyTitle)); + EXPECT_THAT( + GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0}))); + + // Query for "body" property existence. + ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyBody)); + EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); +} + +TEST_F(TermIndexingHandlerTest, + HandleIntoLiteIndex_sortInIndexingNotTriggered) { + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo") + .AddStringProperty(std::string(kPropertyBody), "foo bar baz") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document))); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id, + document_store_->Put(tokenized_document.document())); + + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<TermIndexingHandler> handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index.get(), + /*build_property_existence_metadata_hits=*/true)); + EXPECT_THAT( + handler->Handle(tokenized_document, document_id, 
/*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + + EXPECT_THAT(index->last_added_document_id(), Eq(document_id)); + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfoTermFrequencyPair> hits = + GetHitsWithTermFrequency(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expected_map{ + {kSectionIdTitle, 1}, {kSectionIdBody, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + document_id, expected_map))); + + // Query 'foo' with sectionId mask that masks all results + ICING_ASSERT_OK_AND_ASSIGN( + itr, index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, 1U << 2, + TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); +} + +TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_sortInIndexingTriggered) { + // Create the LiteIndex with a smaller sort threshold. 
At 64 bytes we sort the + // HitBuffer after inserting 8 hits + Index::Options options(index_dir_, + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document0 = + DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo foo foo") + .AddStringProperty(std::string(kPropertyBody), "foo bar baz") + .Build(); + DocumentProto document1 = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "bar baz baz") + .AddStringProperty(std::string(kPropertyBody), "foo foo baz") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("icing", "nested_type/0") + .SetSchema(std::string(kNestedType)) + .AddDocumentProperty(std::string(kPropertyNestedDoc), document1) + .AddStringProperty(std::string(kPropertyName), "qux") + .AddStringProperty(std::string(kPropertySubject), "bar bar") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document0, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document0))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id0, + document_store_->Put(tokenized_document0.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document1, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document1))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store_->Put(tokenized_document1.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document2, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + 
document_store_->Put(tokenized_document2.document())); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<TermIndexingHandler> handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index.get(), + /*build_property_existence_metadata_hits=*/true)); + + // Handle doc0 and doc1. The LiteIndex should sort and merge after adding + // these + EXPECT_THAT(handler->Handle(tokenized_document0, document_id0, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document1, document_id1, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(document_id1)); + EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse()); + + // Handle doc2. The LiteIndex should have an unsorted portion after adding + EXPECT_THAT(handler->Handle(tokenized_document2, document_id2, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(document_id2)); + + // Hits in the hit buffer: + // <term>: {(docId, sectionId, term_freq)...} + // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1); + // (1, kSectionIdBody, 2); + // (2, kSectionIdNestedBody, 2)} + // bar: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 1); + // (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)} + // baz: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1), + // (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)} + // qux: {(2, kSectionIdName, 1)} + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Advance the iterator and verify that we're returning hits in the correct + // order (i.e. 
in descending order of DocId) + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdNestedBody)); + std::vector<TermMatchInfo> matched_terms_stats; + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}}; + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map2))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdBody)); + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map1 = {{kSectionIdBody, 2}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map1))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdTitle | 1U << kSectionIdBody)); + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map0 = {{kSectionIdTitle, 3}, + {kSectionIdBody, 1}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map0))); +} + +TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_enableSortInIndexing) { + // Create the LiteIndex with a smaller sort threshold. 
At 64 bytes we sort the + // HitBuffer after inserting 8 hits + Index::Options options(index_dir_, + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/false, + /*lite_index_sort_size=*/64); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Index> index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document0 = + DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo foo foo") + .AddStringProperty(std::string(kPropertyBody), "foo bar baz") + .Build(); + DocumentProto document1 = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "bar baz baz") + .AddStringProperty(std::string(kPropertyBody), "foo foo baz") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("icing", "nested_type/0") + .SetSchema(std::string(kNestedType)) + .AddDocumentProperty(std::string(kPropertyNestedDoc), document1) + .AddStringProperty(std::string(kPropertyName), "qux") + .AddStringProperty(std::string(kPropertySubject), "bar bar") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document0, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document0))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id0, + document_store_->Put(tokenized_document0.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document1, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document1))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store_->Put(tokenized_document1.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document2, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + 
document_store_->Put(tokenized_document2.document())); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<TermIndexingHandler> handler, + TermIndexingHandler::Create( + &fake_clock_, normalizer_.get(), index.get(), + /*build_property_existence_metadata_hits=*/true)); + + // Handle all docs + EXPECT_THAT(handler->Handle(tokenized_document0, document_id0, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document1, document_id1, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document2, document_id2, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(document_id2)); + + // We've disabled sorting during indexing so the HitBuffer's unsorted section + // should exceed the sort threshold. PersistToDisk and reinitialize the + // LiteIndex with sort_at_indexing=true. + ASSERT_THAT(index->PersistToDisk(), IsOk()); + options = Index::Options(index_dir_, + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); + ICING_ASSERT_OK_AND_ASSIGN( + index, Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Verify that the HitBuffer has been sorted after initializing with + // sort_at_indexing enabled. 
+ EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse()); + + // Hits in the hit buffer: + // <term>: {(docId, sectionId, term_freq)...} + // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1); + // (1, kSectionIdBody, 2); + // (2, kSectionIdNestedBody, 2)} + // bar: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 1); + // (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)} + // baz: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1), + // (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)} + // qux: {(2, kSectionIdName, 1)} + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Advance the iterator and verify that we're returning hits in the correct + // order (i.e. in descending order of DocId) + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdNestedBody)); + std::vector<TermMatchInfo> matched_terms_stats; + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}}; + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map2))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdBody)); + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map1 = {{kSectionIdBody, 2}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map1))); + + ASSERT_THAT(itr->Advance(), IsOk()); + 
EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdTitle | 1U << kSectionIdBody)); + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map0 = {{kSectionIdTitle, 3}, + {kSectionIdBody, 1}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map0))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/jni.lds b/icing/jni.lds index 401682a..64fae36 100644 --- a/icing/jni.lds +++ b/icing/jni.lds @@ -1,7 +1,6 @@ VERS_1.0 { # Export JNI symbols. global: - Java_*; JNI_OnLoad; # Hide everything else diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index f2a33e0..a0883fa 100644 --- a/icing/jni/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -36,10 +36,6 @@ namespace { -// JNI string constants -// Matches field name of IcingSearchEngine#nativePointer. 
-const char kNativePointerField[] = "nativePointer"; - bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes, google::protobuf::MessageLite* protobuf) { icing::lib::ScopedPrimitiveArrayCritical<uint8_t> scoped_array(env, bytes); @@ -61,11 +57,14 @@ jbyteArray SerializeProtoToJniByteArray(JNIEnv* env, return ret; } +struct { + jfieldID native_pointer; +} JavaIcingSearchEngineImpl; + icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(JNIEnv* env, jobject object) { - jclass cls = env->GetObjectClass(object); - jfieldID field_id = env->GetFieldID(cls, kNativePointerField, "J"); - jlong native_pointer = env->GetLongField(object, field_id); + jlong native_pointer = + env->GetLongField(object, JavaIcingSearchEngineImpl.native_pointer); return reinterpret_cast<icing::lib::IcingSearchEngine*>(native_pointer); } @@ -73,19 +72,8 @@ icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(JNIEnv* env, extern "C" { -jint JNI_OnLoad(JavaVM* vm, void* reserved) { - JNIEnv* env; - if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6) != JNI_OK) { - ICING_LOG(icing::lib::ERROR) << "ERROR: GetEnv failed"; - return JNI_ERR; - } - - return JNI_VERSION_1_6; -} - -JNIEXPORT jlong JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeCreate( - JNIEnv* env, jclass clazz, jbyteArray icing_search_engine_options_bytes) { +jlong nativeCreate(JNIEnv* env, jclass clazz, + jbyteArray icing_search_engine_options_bytes) { icing::lib::IcingSearchEngineOptions options; if (!ParseProtoFromJniByteArray(env, icing_search_engine_options_bytes, &options)) { @@ -103,17 +91,13 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeCreate( return reinterpret_cast<jlong>(icing); } -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDestroy( - JNIEnv* env, jclass clazz, jobject object) { +void nativeDestroy(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); 
delete icing; } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeInitialize( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeInitialize(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -123,10 +107,9 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeInitialize( return SerializeProtoToJniByteArray(env, initialize_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetSchema( - JNIEnv* env, jclass clazz, jobject object, jbyteArray schema_bytes, - jboolean ignore_errors_and_delete_documents) { +jbyteArray nativeSetSchema(JNIEnv* env, jclass clazz, jobject object, + jbyteArray schema_bytes, + jboolean ignore_errors_and_delete_documents) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -143,9 +126,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetSchema( return SerializeProtoToJniByteArray(env, set_schema_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchema( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetSchema(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -154,9 +135,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchema( return SerializeProtoToJniByteArray(env, get_schema_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { +jbyteArray nativeGetSchemaType(JNIEnv* env, jclass clazz, jobject object, + jstring schema_type) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -167,9 +147,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchemaType( return 
SerializeProtoToJniByteArray(env, get_schema_type_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativePut( - JNIEnv* env, jclass clazz, jobject object, jbyteArray document_bytes) { +jbyteArray nativePut(JNIEnv* env, jclass clazz, jobject object, + jbyteArray document_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -186,10 +165,9 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativePut( return SerializeProtoToJniByteArray(env, put_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGet( - JNIEnv* env, jclass clazz, jobject object, jstring name_space, jstring uri, - jbyteArray result_spec_bytes) { +jbyteArray nativeGet(JNIEnv* env, jclass clazz, jobject object, + jstring name_space, jstring uri, + jbyteArray result_spec_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -208,9 +186,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGet( return SerializeProtoToJniByteArray(env, get_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeReportUsage( - JNIEnv* env, jclass clazz, jobject object, jbyteArray usage_report_bytes) { +jbyteArray nativeReportUsage(JNIEnv* env, jclass clazz, jobject object, + jbyteArray usage_report_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -227,9 +204,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeReportUsage( return SerializeProtoToJniByteArray(env, report_usage_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetAllNamespaces( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetAllNamespaces(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -239,10 +214,9 @@ 
Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetAllNamespaces( return SerializeProtoToJniByteArray(env, get_all_namespaces_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetNextPage( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token, - jlong java_to_native_start_timestamp_ms) { +jbyteArray nativeGetNextPage(JNIEnv* env, jclass clazz, jobject object, + jlong next_page_token, + jlong java_to_native_start_timestamp_ms) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -263,9 +237,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetNextPage( return SerializeProtoToJniByteArray(env, next_page_result_proto); } -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeInvalidateNextPageToken( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token) { +void nativeInvalidateNextPageToken(JNIEnv* env, jclass clazz, jobject object, + jlong next_page_token) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -274,11 +247,11 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeInvalidateNextPageToke return; } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearch( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jbyteArray scoring_spec_bytes, jbyteArray result_spec_bytes, - jlong java_to_native_start_timestamp_ms) { +jbyteArray nativeSearch(JNIEnv* env, jclass clazz, jobject object, + jbyteArray search_spec_bytes, + jbyteArray scoring_spec_bytes, + jbyteArray result_spec_bytes, + jlong java_to_native_start_timestamp_ms) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -321,10 +294,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearch( return SerializeProtoToJniByteArray(env, search_result_proto); } -JNIEXPORT jbyteArray JNICALL 
-Java_com_google_android_icing_IcingSearchEngineImpl_nativeDelete( - JNIEnv* env, jclass clazz, jobject object, jstring name_space, - jstring uri) { +jbyteArray nativeDelete(JNIEnv* env, jclass clazz, jobject object, + jstring name_space, jstring uri) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -336,9 +307,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDelete( return SerializeProtoToJniByteArray(env, delete_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByNamespace( - JNIEnv* env, jclass clazz, jobject object, jstring name_space) { +jbyteArray nativeDeleteByNamespace(JNIEnv* env, jclass clazz, jobject object, + jstring name_space) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -349,9 +319,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByNamespace( return SerializeProtoToJniByteArray(env, delete_by_namespace_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteBySchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { +jbyteArray nativeDeleteBySchemaType(JNIEnv* env, jclass clazz, jobject object, + jstring schema_type) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -362,10 +331,9 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteBySchemaType( return SerializeProtoToJniByteArray(env, delete_by_schema_type_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByQuery( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jboolean return_deleted_document_info) { +jbyteArray nativeDeleteByQuery(JNIEnv* env, jclass clazz, jobject object, + jbyteArray search_spec_bytes, + jboolean return_deleted_document_info) { icing::lib::IcingSearchEngine* icing = 
GetIcingSearchEnginePointer(env, object); @@ -381,9 +349,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByQuery( return SerializeProtoToJniByteArray(env, delete_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativePersistToDisk( - JNIEnv* env, jclass clazz, jobject object, jint persist_type_code) { +jbyteArray nativePersistToDisk(JNIEnv* env, jclass clazz, jobject object, + jint persist_type_code) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -400,9 +367,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativePersistToDisk( return SerializeProtoToJniByteArray(env, persist_to_disk_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeOptimize( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeOptimize(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -411,9 +376,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeOptimize( return SerializeProtoToJniByteArray(env, optimize_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetOptimizeInfo( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetOptimizeInfo(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -423,9 +386,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetOptimizeInfo( return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetStorageInfo( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetStorageInfo(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -435,9 +396,7 @@ 
Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetStorageInfo( return SerializeProtoToJniByteArray(env, storage_info_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeReset( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeReset(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -446,10 +405,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeReset( return SerializeProtoToJniByteArray(env, reset_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearchSuggestions( - JNIEnv* env, jclass clazz, jobject object, - jbyteArray suggestion_spec_bytes) { +jbyteArray nativeSearchSuggestions(JNIEnv* env, jclass clazz, jobject object, + jbyteArray suggestion_spec_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -466,9 +423,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearchSuggestions( return SerializeProtoToJniByteArray(env, suggestionResponse); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetDebugInfo( - JNIEnv* env, jclass clazz, jobject object, jint verbosity) { +jbyteArray nativeGetDebugInfo(JNIEnv* env, jclass clazz, jobject object, + jint verbosity) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -485,9 +441,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetDebugInfo( return SerializeProtoToJniByteArray(env, debug_info_result_proto); } -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeShouldLog( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { +jboolean nativeShouldLog(JNIEnv* env, jclass clazz, jshort severity, + jshort verbosity) { if (!icing::lib::LogSeverity::Code_IsValid(severity)) { ICING_LOG(icing::lib::ERROR) << "Invalid value 
for logging severity: " << severity; @@ -497,9 +452,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeShouldLog( static_cast<icing::lib::LogSeverity::Code>(severity), verbosity); } -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetLoggingLevel( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { +jboolean nativeSetLoggingLevel(JNIEnv* env, jclass clazz, jshort severity, + jshort verbosity) { if (!icing::lib::LogSeverity::Code_IsValid(severity)) { ICING_LOG(icing::lib::ERROR) << "Invalid value for logging severity: " << severity; @@ -509,216 +463,111 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetLoggingLevel( static_cast<icing::lib::LogSeverity::Code>(severity), verbosity); } -JNIEXPORT jstring JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetLoggingTag( - JNIEnv* env, jclass clazz) { +jstring nativeGetLoggingTag(JNIEnv* env, jclass clazz) { return env->NewStringUTF(icing::lib::kIcingLoggingTag); } -// TODO(b/240333360) Remove the methods below for IcingSearchEngine once we have -// a sync from Jetpack to g3 to contain the refactored IcingSearchEngine(with -// IcingSearchEngineImpl). 
-JNIEXPORT jlong JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeCreate( - JNIEnv* env, jclass clazz, jbyteArray icing_search_engine_options_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeCreate( - env, clazz, icing_search_engine_options_bytes); -} - -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDestroy(JNIEnv* env, - jclass clazz, - jobject object) { - Java_com_google_android_icing_IcingSearchEngineImpl_nativeDestroy(env, clazz, - object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeInitialize( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeInitialize( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSetSchema( - JNIEnv* env, jclass clazz, jobject object, jbyteArray schema_bytes, - jboolean ignore_errors_and_delete_documents) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetSchema( - env, clazz, object, schema_bytes, ignore_errors_and_delete_documents); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetSchema( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchema( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetSchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchemaType( - env, clazz, object, schema_type); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativePut( - JNIEnv* env, jclass clazz, jobject object, jbyteArray document_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativePut( - env, clazz, object, document_bytes); -} - -JNIEXPORT jbyteArray JNICALL 
-Java_com_google_android_icing_IcingSearchEngine_nativeGet( - JNIEnv* env, jclass clazz, jobject object, jstring name_space, jstring uri, - jbyteArray result_spec_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGet( - env, clazz, object, name_space, uri, result_spec_bytes); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeReportUsage( - JNIEnv* env, jclass clazz, jobject object, jbyteArray usage_report_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeReportUsage( - env, clazz, object, usage_report_bytes); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetAllNamespaces( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetAllNamespaces( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetNextPage( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token, - jlong java_to_native_start_timestamp_ms) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetNextPage( - env, clazz, object, next_page_token, java_to_native_start_timestamp_ms); -} - -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeInvalidateNextPageToken( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token) { - Java_com_google_android_icing_IcingSearchEngineImpl_nativeInvalidateNextPageToken( - env, clazz, object, next_page_token); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSearch( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jbyteArray scoring_spec_bytes, jbyteArray result_spec_bytes, - jlong java_to_native_start_timestamp_ms) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearch( - env, clazz, object, search_spec_bytes, scoring_spec_bytes, - result_spec_bytes, 
java_to_native_start_timestamp_ms); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDelete(JNIEnv* env, - jclass clazz, - jobject object, - jstring name_space, - jstring uri) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDelete( - env, clazz, object, name_space, uri); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByNamespace( - JNIEnv* env, jclass clazz, jobject object, jstring name_space) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByNamespace( - env, clazz, object, name_space); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteBySchemaType( - env, clazz, object, schema_type); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jboolean return_deleted_document_info) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByQuery( - env, clazz, object, search_spec_bytes, return_deleted_document_info); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk( - JNIEnv* env, jclass clazz, jobject object, jint persist_type_code) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativePersistToDisk( - env, clazz, object, persist_type_code); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeOptimize(JNIEnv* env, - jclass clazz, - jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeOptimize( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo( - JNIEnv* env, jclass clazz, 
jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetOptimizeInfo( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetStorageInfo( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetStorageInfo( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeReset(JNIEnv* env, - jclass clazz, - jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeReset( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSearchSuggestions( - JNIEnv* env, jclass clazz, jobject object, - jbyteArray suggestion_spec_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearchSuggestions( - env, clazz, object, suggestion_spec_bytes); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetDebugInfo( - JNIEnv* env, jclass clazz, jobject object, jint verbosity) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetDebugInfo( - env, clazz, object, verbosity); -} - -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeShouldLog( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeShouldLog( - env, clazz, severity, verbosity); -} +#pragma clang diagnostic ignored "-Wwrite-strings" +jint JNI_OnLoad(JavaVM* vm, void* reserved) { + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6) != JNI_OK) { + ICING_LOG(icing::lib::ERROR) << "ERROR: GetEnv failed"; + return JNI_ERR; + } -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSetLoggingLevel( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { - return 
Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetLoggingLevel( - env, clazz, severity, verbosity); -} + // Find your class. JNI_OnLoad is called from the correct class loader context + // for this to work. + jclass java_class = + env->FindClass("com/google/android/icing/IcingSearchEngineImpl"); + if (java_class == nullptr) { + return JNI_ERR; + } + JavaIcingSearchEngineImpl.native_pointer = + env->GetFieldID(java_class, "nativePointer", "J"); + + // Register your class' native methods. + static const JNINativeMethod methods[] = { + {"nativeCreate", "([B)J", reinterpret_cast<void*>(nativeCreate)}, + {"nativeDestroy", "(Lcom/google/android/icing/IcingSearchEngineImpl;)V", + reinterpret_cast<void*>(nativeDestroy)}, + {"nativeInitialize", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeInitialize)}, + {"nativeSetSchema", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[BZ)[B", + reinterpret_cast<void*>(nativeSetSchema)}, + {"nativeGetSchema", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeGetSchema)}, + {"nativeGetSchemaType", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B", + reinterpret_cast<void*>(nativeGetSchemaType)}, + {"nativePut", "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B", + reinterpret_cast<void*>(nativePut)}, + {"nativeGet", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/" + "String;Ljava/lang/String;[B)[B", + reinterpret_cast<void*>(nativeGet)}, + {"nativeReportUsage", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B", + reinterpret_cast<void*>(nativeReportUsage)}, + {"nativeGetAllNamespaces", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeGetAllNamespaces)}, + {"nativeGetNextPage", + "(Lcom/google/android/icing/IcingSearchEngineImpl;JJ)[B", + reinterpret_cast<void*>(nativeGetNextPage)}, + {"nativeInvalidateNextPageToken", + 
"(Lcom/google/android/icing/IcingSearchEngineImpl;J)V", + reinterpret_cast<void*>(nativeInvalidateNextPageToken)}, + {"nativeSearch", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[B[B[BJ)[B", + reinterpret_cast<void*>(nativeSearch)}, + {"nativeDelete", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/" + "String;Ljava/lang/String;)[B", + reinterpret_cast<void*>(nativeDelete)}, + {"nativeDeleteByNamespace", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B", + reinterpret_cast<void*>(nativeDeleteByNamespace)}, + {"nativeDeleteBySchemaType", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B", + reinterpret_cast<void*>(nativeDeleteBySchemaType)}, + {"nativeDeleteByQuery", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[BZ)[B", + reinterpret_cast<void*>(nativeDeleteByQuery)}, + {"nativePersistToDisk", + "(Lcom/google/android/icing/IcingSearchEngineImpl;I)[B", + reinterpret_cast<void*>(nativePersistToDisk)}, + {"nativeOptimize", "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeOptimize)}, + {"nativeGetOptimizeInfo", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeGetOptimizeInfo)}, + {"nativeGetStorageInfo", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeGetStorageInfo)}, + {"nativeReset", "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast<void*>(nativeReset)}, + {"nativeSearchSuggestions", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B", + reinterpret_cast<void*>(nativeSearchSuggestions)}, + {"nativeGetDebugInfo", + "(Lcom/google/android/icing/IcingSearchEngineImpl;I)[B", + reinterpret_cast<void*>(nativeGetDebugInfo)}, + {"nativeShouldLog", "(SS)Z", reinterpret_cast<void*>(nativeShouldLog)}, + {"nativeSetLoggingLevel", "(SS)Z", + reinterpret_cast<void*>(nativeSetLoggingLevel)}, + {"nativeGetLoggingTag", "()Ljava/lang/String;", + 
reinterpret_cast<void*>(nativeGetLoggingTag)}, + }; + int register_natives_success = env->RegisterNatives( + java_class, methods, sizeof(methods) / sizeof(JNINativeMethod)); + if (register_natives_success != JNI_OK) { + return register_natives_success; + } -JNIEXPORT jstring JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetLoggingTag( - JNIEnv* env, jclass clazz) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetLoggingTag( - env, clazz); + return JNI_VERSION_1_6; } } // extern "C" diff --git a/icing/join/document-id-to-join-info.h b/icing/join/document-id-to-join-info.h new file mode 100644 index 0000000..dee4885 --- /dev/null +++ b/icing/join/document-id-to-join-info.h @@ -0,0 +1,67 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JOIN_DOCUMENT_ID_TO_JOIN_INFO_H_ +#define ICING_JOIN_DOCUMENT_ID_TO_JOIN_INFO_H_ + +#include <utility> + +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { + +// DocumentIdToJoinInfo is composed of document_id and its join info. +// - QualifiedId join: join info is the referenced document's namespace_id + +// fingerprint(uri). +// - String join: join info is the term id. +// - Integer join: join info is the integer. +// +// DocumentIdToJoinInfo will be stored in posting list. 
+template <typename JoinInfoType> +class DocumentIdToJoinInfo { + public: + static DocumentIdToJoinInfo<JoinInfoType> GetInvalid() { + return DocumentIdToJoinInfo<JoinInfoType>(kInvalidDocumentId, + JoinInfoType()); + } + + explicit DocumentIdToJoinInfo(DocumentId document_id, JoinInfoType join_info) + : document_id_(document_id), join_info_(std::move(join_info)) {} + + DocumentId document_id() const { return document_id_; } + const JoinInfoType& join_info() const { return join_info_; } + + bool is_valid() const { return IsDocumentIdValid(document_id_); } + + bool operator<(const DocumentIdToJoinInfo<JoinInfoType>& other) const { + if (document_id_ != other.document_id_) { + return document_id_ < other.document_id_; + } + return join_info_ < other.join_info_; + } + + bool operator==(const DocumentIdToJoinInfo<JoinInfoType>& other) const { + return document_id_ == other.document_id_ && join_info_ == other.join_info_; + } + + private: + DocumentId document_id_; + JoinInfoType join_info_; +} __attribute__((packed)); + +} // namespace lib +} // namespace icing + +#endif // ICING_JOIN_DOCUMENT_ID_TO_JOIN_INFO_H_ diff --git a/icing/join/join-children-fetcher.h b/icing/join/join-children-fetcher.h index 5f799b8..1b875bc 100644 --- a/icing/join/join-children-fetcher.h +++ b/icing/join/join-children-fetcher.h @@ -44,7 +44,7 @@ class JoinChildrenFetcher { // Get a vector of children ScoredDocumentHit by parent document id. // // TODO(b/256022027): Implement property value joins with types of string and - // int. In these cases, GetChildren should look up joinable cache to fetch + // int. In these cases, GetChildren should look up join index to fetch // joinable property value of the given parent_doc_id according to // join_spec_.parent_property_expression, and then fetch children by the // corresponding map in this class using the joinable property value. 
diff --git a/icing/join/join-processor.cc b/icing/join/join-processor.cc index e27b1ea..1b7ca0d 100644 --- a/icing/join/join-processor.cc +++ b/icing/join/join-processor.cc @@ -29,6 +29,7 @@ #include "icing/join/aggregation-scorer.h" #include "icing/join/doc-join-info.h" #include "icing/join/join-children-fetcher.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" @@ -37,6 +38,7 @@ #include "icing/scoring/scored-document-hit.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-fingerprint-identifier.h" #include "icing/util/status-macros.h" namespace icing { @@ -53,17 +55,121 @@ JoinProcessor::GetChildrenFetcher( "Parent property expression must be ", kQualifiedIdExpr)); } - std::sort( - child_scored_document_hits.begin(), child_scored_document_hits.end(), - ScoredDocumentHitComparator( - /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() == - ScoringSpecProto::Order::DESC)); - - // TODO(b/256022027): - // - Optimization - // - Cache property to speed up property retrieval. - // - If there is no cache, then we still have the flexibility to fetch it - // from actual docs via DocumentStore. + ScoredDocumentHitComparator score_comparator( + /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() == + ScoringSpecProto::Order::DESC); + + if (qualified_id_join_index_->is_v2()) { + // v2 + // Step 1a: sort child ScoredDocumentHits in document id descending order. + std::sort(child_scored_document_hits.begin(), + child_scored_document_hits.end(), + [](const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) { + return lhs.document_id() > rhs.document_id(); + }); + + // Step 1b: group all child ScoredDocumentHits by the document's + // schema_type_id. 
+ std::unordered_map<SchemaTypeId, std::vector<ScoredDocumentHit>> + schema_to_child_scored_doc_hits_map; + for (const ScoredDocumentHit& child_scored_document_hit : + child_scored_document_hits) { + std::optional<DocumentFilterData> child_doc_filter_data = + doc_store_->GetAliveDocumentFilterData( + child_scored_document_hit.document_id(), current_time_ms_); + if (!child_doc_filter_data) { + continue; + } + + schema_to_child_scored_doc_hits_map[child_doc_filter_data + ->schema_type_id()] + .push_back(child_scored_document_hit); + } + + // Step 1c: for each schema_type_id, lookup QualifiedIdJoinIndexImplV2 to + // fetch all child join data from posting list(s). Convert all + // child join data to referenced parent document ids and bucketize + // child ScoredDocumentHits by it. + std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>> + parent_to_child_docs_map; + for (auto& [schema_type_id, grouped_child_scored_doc_hits] : + schema_to_child_scored_doc_hits_map) { + // Get joinable_property_id of this schema. + ICING_ASSIGN_OR_RETURN( + const JoinablePropertyMetadata* metadata, + schema_store_->GetJoinablePropertyMetadata( + schema_type_id, join_spec.child_property_expression())); + if (metadata == nullptr || + metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) { + // Currently we only support qualified id, so skip other types. + continue; + } + + // Lookup QualifiedIdJoinIndexImplV2. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase> + join_index_iter, + qualified_id_join_index_->GetIterator( + schema_type_id, /*joinable_property_id=*/metadata->id)); + + // - Join index contains all join data of schema_type_id and + // join_index_iter will return all of them in (child) document id + // descending order. + // - But we only need join data of child document ids which appear in + // grouped_child_scored_doc_hits. 
Also grouped_child_scored_doc_hits + // contain ScoredDocumentHits in (child) document id descending order. + // - Therefore, we advance 2 iterators to intersect them and get desired + // join data. + auto child_scored_doc_hits_iter = grouped_child_scored_doc_hits.cbegin(); + while (join_index_iter->Advance().ok() && + child_scored_doc_hits_iter != + grouped_child_scored_doc_hits.cend()) { + // Advance child_scored_doc_hits_iter until it points to a + // ScoredDocumentHit with document id <= the one pointed by + // join_index_iter. + while (child_scored_doc_hits_iter != + grouped_child_scored_doc_hits.cend() && + child_scored_doc_hits_iter->document_id() > + join_index_iter->GetCurrent().document_id()) { + ++child_scored_doc_hits_iter; + } + + if (child_scored_doc_hits_iter != + grouped_child_scored_doc_hits.cend() && + child_scored_doc_hits_iter->document_id() == + join_index_iter->GetCurrent().document_id()) { + // We get a join data whose child document id exists in both join + // index and grouped_child_scored_doc_hits. Convert its join info to + // referenced parent document ids and bucketize ScoredDocumentHits by + // it (putting into parent_to_child_docs_map). + const NamespaceFingerprintIdentifier& ref_ns_id = + join_index_iter->GetCurrent().join_info(); + libtextclassifier3::StatusOr<DocumentId> ref_parent_doc_id_or = + doc_store_->GetDocumentId(ref_ns_id); + if (ref_parent_doc_id_or.ok()) { + parent_to_child_docs_map[std::move(ref_parent_doc_id_or) + .ValueOrDie()] + .push_back(*child_scored_doc_hits_iter); + } + } + } + } + + // Step 1d: finally, sort each parent's joined child ScoredDocumentHits by + // score. 
+ for (auto& [parent_doc_id, bucketized_child_scored_hits] : + parent_to_child_docs_map) { + std::sort(bucketized_child_scored_hits.begin(), + bucketized_child_scored_hits.end(), score_comparator); + } + + return JoinChildrenFetcher(join_spec, std::move(parent_to_child_docs_map)); + } + + // v1 + // TODO(b/275121148): deprecate this part after rollout v2. + std::sort(child_scored_document_hits.begin(), + child_scored_document_hits.end(), score_comparator); // Step 1: group child documents by parent documentId. Currently we only // support QualifiedId joining, so fetch the qualified id content of diff --git a/icing/join/join-processor.h b/icing/join/join-processor.h index 347ce85..517e9db 100644 --- a/icing/join/join-processor.h +++ b/icing/join/join-processor.h @@ -22,7 +22,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/join/join-children-fetcher.h" -#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/proto/search.pb.h" #include "icing/schema/schema-store.h" #include "icing/scoring/scored-document-hit.h" @@ -35,10 +35,10 @@ class JoinProcessor { public: static constexpr std::string_view kQualifiedIdExpr = "this.qualifiedId()"; - explicit JoinProcessor( - const DocumentStore* doc_store, const SchemaStore* schema_store, - const QualifiedIdTypeJoinableIndex* qualified_id_join_index, - int64_t current_time_ms) + explicit JoinProcessor(const DocumentStore* doc_store, + const SchemaStore* schema_store, + const QualifiedIdJoinIndex* qualified_id_join_index, + int64_t current_time_ms) : doc_store_(doc_store), schema_store_(schema_store), qualified_id_join_index_(qualified_id_join_index), @@ -72,14 +72,13 @@ class JoinProcessor { // - kInvalidDocumentId if the given document is not found, doesn't have // qualified id joinable type for the given property_path, or doesn't have // joinable value (an optional property) - // - Any other QualifiedIdTypeJoinableIndex errors + // 
- Any other QualifiedIdJoinIndex errors libtextclassifier3::StatusOr<DocumentId> FetchReferencedQualifiedId( const DocumentId& document_id, const std::string& property_path) const; const DocumentStore* doc_store_; // Does not own. const SchemaStore* schema_store_; // Does not own. - const QualifiedIdTypeJoinableIndex* - qualified_id_join_index_; // Does not own. + const QualifiedIdJoinIndex* qualified_id_join_index_; // Does not own. int64_t current_time_ms_; }; diff --git a/icing/join/join-processor_test.cc b/icing/join/join-processor_test.cc index 95d1392..a40d934 100644 --- a/icing/join/join-processor_test.cc +++ b/icing/join/join-processor_test.cc @@ -22,11 +22,15 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/absl_ports/canonical_errors.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/file/portable-file-backed-proto-log.h" +#include "icing/join/join-children-fetcher.h" +#include "icing/join/qualified-id-join-index-impl-v1.h" +#include "icing/join/qualified-id-join-index-impl-v2.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id-join-indexing-handler.h" -#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" @@ -58,6 +62,9 @@ namespace { using ::testing::ElementsAre; using ::testing::IsTrue; +// TODO(b/275121148): remove template after deprecating +// QualifiedIdJoinIndexImplV1. 
+template <typename T> class JoinProcessorTest : public ::testing::Test { protected: void SetUp() override { @@ -108,6 +115,25 @@ class JoinProcessorTest : public ::testing::Test { .SetDataTypeJoinableString( JOINABLE_VALUE_TYPE_QUALIFIED_ID) .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("content") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("receiver") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ASSERT_THAT(schema_store_->SetSchema( schema, /*ignore_errors_and_delete_documents=*/false, @@ -118,20 +144,18 @@ class JoinProcessorTest : public ::testing::Test { IsTrue()); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/true, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN( - qualified_id_join_index_, - QualifiedIdTypeJoinableIndex::Create( - filesystem_, qualified_id_join_index_dir_, - /*pre_mapping_fbv=*/false, 
/*use_persistent_hash_map=*/false)); + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + CreateQualifiedIdJoinIndex<T>()); } void TearDown() override { @@ -142,6 +166,28 @@ class JoinProcessorTest : public ::testing::Test { filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); } + template <typename UnknownJoinIndexType> + libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>> + CreateQualifiedIdJoinIndex() { + return absl_ports::InvalidArgumentError("Unknown type"); + } + + template <> + libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>> + CreateQualifiedIdJoinIndex<QualifiedIdJoinIndexImplV1>() { + return QualifiedIdJoinIndexImplV1::Create( + filesystem_, qualified_id_join_index_dir_, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false); + } + + template <> + libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>> + CreateQualifiedIdJoinIndex<QualifiedIdJoinIndexImplV2>() { + return QualifiedIdJoinIndexImplV2::Create(filesystem_, + qualified_id_join_index_dir_, + /*pre_mapping_fbv=*/false); + } + libtextclassifier3::StatusOr<DocumentId> PutAndIndexDocument( const DocumentProto& document) { ICING_ASSIGN_OR_RETURN(DocumentId document_id, doc_store_->Put(document)); @@ -152,7 +198,7 @@ class JoinProcessorTest : public ::testing::Test { ICING_ASSIGN_OR_RETURN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); ICING_RETURN_IF_ERROR(handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false, @@ -162,8 +208,8 @@ class JoinProcessorTest : public ::testing::Test { libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>> Join( const JoinSpecProto& join_spec, - std::vector<ScoredDocumentHit>&& parent_scored_document_hits, - std::vector<ScoredDocumentHit>&& child_scored_document_hits) { + std::vector<ScoredDocumentHit> 
parent_scored_document_hits, + std::vector<ScoredDocumentHit> child_scored_document_hits) { JoinProcessor join_processor( doc_store_.get(), schema_store_.get(), qualified_id_join_index_.get(), /*current_time_ms=*/fake_clock_.GetSystemTimeMilliseconds()); @@ -185,12 +231,16 @@ class JoinProcessorTest : public ::testing::Test { std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<SchemaStore> schema_store_; std::unique_ptr<DocumentStore> doc_store_; - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_; FakeClock fake_clock_; }; -TEST_F(JoinProcessorTest, JoinByQualifiedId) { +using TestTypes = + ::testing::Types<QualifiedIdJoinIndexImplV1, QualifiedIdJoinIndexImplV2>; +TYPED_TEST_SUITE(JoinProcessorTest, TestTypes); + +TYPED_TEST(JoinProcessorTest, JoinByQualifiedId_allDocuments) { DocumentProto person1 = DocumentBuilder() .SetKey("pkg$db/namespace", "person1") .SetSchema("Person") @@ -226,15 +276,15 @@ TEST_F(JoinProcessorTest, JoinByQualifiedId) { .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - PutAndIndexDocument(person1)); + this->PutAndIndexDocument(person1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, - PutAndIndexDocument(person2)); + this->PutAndIndexDocument(person2)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, - PutAndIndexDocument(email1)); + this->PutAndIndexDocument(email1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, - PutAndIndexDocument(email2)); + this->PutAndIndexDocument(email2)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, - PutAndIndexDocument(email3)); + this->PutAndIndexDocument(email3)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -266,8 +316,8 @@ TEST_F(JoinProcessorTest, JoinByQualifiedId) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<JoinedScoredDocumentHit> joined_result_document_hits, - Join(join_spec, std::move(parent_scored_document_hits), - 
std::move(child_scored_document_hits))); + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); EXPECT_THAT( joined_result_document_hits, ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( @@ -281,7 +331,112 @@ TEST_F(JoinProcessorTest, JoinByQualifiedId) { {scored_doc_hit5, scored_doc_hit3})))); } -TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithoutJoiningProperty) { +TYPED_TEST(JoinProcessorTest, JoinByQualifiedId_partialDocuments) { + DocumentProto person1 = DocumentBuilder() + .SetKey("pkg$db/namespace", "person1") + .SetSchema("Person") + .AddStringProperty("Name", "Alice") + .Build(); + DocumentProto person2 = DocumentBuilder() + .SetKey("pkg$db/namespace", "person2") + .SetSchema("Person") + .AddStringProperty("Name", "Bob") + .Build(); + DocumentProto person3 = DocumentBuilder() + .SetKey("pkg$db/namespace", "person3") + .SetSchema("Person") + .AddStringProperty("Name", "Eve") + .Build(); + + DocumentProto email1 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email1") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 1") + .AddStringProperty("sender", "pkg$db/namespace#person1") + .Build(); + DocumentProto email2 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email2") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 2") + .AddStringProperty("sender", "pkg$db/namespace#person2") + .Build(); + DocumentProto email3 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email3") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 3") + .AddStringProperty("sender", "pkg$db/namespace#person3") + .Build(); + DocumentProto email4 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email4") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 4") + .AddStringProperty("sender", "pkg$db/namespace#person1") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + this->PutAndIndexDocument(person1)); + 
ICING_ASSERT_OK(/*document_id2 unused*/ + this->PutAndIndexDocument(person2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + this->PutAndIndexDocument(person3)); + ICING_ASSERT_OK(/*document_id4 unused*/ + this->PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, + this->PutAndIndexDocument(email2)); + ICING_ASSERT_OK(/*document_id6 unused*/ + this->PutAndIndexDocument(email3)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id7, + this->PutAndIndexDocument(email4)); + + ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, + /*score=*/0.0); + ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone, + /*score=*/0.0); + ScoredDocumentHit scored_doc_hit5(document_id5, kSectionIdMaskNone, + /*score=*/4.0); + ScoredDocumentHit scored_doc_hit7(document_id7, kSectionIdMaskNone, + /*score=*/5.0); + + // Only join person1, person3, email2 and email4. + // Parent ScoredDocumentHits: person1, person3 + std::vector<ScoredDocumentHit> parent_scored_document_hits = { + scored_doc_hit3, scored_doc_hit1}; + + // Child ScoredDocumentHits: email2, email4 + std::vector<ScoredDocumentHit> child_scored_document_hits = {scored_doc_hit7, + scored_doc_hit5}; + + JoinSpecProto join_spec; + join_spec.set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec.set_child_property_expression("sender"); + join_spec.set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by( + ScoringSpecProto::Order::DESC); + + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<JoinedScoredDocumentHit> joined_result_document_hits, + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); + EXPECT_THAT( + joined_result_document_hits, + ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( + /*final_score=*/0.0, + /*parent_scored_document_hit=*/scored_doc_hit3, 
+ /*child_scored_document_hits=*/{})), + EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( + /*final_score=*/1.0, + /*parent_scored_document_hit=*/scored_doc_hit1, + /*child_scored_document_hits=*/{scored_doc_hit7})))); +} + +TYPED_TEST(JoinProcessorTest, + ShouldIgnoreChildDocumentsWithoutJoiningProperty) { DocumentProto person1 = DocumentBuilder() .SetKey("pkg$db/namespace", "person1") .SetSchema("Person") @@ -302,11 +457,11 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithoutJoiningProperty) { .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - PutAndIndexDocument(person1)); + this->PutAndIndexDocument(person1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, - PutAndIndexDocument(email1)); + this->PutAndIndexDocument(email1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, - PutAndIndexDocument(email2)); + this->PutAndIndexDocument(email2)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -334,8 +489,8 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithoutJoiningProperty) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<JoinedScoredDocumentHit> joined_result_document_hits, - Join(join_spec, std::move(parent_scored_document_hits), - std::move(child_scored_document_hits))); + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); // Since Email2 doesn't have "sender" property, it should be ignored. 
EXPECT_THAT( joined_result_document_hits, @@ -344,7 +499,8 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithoutJoiningProperty) { /*child_scored_document_hits=*/{scored_doc_hit2})))); } -TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithInvalidQualifiedId) { +TYPED_TEST(JoinProcessorTest, + ShouldIgnoreChildDocumentsWithInvalidQualifiedId) { DocumentProto person1 = DocumentBuilder() .SetKey("pkg$db/namespace", "person1") .SetSchema("Person") @@ -378,13 +534,13 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithInvalidQualifiedId) { .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - PutAndIndexDocument(person1)); + this->PutAndIndexDocument(person1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, - PutAndIndexDocument(email1)); + this->PutAndIndexDocument(email1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, - PutAndIndexDocument(email2)); + this->PutAndIndexDocument(email2)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, - PutAndIndexDocument(email3)); + this->PutAndIndexDocument(email3)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -414,8 +570,8 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithInvalidQualifiedId) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<JoinedScoredDocumentHit> joined_result_document_hits, - Join(join_spec, std::move(parent_scored_document_hits), - std::move(child_scored_document_hits))); + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); // Email 2 and email 3 (document id 3 and 4) contain invalid qualified ids. // Join processor should ignore them. 
EXPECT_THAT(joined_result_document_hits, @@ -425,7 +581,7 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithInvalidQualifiedId) { /*child_scored_document_hits=*/{scored_doc_hit2})))); } -TEST_F(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) { +TYPED_TEST(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) { DocumentProto person1 = DocumentBuilder() .SetKey("pkg$db/namespace", "person1") .SetSchema("Person") @@ -447,11 +603,11 @@ TEST_F(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) { .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - PutAndIndexDocument(person1)); + this->PutAndIndexDocument(person1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, - PutAndIndexDocument(person2)); + this->PutAndIndexDocument(person2)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, - PutAndIndexDocument(email1)); + this->PutAndIndexDocument(email1)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -478,8 +634,8 @@ TEST_F(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<JoinedScoredDocumentHit> joined_result_document_hits, - Join(join_spec, std::move(parent_scored_document_hits), - std::move(child_scored_document_hits))); + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); // Person1 has no child documents, but left join should also include it. 
EXPECT_THAT( joined_result_document_hits, @@ -493,7 +649,7 @@ TEST_F(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) { /*child_scored_document_hits=*/{})))); } -TEST_F(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) { +TYPED_TEST(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) { DocumentProto person1 = DocumentBuilder() .SetKey("pkg$db/namespace", "person1") .SetSchema("Person") @@ -523,13 +679,13 @@ TEST_F(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) { .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - PutAndIndexDocument(person1)); + this->PutAndIndexDocument(person1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, - PutAndIndexDocument(email1)); + this->PutAndIndexDocument(email1)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, - PutAndIndexDocument(email2)); + this->PutAndIndexDocument(email2)); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, - PutAndIndexDocument(email3)); + this->PutAndIndexDocument(email3)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -559,8 +715,8 @@ TEST_F(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<JoinedScoredDocumentHit> joined_result_document_hits, - Join(join_spec, std::move(parent_scored_document_hits), - std::move(child_scored_document_hits))); + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); // Child documents should be sorted according to the (nested) ranking // strategy. 
EXPECT_THAT( @@ -571,7 +727,7 @@ TEST_F(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) { {scored_doc_hit3, scored_doc_hit4, scored_doc_hit2})))); } -TEST_F(JoinProcessorTest, ShouldAllowSelfJoining) { +TYPED_TEST(JoinProcessorTest, ShouldAllowSelfJoining) { DocumentProto email1 = DocumentBuilder() .SetKey("pkg$db/namespace", "email1") @@ -581,7 +737,7 @@ TEST_F(JoinProcessorTest, ShouldAllowSelfJoining) { .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - PutAndIndexDocument(email1)); + this->PutAndIndexDocument(email1)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -604,8 +760,8 @@ TEST_F(JoinProcessorTest, ShouldAllowSelfJoining) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<JoinedScoredDocumentHit> joined_result_document_hits, - Join(join_spec, std::move(parent_scored_document_hits), - std::move(child_scored_document_hits))); + this->Join(join_spec, std::move(parent_scored_document_hits), + std::move(child_scored_document_hits))); EXPECT_THAT(joined_result_document_hits, ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( /*final_score=*/1.0, @@ -613,6 +769,156 @@ TEST_F(JoinProcessorTest, ShouldAllowSelfJoining) { /*child_scored_document_hits=*/{scored_doc_hit1})))); } +TYPED_TEST(JoinProcessorTest, MultipleChildSchemasJoining) { + DocumentProto person1 = DocumentBuilder() + .SetKey("pkg$db/namespace", "person1") + .SetSchema("Person") + .AddStringProperty("Name", "Alice") + .Build(); + DocumentProto person2 = DocumentBuilder() + .SetKey("pkg$db/namespace", "person2") + .SetSchema("Person") + .AddStringProperty("Name", "Bob") + .Build(); + + DocumentProto email1 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email1") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 1") + .AddStringProperty("sender", "pkg$db/namespace#person2") + .Build(); + DocumentProto email2 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email2") + .SetSchema("Email") + 
.AddStringProperty("subject", "test subject 2") + .AddStringProperty("sender", "pkg$db/namespace#person1") + .Build(); + DocumentProto email3 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "email3") + .SetSchema("Email") + .AddStringProperty("subject", "test subject 3") + .AddStringProperty("sender", "pkg$db/namespace#person1") + .Build(); + DocumentProto message1 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "message1") + .SetSchema("Message") + .AddStringProperty("content", "test content 1") + .AddStringProperty("sender", "pkg$db/namespace#person1") + .AddStringProperty("receiver", "pkg$db/namespace#person2") + .Build(); + DocumentProto message2 = + DocumentBuilder() + .SetKey("pkg$db/namespace", "message2") + .SetSchema("Message") + .AddStringProperty("content", "test content 2") + .AddStringProperty("sender", "pkg$db/namespace#person2") + .AddStringProperty("receiver", "pkg$db/namespace#person1") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + this->PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + this->PutAndIndexDocument(person2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + this->PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + this->PutAndIndexDocument(email2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, + this->PutAndIndexDocument(email3)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6, + this->PutAndIndexDocument(message1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id7, + this->PutAndIndexDocument(message2)); + + ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, + /*score=*/0.0); + ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone, + /*score=*/0.0); + ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone, + /*score=*/5.0); + ScoredDocumentHit scored_doc_hit4(document_id4, kSectionIdMaskNone, + /*score=*/3.0); + ScoredDocumentHit scored_doc_hit5(document_id5, 
kSectionIdMaskNone, + /*score=*/2.0); + ScoredDocumentHit scored_doc_hit6(document_id6, kSectionIdMaskNone, + /*score=*/4.0); + ScoredDocumentHit scored_doc_hit7(document_id7, kSectionIdMaskNone, + /*score=*/1.0); + + // Parent ScoredDocumentHits: all Person documents + std::vector<ScoredDocumentHit> parent_scored_document_hits = { + scored_doc_hit1, scored_doc_hit2}; + + // Child ScoredDocumentHits: all Email and Message documents + std::vector<ScoredDocumentHit> child_scored_document_hits = { + scored_doc_hit3, scored_doc_hit4, scored_doc_hit5, scored_doc_hit6, + scored_doc_hit7}; + + // Join by "sender". + // - Person1: [ + // email2 (scored_doc_hit4), + // email3 (scored_doc_hit5), + // message1 (scored_doc_hit6), + // ] + // - Person2: [ + // email1 (scored_doc_hit3), + // message2 (scored_doc_hit7), + // ] + JoinSpecProto join_spec; + join_spec.set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec.set_child_property_expression("sender"); + join_spec.set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by( + ScoringSpecProto::Order::DESC); + + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<JoinedScoredDocumentHit> joined_result_document_hits1, + this->Join(join_spec, parent_scored_document_hits, + child_scored_document_hits)); + EXPECT_THAT( + joined_result_document_hits1, + ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( + /*final_score=*/3.0, + /*parent_scored_document_hit=*/scored_doc_hit1, + /*child_scored_document_hits=*/ + {scored_doc_hit6, scored_doc_hit4, scored_doc_hit5})), + EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( + /*final_score=*/2.0, + /*parent_scored_document_hit=*/scored_doc_hit2, + /*child_scored_document_hits=*/ + {scored_doc_hit3, scored_doc_hit7})))); + + // Join by "receiver". 
+ // - Person1: [ + // message2 (scored_doc_hit7), + // ] + // - Person2: [ + // message1 (scored_doc_hit6), + // ] + join_spec.set_child_property_expression("receiver"); + + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<JoinedScoredDocumentHit> joined_result_document_hits2, + this->Join(join_spec, parent_scored_document_hits, + child_scored_document_hits)); + EXPECT_THAT( + joined_result_document_hits2, + ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( + /*final_score=*/1.0, + /*parent_scored_document_hit=*/scored_doc_hit1, + /*child_scored_document_hits=*/{scored_doc_hit7})), + EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit( + /*final_score=*/1.0, + /*parent_scored_document_hit=*/scored_doc_hit2, + /*child_scored_document_hits=*/{scored_doc_hit6})))); +} + // TODO(b/256022027): add unit tests for non-joinable property. If joinable // value type is unset, then qualifed id join should not // include the child document even if it contains a valid diff --git a/icing/join/posting-list-join-data-accessor.h b/icing/join/posting-list-join-data-accessor.h new file mode 100644 index 0000000..6669f9f --- /dev/null +++ b/icing/join/posting-list-join-data-accessor.h @@ -0,0 +1,211 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_ +#define ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_ + +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-accessor.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/file/posting_list/posting-list-used.h" +#include "icing/join/posting-list-join-data-serializer.h" +#include "icing/legacy/index/icing-bit-util.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +// This class is used to provide a simple abstraction for adding join data to +// posting lists. PostingListJoinDataAccessor handles: +// 1) selection of properly-sized posting lists for the accumulated join index +// data during Finalize() +// 2) chaining of max-sized posting lists. +template <typename JoinDataType> +class PostingListJoinDataAccessor : public PostingListAccessor { + public: + // Creates an empty PostingListJoinDataAccessor. + // + // RETURNS: + // - On success, a valid instance of PostingListJoinDataAccessor + // - INVALID_ARGUMENT error if storage has an invalid block_size. + static libtextclassifier3::StatusOr< + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>> + Create(FlashIndexStorage* storage, + PostingListJoinDataSerializer<JoinDataType>* serializer); + + // Creates a PostingListJoinDataAccessor with an existing posting list + // identified by existing_posting_list_id. + // + // RETURNS: + // - On success, a valid instance of PostingListJoinDataAccessor + // - INVALID_ARGUMENT if storage has an invalid block_size. 
+ static libtextclassifier3::StatusOr< + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>> + CreateFromExisting(FlashIndexStorage* storage, + PostingListJoinDataSerializer<JoinDataType>* serializer, + PostingListIdentifier existing_posting_list_id); + + PostingListSerializer* GetSerializer() override { return serializer_; } + + // Retrieves the next batch of data in the posting list chain. + // + // RETURNS: + // - On success, a vector of join data in the posting list chain + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. + libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetNextDataBatch(); + + // Prepends one data. This may result in flushing the posting list to disk (if + // the PostingListJoinDataAccessor holds a max-sized posting list that is + // full) or freeing a pre-existing posting list if it is too small to fit all + // data necessary. + // + // RETURNS: + // - OK, on success + // - INVALID_ARGUMENT if !data.is_valid() or if data is greater than the + // previously added data. + // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new + // posting list. + libtextclassifier3::Status PrependData(const JoinDataType& data); + + private: + explicit PostingListJoinDataAccessor( + FlashIndexStorage* storage, PostingListUsed in_memory_posting_list, + PostingListJoinDataSerializer<JoinDataType>* serializer) + : PostingListAccessor(storage, std::move(in_memory_posting_list)), + serializer_(serializer) {} + + PostingListJoinDataSerializer<JoinDataType>* serializer_; // Does not own. 
+}; + +template <typename JoinDataType> +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>> +PostingListJoinDataAccessor<JoinDataType>::Create( + FlashIndexStorage* storage, + PostingListJoinDataSerializer<JoinDataType>* serializer) { + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + storage->block_size(), serializer->GetDataTypeBytes()); + ICING_ASSIGN_OR_RETURN(PostingListUsed in_memory_posting_list, + PostingListUsed::CreateFromUnitializedRegion( + serializer, max_posting_list_bytes)); + return std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>( + new PostingListJoinDataAccessor<JoinDataType>( + storage, std::move(in_memory_posting_list), serializer)); +} + +template <typename JoinDataType> +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>> +PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + FlashIndexStorage* storage, + PostingListJoinDataSerializer<JoinDataType>* serializer, + PostingListIdentifier existing_posting_list_id) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + Create(storage, serializer)); + ICING_ASSIGN_OR_RETURN(PostingListHolder holder, + storage->GetPostingList(existing_posting_list_id)); + pl_accessor->preexisting_posting_list_ = + std::make_unique<PostingListHolder>(std::move(holder)); + return pl_accessor; +} + +// Returns the next batch of join data for the provided posting list. 
+template <typename JoinDataType> +libtextclassifier3::StatusOr<std::vector<JoinDataType>> +PostingListJoinDataAccessor<JoinDataType>::GetNextDataBatch() { + if (preexisting_posting_list_ == nullptr) { + if (has_reached_posting_list_chain_end_) { + return std::vector<JoinDataType>(); + } + return absl_ports::FailedPreconditionError( + "Cannot retrieve data from a PostingListJoinDataAccessor that was not " + "created from a preexisting posting list."); + } + ICING_ASSIGN_OR_RETURN( + std::vector<JoinDataType> batch, + serializer_->GetData(&preexisting_posting_list_->posting_list)); + uint32_t next_block_index = kInvalidBlockIndex; + // Posting lists will only be chained when they are max-sized, in which case + // next_block_index will point to the next block for the next posting list. + // Otherwise, next_block_index can be kInvalidBlockIndex or be used to point + // to the next free list block, which is not relevant here. + if (preexisting_posting_list_->posting_list.size_in_bytes() == + storage_->max_posting_list_bytes()) { + next_block_index = preexisting_posting_list_->next_block_index; + } + + if (next_block_index != kInvalidBlockIndex) { + // Since we only have to deal with next block for max-sized posting list + // block, max_num_posting_lists is 1 and posting_list_index_bits is + // BitsToStore(1). + PostingListIdentifier next_posting_list_id( + next_block_index, /*posting_list_index=*/0, + /*posting_list_index_bits=*/BitsToStore(1)); + ICING_ASSIGN_OR_RETURN(PostingListHolder holder, + storage_->GetPostingList(next_posting_list_id)); + preexisting_posting_list_ = + std::make_unique<PostingListHolder>(std::move(holder)); + } else { + has_reached_posting_list_chain_end_ = true; + preexisting_posting_list_.reset(); + } + return batch; +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataAccessor<JoinDataType>::PrependData( + const JoinDataType& data) { + PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr) + ? 
preexisting_posting_list_->posting_list + : in_memory_posting_list_; + libtextclassifier3::Status status = + serializer_->PrependData(&active_pl, data); + if (!absl_ports::IsResourceExhausted(status)) { + return status; + } + // There is no more room to add data to this current posting list! Therefore, + // we need to either move those data to a larger posting list or flush this + // posting list and create another max-sized posting list in the chain. + if (preexisting_posting_list_ != nullptr) { + ICING_RETURN_IF_ERROR(FlushPreexistingPostingList()); + } else { + ICING_RETURN_IF_ERROR(FlushInMemoryPostingList()); + } + + // Re-add data. Should always fit since we just cleared + // in_memory_posting_list_. It's fine to explicitly reference + // in_memory_posting_list_ here because there's no way of reaching this line + // while preexisting_posting_list_ is still in use. + return serializer_->PrependData(&in_memory_posting_list_, data); +} + +} // namespace lib +} // namespace icing + +#endif // ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_ diff --git a/icing/join/posting-list-join-data-accessor_test.cc b/icing/join/posting-list-join-data-accessor_test.cc new file mode 100644 index 0000000..ddc2d32 --- /dev/null +++ b/icing/join/posting-list-join-data-accessor_test.cc @@ -0,0 +1,435 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/join/posting-list-join-data-accessor.h" + +#include <cstdint> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-accessor.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/join/posting-list-join-data-serializer.h" +#include "icing/store/document-id.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; +using ::testing::Eq; +using ::testing::Lt; +using ::testing::Ne; +using ::testing::SizeIs; + +using JoinDataType = DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>; + +static constexpr NamespaceId kDefaultNamespaceId = 1; + +class PostingListJoinDataAccessorTest : public ::testing::Test { + protected: + void SetUp() override { + test_dir_ = GetTestTempDir() + "/test_dir"; + file_name_ = test_dir_ + "/test_file.idx.index"; + + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str())); + + serializer_ = + std::make_unique<PostingListJoinDataSerializer<JoinDataType>>(); + + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)); + } + + void TearDown() override { + flash_index_storage_.reset(); + serializer_.reset(); + 
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); + } + + Filesystem filesystem_; + std::string test_dir_; + std::string file_name_; + std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> serializer_; + std::unique_ptr<FlashIndexStorage> flash_index_storage_; +}; + +std::vector<JoinDataType> CreateData(int num_data, DocumentId start_document_id, + NamespaceId ref_namespace_id, + uint64_t start_ref_hash_uri) { + std::vector<JoinDataType> data; + data.reserve(num_data); + for (int i = 0; i < num_data; ++i) { + data.push_back(JoinDataType( + start_document_id, + NamespaceFingerprintIdentifier(ref_namespace_id, + /*fingerprint=*/start_ref_hash_uri))); + + ++start_document_id; + ++start_ref_hash_uri; + } + return data; +} + +TEST_F(PostingListJoinDataAccessorTest, DataAddAndRetrieveProperly) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + // Add some join data + std::vector<JoinDataType> data_vec = + CreateData(/*num_data=*/5, /*start_document_id=*/0, + /*ref_namespace_id=*/kDefaultNamespaceId, + /*start_ref_hash_uri=*/819); + for (const JoinDataType& data : data_vec) { + EXPECT_THAT(pl_accessor->PrependData(data), IsOk()); + } + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + EXPECT_THAT(result.status, IsOk()); + EXPECT_THAT(result.id.block_index(), Eq(1)); + EXPECT_THAT(result.id.posting_list_index(), Eq(0)); + + // Retrieve some data. 
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result.id)); + EXPECT_THAT( + serializer_->GetData(&pl_holder.posting_list), + IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend()))); + EXPECT_THAT(pl_holder.next_block_index, Eq(kInvalidBlockIndex)); +} + +TEST_F(PostingListJoinDataAccessorTest, PreexistingPLKeepOnSameBlock) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + // Add a single data. This will fit in a min-sized posting list. + JoinDataType data1( + /*document_id=*/1, + NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/123)); + ICING_ASSERT_OK(pl_accessor->PrependData(data1)); + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result1.status); + // Should be allocated to the first block. + ASSERT_THAT(result1.id.block_index(), Eq(1)); + ASSERT_THAT(result1.id.posting_list_index(), Eq(0)); + + // Add one more data. The minimum size for a posting list must be able to fit + // two data, so this should NOT cause the previous pl to be reallocated. + ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + JoinDataType data2( + /*document_id=*/2, + NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/456)); + ICING_ASSERT_OK(pl_accessor->PrependData(data2)); + PostingListAccessor::FinalizeResult result2 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result2.status); + // Should be in the same posting list. + EXPECT_THAT(result2.id, Eq(result1.id)); + + // The posting list at result2.id should hold all of the data that have been + // added. 
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result2.id)); + EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list), + IsOkAndHolds(ElementsAre(data2, data1))); +} + +TEST_F(PostingListJoinDataAccessorTest, PreexistingPLReallocateToLargerPL) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + // Adding 3 data should cause Finalize allocating a 56-byte posting list, + // which can store at most 4 data. + std::vector<JoinDataType> data_vec1 = + CreateData(/*num_data=*/3, /*start_document_id=*/0, + /*ref_namespace_id=*/kDefaultNamespaceId, + /*start_ref_hash_uri=*/819); + for (const JoinDataType& data : data_vec1) { + ICING_ASSERT_OK(pl_accessor->PrependData(data)); + } + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result1.status); + // Should be allocated to the first block. + ASSERT_THAT(result1.id.block_index(), Eq(1)); + ASSERT_THAT(result1.id.posting_list_index(), Eq(0)); + + // Now add more data. + ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + // The current posting list can fit 1 more data. Adding 12 more data should + // result in these data being moved to a larger posting list. Also the total + // size of these data won't exceed max size posting list, so there will be + // only one single posting list and no chain. 
+ std::vector<JoinDataType> data_vec2 = CreateData( + /*num_data=*/12, /*start_document_id=*/data_vec1.back().document_id() + 1, + /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819); + + for (const JoinDataType& data : data_vec2) { + ICING_ASSERT_OK(pl_accessor->PrependData(data)); + } + PostingListAccessor::FinalizeResult result2 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result2.status); + // Should be allocated to the second (new) block because the posting list + // should grow beyond the size that the first block maintains. + EXPECT_THAT(result2.id.block_index(), Eq(2)); + EXPECT_THAT(result2.id.posting_list_index(), Eq(0)); + + // The posting list at result2.id should hold all of the data that have been + // added. + std::vector<JoinDataType> all_data_vec; + all_data_vec.reserve(data_vec1.size() + data_vec2.size()); + all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end()); + all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end()); + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result2.id)); + EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list), + IsOkAndHolds(ElementsAreArray(all_data_vec.rbegin(), + all_data_vec.rend()))); +} + +TEST_F(PostingListJoinDataAccessorTest, MultiBlockChainsBlocksProperly) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType) + // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data. + // Adding 292 data should cause: + // - 2 max size posting lists being allocated to block 1 and block 2. 
+ // - Chaining: block 2 -> block 1 + std::vector<JoinDataType> data_vec = CreateData( + /*num_data=*/292, /*start_document_id=*/0, + /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819); + for (const JoinDataType& data : data_vec) { + ICING_ASSERT_OK(pl_accessor->PrependData(data)); + } + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result1.status); + PostingListIdentifier second_block_id = result1.id; + // Should be allocated to the second block. + EXPECT_THAT(second_block_id, Eq(PostingListIdentifier( + /*block_index=*/2, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0))); + + // We should be able to retrieve all data. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(second_block_id)); + // This pl_holder will only hold a posting list with the data that didn't fit + // on the first block. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data, + serializer_->GetData(&pl_holder.posting_list)); + ASSERT_THAT(second_block_data, SizeIs(Lt(data_vec.size()))); + auto first_block_data_start = data_vec.rbegin() + second_block_data.size(); + EXPECT_THAT(second_block_data, + ElementsAreArray(data_vec.rbegin(), first_block_data_start)); + + // Now retrieve all of the data that were on the first block. 
+ uint32_t first_block_id = pl_holder.next_block_index; + EXPECT_THAT(first_block_id, Eq(1)); + + PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0); + ICING_ASSERT_OK_AND_ASSIGN(pl_holder, + flash_index_storage_->GetPostingList(pl_id)); + EXPECT_THAT( + serializer_->GetData(&pl_holder.posting_list), + IsOkAndHolds(ElementsAreArray(first_block_data_start, data_vec.rend()))); +} + +TEST_F(PostingListJoinDataAccessorTest, + PreexistingMultiBlockReusesBlocksProperly) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType) + // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data. + // Adding 292 data will cause: + // - 2 max size posting lists being allocated to block 1 and block 2. + // - Chaining: block 2 -> block 1 + std::vector<JoinDataType> data_vec1 = CreateData( + /*num_data=*/292, /*start_document_id=*/0, + /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819); + for (const JoinDataType& data : data_vec1) { + ICING_ASSERT_OK(pl_accessor->PrependData(data)); + } + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result1.status); + PostingListIdentifier first_add_id = result1.id; + EXPECT_THAT(first_add_id, Eq(PostingListIdentifier( + /*block_index=*/2, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0))); + + // Now add more data. These should fit on the existing second block and not + // fill it up. 
+ ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), first_add_id)); + std::vector<JoinDataType> data_vec2 = CreateData( + /*num_data=*/10, /*start_document_id=*/data_vec1.back().document_id() + 1, + /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819); + for (const JoinDataType& data : data_vec2) { + ICING_ASSERT_OK(pl_accessor->PrependData(data)); + } + PostingListAccessor::FinalizeResult result2 = + std::move(*pl_accessor).Finalize(); + ICING_ASSERT_OK(result2.status); + PostingListIdentifier second_add_id = result2.id; + EXPECT_THAT(second_add_id, Eq(first_add_id)); + + // We should be able to retrieve all data. + std::vector<JoinDataType> all_data_vec; + all_data_vec.reserve(data_vec1.size() + data_vec2.size()); + all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end()); + all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end()); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(second_add_id)); + // This pl_holder will only hold a posting list with the data that didn't fit + // on the first block. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data, + serializer_->GetData(&pl_holder.posting_list)); + ASSERT_THAT(second_block_data, SizeIs(Lt(all_data_vec.size()))); + auto first_block_data_start = + all_data_vec.rbegin() + second_block_data.size(); + EXPECT_THAT(second_block_data, + ElementsAreArray(all_data_vec.rbegin(), first_block_data_start)); + + // Now retrieve all of the data that were on the first block. 
+ uint32_t first_block_id = pl_holder.next_block_index; + EXPECT_THAT(first_block_id, Eq(1)); + + PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0); + ICING_ASSERT_OK_AND_ASSIGN(pl_holder, + flash_index_storage_->GetPostingList(pl_id)); + EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list), + IsOkAndHolds(ElementsAreArray(first_block_data_start, + all_data_vec.rend()))); +} + +TEST_F(PostingListJoinDataAccessorTest, + InvalidDataShouldReturnInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + JoinDataType invalid_data = JoinDataType::GetInvalid(); + EXPECT_THAT(pl_accessor->PrependData(invalid_data), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PostingListJoinDataAccessorTest, + JoinDataNonIncreasingShouldReturnInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + JoinDataType data1( + /*document_id=*/1, + NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/819)); + ICING_ASSERT_OK(pl_accessor->PrependData(data1)); + + JoinDataType data2( + /*document_id=*/1, + NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/818)); + EXPECT_THAT(pl_accessor->PrependData(data2), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + JoinDataType data3(/*document_id=*/1, + NamespaceFingerprintIdentifier(kDefaultNamespaceId - 1, + /*fingerprint=*/820)); + EXPECT_THAT(pl_accessor->PrependData(data3), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + JoinDataType data4(/*document_id=*/0, + NamespaceFingerprintIdentifier(kDefaultNamespaceId + 1, + /*fingerprint=*/820)); + 
EXPECT_THAT(pl_accessor->PrependData(data4), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PostingListJoinDataAccessorTest, + NewPostingListNoDataAddedShouldReturnInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + EXPECT_THAT(result.status, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PostingListJoinDataAccessorTest, + PreexistingPostingListNoDataAddedShouldSucceed) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor1, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), serializer_.get())); + JoinDataType data1( + /*document_id=*/1, + NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/819)); + ICING_ASSERT_OK(pl_accessor1->PrependData(data1)); + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor1).Finalize(); + ICING_ASSERT_OK(result1.status); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor2, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + PostingListAccessor::FinalizeResult result2 = + std::move(*pl_accessor2).Finalize(); + EXPECT_THAT(result2.status, IsOk()); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/join/posting-list-join-data-serializer.h b/icing/join/posting-list-join-data-serializer.h new file mode 100644 index 0000000..9f39dca --- /dev/null +++ b/icing/join/posting-list-join-data-serializer.h @@ -0,0 +1,803 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_ +#define ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_ + +#include <cstdint> +#include <cstring> +#include <limits> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-used.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +// A serializer class to serialize JoinDataType to PostingListUsed. Usually +// JoinDataType is DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>, +// DocumentIdToJoinInfo<TermId>, or DocumentIdToJoinInfo<int64_t>. +// +// REQUIRES: +// - JoinDataType is comparable by operator <. +// - JoinDataType implements is_valid() method. +// - JoinDataType has static method GetInvalid() that returns a JoinDataType +// instance containing invalid data. 
+template <typename JoinDataType> +class PostingListJoinDataSerializer : public PostingListSerializer { + public: + using SpecialDataType = SpecialData<JoinDataType>; + static_assert(sizeof(SpecialDataType) == sizeof(JoinDataType), ""); + + static constexpr uint32_t kSpecialDataSize = + kNumSpecialData * sizeof(SpecialDataType); + + uint32_t GetDataTypeBytes() const override { return sizeof(JoinDataType); } + + uint32_t GetMinPostingListSize() const override { + static constexpr uint32_t kMinPostingListSize = kSpecialDataSize; + static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, + "PostingListIndex must be small enough to fit in a " + "minimum-sized Posting List."); + + return kMinPostingListSize; + } + + uint32_t GetMinPostingListSizeToFit( + const PostingListUsed* posting_list_used) const override; + + uint32_t GetBytesUsed( + const PostingListUsed* posting_list_used) const override; + + void Clear(PostingListUsed* posting_list_used) const override; + + libtextclassifier3::Status MoveFrom(PostingListUsed* dst, + PostingListUsed* src) const override; + + // Prepend a JoinData to the posting list. + // + // RETURNS: + // - INVALID_ARGUMENT if !data.is_valid() or if data is not greater than the + // previously added data. + // - RESOURCE_EXHAUSTED if there is no more room to add data to the posting + // list. + libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used, + const JoinDataType& data) const; + + // Prepend multiple JoinData to the posting list. + // Data should be sorted in ascending order (as defined by the less than + // operator for JoinData) + // If keep_prepended is true, whatever could be prepended is kept, otherwise + // the posting list is reverted and left in its original state. + // + // RETURNS: + // The number of data that have been prepended to the posting list. If + // keep_prepended is false and reverted, then it returns 0. 
+ libtextclassifier3::StatusOr<uint32_t> PrependDataArray( + PostingListUsed* posting_list_used, const JoinDataType* array, + uint32_t num_data, bool keep_prepended) const; + + // Retrieves all data stored in the posting list. + // + // RETURNS: + // - On success, a vector of JoinDataType sorted by the reverse order of + // prepending. + // - INTERNAL_ERROR if the posting list has been corrupted somehow. + libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetData( + const PostingListUsed* posting_list_used) const; + + // Same as GetData but appends data to data_arr_out. + // + // RETURNS: + // - OK on success, and data_arr_out will be appended JoinDataType sorted by + // the reverse order of prepending. + // - INTERNAL_ERROR if the posting list has been corrupted somehow. + libtextclassifier3::Status GetData( + const PostingListUsed* posting_list_used, + std::vector<JoinDataType>* data_arr_out) const; + + // Undo the last num_data data prepended. If num_data > number of data, then + // we clear all data. + // + // RETURNS: + // - OK on success + // - INTERNAL_ERROR if the posting list has been corrupted somehow. + libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used, + uint32_t num_data) const; + + // Helper function to determine if posting list is full. + bool IsFull(const PostingListUsed* posting_list_used) const { + return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && + GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); + } + + private: + // In PostingListJoinDataSerializer, there is no compression, but we still use + // the traditional posting list implementation. 
+ // + // Posting list layout formats: + // + // NOT_FULL + // +-special-data-0--+-special-data-1--+------------+-----------------------+ + // | | | | | + // |data-start-offset| Data::Invalid | 0x00000000 | (compressed) data | + // | | | | | + // +-----------------+-----------------+------------+-----------------------+ + // + // ALMOST_FULL + // +-special-data-0--+-special-data-1--+-----+------------------------------+ + // | | | | | + // | Data::Invalid | 1st data |(pad)| (compressed) data | + // | | | | | + // +-----------------+-----------------+-----+------------------------------+ + // + // FULL + // +-special-data-0--+-special-data-1--+-----+------------------------------+ + // | | | | | + // | 1st data | 2nd data |(pad)| (compressed) data | + // | | | | | + // +-----------------+-----------------+-----+------------------------------+ + // + // The first two uncompressed (special) data also implicitly encode + // information about the size of the compressed data region. + // + // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte + // offset of the start of the compressed data. Thus, the size of the + // compressed data is + // posting_list_used->size_in_bytes() - special_data_0.data_start_offset(). + // + // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region + // starts somewhere between + // [kSpecialDataSize, kSpecialDataSize + sizeof(JoinDataType) - 1] and ends + // at posting_list_used->size_in_bytes() - 1. + // + // EXAMPLE + // JoinDataType = DocumentIdToJoinInfo<int64_t>. Posting list size: 48 bytes + // + // EMPTY! + // +-- byte 0-11 --+---- 12-23 ----+------------ 24-47 -------------+ + // | | | | + // | 48 | Data::Invalid | 0x00000000 | + // | | | | + // +---------------+---------------+--------------------------------+ + // + // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 12, JoinInteger = 5) + // NOT FULL! 
+ // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+ + // | | | | 12 | + // | 36 | Data::Invalid | 0x00000000 | 5 | + // | | | | | + // +---------------+---------------+---------------+---------------+ + // + // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 18, JoinInteger = -2) + // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+ + // | | | 18 | 12 | + // | 24 | Data::Invalid | -2 | 5 | + // | | | | | + // +---------------+---------------+---------------+---------------+ + // + // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 22, JoinInteger = 3) + // ALMOST_FULL! + // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+ + // | | 22 | 18 | 12 | + // | Data::Invalid | 3 | -2 | 5 | + // | | | | | + // +---------------+---------------+---------------+---------------+ + // + // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 27, JoinInteger = 0) + // FULL! + // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+ + // | 27 | 22 | 18 | 12 | + // | 0 | 3 | -2 | 5 | + // | | | | | + // +---------------+---------------+---------------+---------------+ + + // Helpers to determine what state the posting list is in. + bool IsAlmostFull(const PostingListUsed* posting_list_used) const { + return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && + GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); + } + + bool IsEmpty(const PostingListUsed* posting_list_used) const { + return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() == + posting_list_used->size_in_bytes() && + !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); + } + + // Returns false if both special data are invalid or if data start offset + // stored in the special data is less than kSpecialDataSize or greater than + // posting_list_used->size_in_bytes(). Returns true, otherwise. 
+ bool IsPostingListValid(const PostingListUsed* posting_list_used) const; + + // Prepend data to a posting list that is in the ALMOST_FULL state. + // + // RETURNS: + // - OK, if successful + // - INVALID_ARGUMENT if data is not less than the previously added data. + libtextclassifier3::Status PrependDataToAlmostFull( + PostingListUsed* posting_list_used, const JoinDataType& data) const; + + // Prepend data to a posting list that is in the EMPTY state. This will always + // succeed because there are no pre-existing data and no validly constructed + // posting list could fail to fit one data. + void PrependDataToEmpty(PostingListUsed* posting_list_used, + const JoinDataType& data) const; + + // Prepend data to a posting list that is in the NOT_FULL state. + // + // RETURNS: + // - OK, if successful + // - INVALID_ARGUMENT if data is not less than the previously added data. + libtextclassifier3::Status PrependDataToNotFull( + PostingListUsed* posting_list_used, const JoinDataType& data, + uint32_t offset) const; + + // Returns either 0 (FULL state), sizeof(JoinDataType) (ALMOST_FULL state) or + // a byte offset between kSpecialDataSize and + // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state). + uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const; + + // Sets special data 0 to properly reflect what start byte offset is (see + // layout comment for further details). + // + // Returns false if offset > posting_list_used->size_in_bytes() or offset is + // in range (kSpecialDataSize, sizeof(JoinDataType)) or + // (sizeof(JoinDataType), 0). True, otherwise. + bool SetStartByteOffset(PostingListUsed* posting_list_used, + uint32_t offset) const; + + // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out + // or all data in the posting list if the posting list contains less than + // limit number of data. out can be NULL. 
+ // + // NOTE: If called with limit=1, pop=true on a posting list that transitioned + // from NOT_FULL directly to FULL, GetDataInternal will not return the posting + // list to NOT_FULL. Instead it will leave it in a valid state, but it will be + // ALMOST_FULL. + // + // RETURNS: + // - OK on success + // - INTERNAL_ERROR if the posting list has been corrupted somehow. + libtextclassifier3::Status GetDataInternal( + const PostingListUsed* posting_list_used, uint32_t limit, bool pop, + std::vector<JoinDataType>* out) const; + + // Retrieves the value stored in the index-th special data. + // + // REQUIRES: + // 0 <= index < kNumSpecialData. + // + // RETURNS: + // - A valid SpecialData<JoinDataType>. + SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used, + uint32_t index) const; + + // Sets the value stored in the index-th special data to special_data. + // + // REQUIRES: + // 0 <= index < kNumSpecialData. + void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index, + const SpecialDataType& special_data) const; + + // Prepends data to the memory region + // [offset - sizeof(JoinDataType), offset - 1] and + // returns the new beginning of the region. + // + // RETURNS: + // - The new beginning of the padded region, if successful. + // - INVALID_ARGUMENT if data will not fit (uncompressed) between + // [kSpecialDataSize, offset - 1] + libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed( + PostingListUsed* posting_list_used, const JoinDataType& data, + uint32_t offset) const; +}; + +template <typename JoinDataType> +uint32_t PostingListJoinDataSerializer<JoinDataType>::GetBytesUsed( + const PostingListUsed* posting_list_used) const { + // The special data will be included if they represent actual data. If they + // represent the data start offset or the invalid data sentinel, they are not + // included. 
+ return posting_list_used->size_in_bytes() - + GetStartByteOffset(posting_list_used); +} + +template <typename JoinDataType> +uint32_t +PostingListJoinDataSerializer<JoinDataType>::GetMinPostingListSizeToFit( + const PostingListUsed* posting_list_used) const { + if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) { + // If in either the FULL state or ALMOST_FULL state, this posting list *is* + // the minimum size posting list that can fit these data. So just return the + // size of the posting list. + return posting_list_used->size_in_bytes(); + } + + // In NOT_FULL state, BytesUsed contains no special data. The minimum sized + // posting list that would be guaranteed to fit these data would be + // ALMOST_FULL, with kInvalidData in special data 0, the uncompressed data in + // special data 1 and the n compressed data in the compressed region. + // BytesUsed contains one uncompressed data and n compressed data. Therefore, + // fitting these data into a posting list would require BytesUsed plus one + // extra data. + return GetBytesUsed(posting_list_used) + GetDataTypeBytes(); +} + +template <typename JoinDataType> +void PostingListJoinDataSerializer<JoinDataType>::Clear( + PostingListUsed* posting_list_used) const { + // Safe to ignore return value because posting_list_used->size_in_bytes() is + // a valid argument. 
+ SetStartByteOffset(posting_list_used, + /*offset=*/posting_list_used->size_in_bytes()); +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataSerializer<JoinDataType>::MoveFrom( + PostingListUsed* dst, PostingListUsed* src) const { + ICING_RETURN_ERROR_IF_NULL(dst); + ICING_RETURN_ERROR_IF_NULL(src); + if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "src MinPostingListSizeToFit %d must be larger than size %d.", + GetMinPostingListSizeToFit(src), dst->size_in_bytes())); + } + + if (!IsPostingListValid(dst)) { + return absl_ports::FailedPreconditionError( + "Dst posting list is in an invalid state and can't be used!"); + } + if (!IsPostingListValid(src)) { + return absl_ports::InvalidArgumentError( + "Cannot MoveFrom an invalid src posting list!"); + } + + // Pop just enough data that all of src's compressed data fit in + // dst posting_list's compressed area. Then we can memcpy that area. + std::vector<JoinDataType> data_arr; + while (IsFull(src) || IsAlmostFull(src) || + (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) { + if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) { + return absl_ports::AbortedError( + "Unable to retrieve data from src posting list."); + } + } + + // memcpy the area and set up start byte offset. + Clear(dst); + memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src), + src->posting_list_buffer() + GetStartByteOffset(src), + GetBytesUsed(src)); + // Because we popped all data from src outside of the compressed area and we + // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() - + // kSpecialDataSize. This is guaranteed to be a valid byte offset for the + // NOT_FULL state, so ignoring the value is safe. + SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src)); + + // Put back remaining data. 
+ for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) { + // PrependData may return: + // - INVALID_ARGUMENT: if data is invalid or not less than the previous data + // - RESOURCE_EXHAUSTED + // RESOURCE_EXHAUSTED should be impossible because we've already assured + // that there is enough room above. + ICING_RETURN_IF_ERROR(PrependData(dst, *riter)); + } + + Clear(src); + return libtextclassifier3::Status::OK; +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataSerializer<JoinDataType>::PrependDataToAlmostFull( + PostingListUsed* posting_list_used, const JoinDataType& data) const { + SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1); + if (data < special_data.data()) { + return absl_ports::InvalidArgumentError( + "JoinData being prepended must not be smaller than the most recent " + "JoinData"); + } + + // Without compression, prepend a new data into ALMOST_FULL posting list will + // change the posting list to FULL state. Therefore, set special data 0 + // directly. + SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data)); + return libtextclassifier3::Status::OK; +} + +template <typename JoinDataType> +void PostingListJoinDataSerializer<JoinDataType>::PrependDataToEmpty( + PostingListUsed* posting_list_used, const JoinDataType& data) const { + // First data to be added. Just add verbatim, no compression. + if (posting_list_used->size_in_bytes() == kSpecialDataSize) { + // First data will be stored at special data 1. + // Safe to ignore the return value because 1 < kNumSpecialData + SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data)); + // Safe to ignore the return value because sizeof(JoinDataType) is a valid + // argument. 
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType)); + } else { + // Since this is the first data, size != kSpecialDataSize and + // size % sizeof(JoinDataType) == 0, we know that there is room to fit + // 'data' into the compressed region, so ValueOrDie is safe. + uint32_t offset = + PrependDataUncompressed(posting_list_used, data, + /*offset=*/posting_list_used->size_in_bytes()) + .ValueOrDie(); + // Safe to ignore the return value because PrependDataUncompressed is + // guaranteed to return a valid offset. + SetStartByteOffset(posting_list_used, offset); + } +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataSerializer<JoinDataType>::PrependDataToNotFull( + PostingListUsed* posting_list_used, const JoinDataType& data, + uint32_t offset) const { + JoinDataType curr = JoinDataType::GetInvalid(); + memcpy(&curr, posting_list_used->posting_list_buffer() + offset, + sizeof(JoinDataType)); + if (data < curr) { + return absl_ports::InvalidArgumentError( + "JoinData being prepended must not be smaller than the most recent " + "JoinData"); + } + + if (offset >= kSpecialDataSize + sizeof(JoinDataType)) { + offset = + PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie(); + SetStartByteOffset(posting_list_used, offset); + } else { + // The new data must be put in special data 1. + SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data)); + // State ALMOST_FULL. Safe to ignore the return value because + // sizeof(JoinDataType) is a valid argument. 
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType)); + } + return libtextclassifier3::Status::OK; +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataSerializer<JoinDataType>::PrependData( + PostingListUsed* posting_list_used, const JoinDataType& data) const { + if (!data.is_valid()) { + return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!"); + } + if (!IsPostingListValid(posting_list_used)) { + return absl_ports::FailedPreconditionError( + "This PostingListUsed is in an invalid state and can't add any data!"); + } + + if (IsFull(posting_list_used)) { + // State FULL: no space left. + return absl_ports::ResourceExhaustedError("No more room for data"); + } else if (IsAlmostFull(posting_list_used)) { + return PrependDataToAlmostFull(posting_list_used, data); + } else if (IsEmpty(posting_list_used)) { + PrependDataToEmpty(posting_list_used, data); + return libtextclassifier3::Status::OK; + } else { + uint32_t offset = GetStartByteOffset(posting_list_used); + return PrependDataToNotFull(posting_list_used, data, offset); + } +} + +template <typename JoinDataType> +libtextclassifier3::StatusOr<uint32_t> +PostingListJoinDataSerializer<JoinDataType>::PrependDataArray( + PostingListUsed* posting_list_used, const JoinDataType* array, + uint32_t num_data, bool keep_prepended) const { + if (!IsPostingListValid(posting_list_used)) { + return 0; + } + + uint32_t i; + for (i = 0; i < num_data; ++i) { + if (!PrependData(posting_list_used, array[i]).ok()) { + break; + } + } + if (i != num_data && !keep_prepended) { + // Didn't fit. Undo everything and check that we have the same offset as + // before. PopFrontData guarantees that it will remove all 'i' data so long + // as there are at least 'i' data in the posting list, which we know there + // are. 
+ ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i)); + return 0; + } + return i; +} + +template <typename JoinDataType> +libtextclassifier3::StatusOr<std::vector<JoinDataType>> +PostingListJoinDataSerializer<JoinDataType>::GetData( + const PostingListUsed* posting_list_used) const { + std::vector<JoinDataType> data_arr_out; + ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out)); + return data_arr_out; +} + +template <typename JoinDataType> +libtextclassifier3::Status PostingListJoinDataSerializer<JoinDataType>::GetData( + const PostingListUsed* posting_list_used, + std::vector<JoinDataType>* data_arr_out) const { + return GetDataInternal(posting_list_used, + /*limit=*/std::numeric_limits<uint32_t>::max(), + /*pop=*/false, data_arr_out); +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataSerializer<JoinDataType>::PopFrontData( + PostingListUsed* posting_list_used, uint32_t num_data) const { + if (num_data == 1 && IsFull(posting_list_used)) { + // The PL is in FULL state which means that we save 2 uncompressed data in + // the 2 special postions. But FULL state may be reached by 2 different + // states. 
+ // (1) In ALMOST_FULL state + // +------------------+-----------------+-----+---------------------------+ + // |Data::Invalid |1st data |(pad)|(compressed) data | + // | | | | | + // +------------------+-----------------+-----+---------------------------+ + // When we prepend another data, we can only put it at special data 0, and + // thus get a FULL PL + // +------------------+-----------------+-----+---------------------------+ + // |new 1st data |original 1st data|(pad)|(compressed) data | + // | | | | | + // +------------------+-----------------+-----+---------------------------+ + // + // (2) In NOT_FULL state + // +------------------+-----------------+-------+---------+---------------+ + // |data-start-offset |Data::Invalid |(pad) |1st data |(compressed) | + // | | | | |data | + // +------------------+-----------------+-------+---------+---------------+ + // When we prepend another data, we can reach any of the 3 following + // scenarios: + // (2.1) NOT_FULL + // if the space of pad and original 1st data can accommodate the new 1st + // data and the encoded delta value. + // +------------------+-----------------+-----+--------+------------------+ + // |data-start-offset |Data::Invalid |(pad)|new |(compressed) data | + // | | | |1st data| | + // +------------------+-----------------+-----+--------+------------------+ + // (2.2) ALMOST_FULL + // If the space of pad and original 1st data cannot accommodate the new 1st + // data and the encoded delta value but can accommodate the encoded delta + // value only. We can put the new 1st data at special position 1. + // +------------------+-----------------+---------+-----------------------+ + // |Data::Invalid |new 1st data |(pad) |(compressed) data | + // | | | | | + // +------------------+-----------------+---------+-----------------------+ + // (2.3) FULL + // In very rare case, it cannot even accommodate only the encoded delta + // value. 
we can move the original 1st data into special position 1 and the + // new 1st data into special position 0. This may happen because we use + // VarInt encoding method which may make the encoded value longer (about + // 4/3 times of original) + // +------------------+-----------------+--------------+------------------+ + // |new 1st data |original 1st data|(pad) |(compressed) data | + // | | | | | + // +------------------+-----------------+--------------+------------------+ + // + // Suppose now the PL is in FULL state. But we don't know whether it arrived + // this state from NOT_FULL (like (2.3)) or from ALMOST_FULL (like (1)). + // We'll return to ALMOST_FULL state like (1) if we simply pop the new 1st + // data, but we want to make the prepending operation "reversible". So + // there should be some way to return to NOT_FULL if possible. A simple way + // to do is: + // - Pop 2 data out of the PL to state ALMOST_FULL or NOT_FULL. + // - Add the second data ("original 1st data") back. + // + // Then we can return to the correct original states of (2.1) or (1). This + // makes our prepending operation reversible. + std::vector<JoinDataType> out; + + // Popping 2 data should never fail because we've just ensured that the + // posting list is in the FULL state. + ICING_RETURN_IF_ERROR( + GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out)); + + // PrependData should never fail because: + // - out[1] is a valid data less than all previous data in the posting list. + // - There's no way that the posting list could run out of room because it + // previously stored these 2 data. 
+ ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1])); + } else if (num_data > 0) { + return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true, + /*out=*/nullptr); + } + return libtextclassifier3::Status::OK; +} + +template <typename JoinDataType> +libtextclassifier3::Status +PostingListJoinDataSerializer<JoinDataType>::GetDataInternal( + const PostingListUsed* posting_list_used, uint32_t limit, bool pop, + std::vector<JoinDataType>* out) const { + uint32_t offset = GetStartByteOffset(posting_list_used); + uint32_t count = 0; + + // First traverse the first two special positions. + while (count < limit && offset < kSpecialDataSize) { + // offset / sizeof(JoinDataType) < kNumSpecialData + // because of the check above. + SpecialDataType special_data = GetSpecialData( + posting_list_used, /*index=*/offset / sizeof(JoinDataType)); + if (out != nullptr) { + out->push_back(special_data.data()); + } + offset += sizeof(JoinDataType); + ++count; + } + + // - We don't compress the data. + // - The posting list size is a multiple of data type bytes. + // So offset of the first non-special data is guaranteed to be at + // kSpecialDataSize if in ALMOST_FULL or FULL state. In fact, we must not + // apply padding skipping logic here when still storing uncompressed data, + // because in this case 0 bytes are meanful (e.g. inverted doc id byte = 0). + while (count < limit && offset < posting_list_used->size_in_bytes()) { + JoinDataType data = JoinDataType::GetInvalid(); + memcpy(&data, posting_list_used->posting_list_buffer() + offset, + sizeof(JoinDataType)); + offset += sizeof(JoinDataType); + if (out != nullptr) { + out->push_back(data); + } + ++count; + } + + if (pop) { + PostingListUsed* mutable_posting_list_used = + const_cast<PostingListUsed*>(posting_list_used); + // Modify the posting list so that we pop all data actually traversed. 
+ if (offset >= kSpecialDataSize && + offset < posting_list_used->size_in_bytes()) { + memset( + mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize, + 0, offset - kSpecialDataSize); + } + SetStartByteOffset(mutable_posting_list_used, offset); + } + + return libtextclassifier3::Status::OK; +} + +template <typename JoinDataType> +typename PostingListJoinDataSerializer<JoinDataType>::SpecialDataType +PostingListJoinDataSerializer<JoinDataType>::GetSpecialData( + const PostingListUsed* posting_list_used, uint32_t index) const { + // It is ok to temporarily construct a SpecialData with offset = 0 since we're + // going to overwrite it by memcpy. + SpecialDataType special_data(0); + memcpy(&special_data, + posting_list_used->posting_list_buffer() + + index * sizeof(SpecialDataType), + sizeof(SpecialDataType)); + return special_data; +} + +template <typename JoinDataType> +void PostingListJoinDataSerializer<JoinDataType>::SetSpecialData( + PostingListUsed* posting_list_used, uint32_t index, + const SpecialDataType& special_data) const { + memcpy(posting_list_used->posting_list_buffer() + + index * sizeof(SpecialDataType), + &special_data, sizeof(SpecialDataType)); +} + +template <typename JoinDataType> +bool PostingListJoinDataSerializer<JoinDataType>::IsPostingListValid( + const PostingListUsed* posting_list_used) const { + if (IsAlmostFull(posting_list_used)) { + // Special data 1 should hold a valid data. + if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) { + ICING_LOG(ERROR) + << "Both special data cannot be invalid at the same time."; + return false; + } + } else if (!IsFull(posting_list_used)) { + // NOT_FULL. Special data 0 should hold a valid offset. 
+ SpecialDataType special_data = + GetSpecialData(posting_list_used, /*index=*/0); + if (special_data.data_start_offset() > posting_list_used->size_in_bytes() || + special_data.data_start_offset() < kSpecialDataSize) { + ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset() + << " size: " << posting_list_used->size_in_bytes() + << " sp size: " << kSpecialDataSize; + return false; + } + } + return true; +} + +template <typename JoinDataType> +uint32_t PostingListJoinDataSerializer<JoinDataType>::GetStartByteOffset( + const PostingListUsed* posting_list_used) const { + if (IsFull(posting_list_used)) { + return 0; + } else if (IsAlmostFull(posting_list_used)) { + return sizeof(JoinDataType); + } else { + return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset(); + } +} + +template <typename JoinDataType> +bool PostingListJoinDataSerializer<JoinDataType>::SetStartByteOffset( + PostingListUsed* posting_list_used, uint32_t offset) const { + if (offset > posting_list_used->size_in_bytes()) { + ICING_LOG(ERROR) << "offset cannot be a value greater than size " + << posting_list_used->size_in_bytes() << ". offset is " + << offset << "."; + return false; + } + if (offset < kSpecialDataSize && offset > sizeof(JoinDataType)) { + ICING_LOG(ERROR) << "offset cannot be a value between (" + << sizeof(JoinDataType) << ", " << kSpecialDataSize + << "). offset is " << offset << "."; + return false; + } + if (offset < sizeof(JoinDataType) && offset != 0) { + ICING_LOG(ERROR) << "offset cannot be a value between (0, " + << sizeof(JoinDataType) << "). offset is " << offset + << "."; + return false; + } + + if (offset >= kSpecialDataSize) { + // NOT_FULL state. + SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset)); + SetSpecialData(posting_list_used, /*index=*/1, + SpecialDataType(JoinDataType::GetInvalid())); + } else if (offset == sizeof(JoinDataType)) { + // ALMOST_FULL state. 
+ SetSpecialData(posting_list_used, /*index=*/0, + SpecialDataType(JoinDataType::GetInvalid())); + } + // Nothing to do for the FULL state - the offset isn't actually stored + // anywhere and both 2 special data hold valid data. + return true; +} + +template <typename JoinDataType> +libtextclassifier3::StatusOr<uint32_t> +PostingListJoinDataSerializer<JoinDataType>::PrependDataUncompressed( + PostingListUsed* posting_list_used, const JoinDataType& data, + uint32_t offset) const { + if (offset < kSpecialDataSize + sizeof(JoinDataType)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Not enough room to prepend JoinData at offset %d.", offset)); + } + offset -= sizeof(JoinDataType); + memcpy(posting_list_used->posting_list_buffer() + offset, &data, + sizeof(JoinDataType)); + return offset; +} + +} // namespace lib +} // namespace icing + +#endif // ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_ diff --git a/icing/join/posting-list-join-data-serializer_test.cc b/icing/join/posting-list-join-data-serializer_test.cc new file mode 100644 index 0000000..20137b6 --- /dev/null +++ b/icing/join/posting-list-join-data-serializer_test.cc @@ -0,0 +1,653 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/join/posting-list-join-data-serializer.h" + +#include <algorithm> +#include <iterator> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/posting_list/posting-list-used.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/testing/common-matchers.h" + +using testing::ElementsAre; +using testing::ElementsAreArray; +using testing::Eq; +using testing::IsEmpty; +using testing::SizeIs; + +namespace icing { +namespace lib { + +namespace { + +TEST(PostingListJoinDataSerializerTest, GetMinPostingListSizeToFitNotNull) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = + 2551 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2))), + IsOk()); + EXPECT_THAT( + serializer.GetMinPostingListSizeToFit(&pl_used), + Eq(2 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5))), + IsOk()); + EXPECT_THAT( + serializer.GetMinPostingListSizeToFit(&pl_used), + Eq(3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); +} + +TEST(PostingListJoinDataSerializerTest, GetMinPostingListSizeToFitAlmostFull) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 3 * 
sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2))), + IsOk()); + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5))), + IsOk()); + EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size)); +} + +TEST(PostingListJoinDataSerializerTest, GetMinPostingListSizeToFitFull) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2))), + IsOk()); + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5))), + IsOk()); + ASSERT_THAT( + serializer.PrependData( + &pl_used, + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/10))), + IsOk()); + EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size)); +} + +TEST(PostingListJoinDataSerializerTest, PrependDataNotFull) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 
+ 2551 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + // Make used. + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data0( + /*document_id=*/0, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)); + EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk()); + // Size = sizeof(uncompressed data0) + int expected_size = + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(ElementsAre(data0))); + + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data1( + /*document_id=*/1, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5)); + EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk()); + // Size = sizeof(uncompressed data1) + // + sizeof(uncompressed data0) + expected_size += sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAre(data1, data0))); + + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data2( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/10)); + EXPECT_THAT(serializer.PrependData(&pl_used, data2), IsOk()); + // Size = sizeof(uncompressed data2) + // + sizeof(uncompressed data1) + // + sizeof(uncompressed data0) + expected_size += sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAre(data2, data1, data0))); +} + +TEST(PostingListJoinDataSerializerTest, PrependDataAlmostFull) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + 
int size = 4 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + // Fill up the compressed region. + // Transitions: + // Adding data0: EMPTY -> NOT_FULL + // Adding data1: NOT_FULL -> NOT_FULL + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data0( + /*document_id=*/0, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)); + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data1( + /*document_id=*/1, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5)); + EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk()); + EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk()); + int expected_size = + 2 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAre(data1, data0))); + + // Add one more data to transition NOT_FULL -> ALMOST_FULL + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data2( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/10)); + EXPECT_THAT(serializer.PrependData(&pl_used, data2), IsOk()); + expected_size = + 3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAre(data2, data1, data0))); + + // Add one more data to transition ALMOST_FULL -> FULL + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data3( + /*document_id=*/3, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/0)); + EXPECT_THAT(serializer.PrependData(&pl_used, data3), IsOk()); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(size)); + EXPECT_THAT(serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAre(data3, data2, data1, 
data0))); + + // The posting list is FULL. Adding another data should fail. + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data4( + /*document_id=*/4, NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/1234)); + EXPECT_THAT(serializer.PrependData(&pl_used, data4), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); +} + +TEST(PostingListJoinDataSerializerTest, PrependSmallerDataShouldFail) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 4 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data( + /*document_id=*/100, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)); + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> smaller_data( + /*document_id=*/99, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)); + + // NOT_FULL -> NOT_FULL + ASSERT_THAT(serializer.PrependData(&pl_used, data), IsOk()); + EXPECT_THAT(serializer.PrependData(&pl_used, smaller_data), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // NOT_FULL -> ALMOST_FULL + ASSERT_THAT(serializer.PrependData(&pl_used, data), IsOk()); + EXPECT_THAT(serializer.PrependData(&pl_used, smaller_data), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // ALMOST_FULL -> FULL + ASSERT_THAT(serializer.PrependData(&pl_used, data), IsOk()); + EXPECT_THAT(serializer.PrependData(&pl_used, smaller_data), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(PostingListJoinDataSerializerTest, PrependDataPostingListUsedMinSize) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + 
PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + // PL State: EMPTY + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); + EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(IsEmpty())); + + // Add a data. PL should shift to ALMOST_FULL state + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data0( + /*document_id=*/0, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)); + EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk()); + // Size = sizeof(uncompressed data0) + int expected_size = + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(ElementsAre(data0))); + + // Add another data. PL should shift to FULL state. + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data1( + /*document_id=*/1, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5)); + EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk()); + // Size = sizeof(uncompressed data1) + sizeof(uncompressed data0) + expected_size += sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAre(data1, data0))); + + // The posting list is FULL. Adding another data should fail. 
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data2( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/10)); + EXPECT_THAT(serializer.PrependData(&pl_used, data2), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); +} + +TEST(PostingListJoinDataSerializerTest, PrependDataArrayDoNotKeepPrepended) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 6 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_in; + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_pushed; + + // Add 3 data. The PL is in the empty state and should be able to fit all 3 + // data without issue, transitioning the PL from EMPTY -> NOT_FULL. + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/10))); + EXPECT_THAT( + serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_in.size())); + std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), + Eq(data_pushed.size() * + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); + + // Add 2 
data. The PL should transition from NOT_FULL to ALMOST_FULL. + data_in.clear(); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/3, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/0))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/4, NamespaceFingerprintIdentifier(/*namespace_id=*/0, + /*fingerprint=*/1234))); + EXPECT_THAT( + serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_in.size())); + std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), + Eq(data_pushed.size() * + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); + + // Add 2 data. The PL should remain ALMOST_FULL since the remaining space can + // only fit 1 data. + data_in.clear(); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, NamespaceFingerprintIdentifier(/*namespace_id=*/2, + /*fingerprint=*/99))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/6, NamespaceFingerprintIdentifier(/*namespace_id=*/1, + /*fingerprint=*/63))); + EXPECT_THAT( + serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), + /*keep_prepended=*/false), + IsOkAndHolds(0)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), + Eq(data_pushed.size() * + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); + + // Add 1 data. The PL should transition from ALMOST_FULL to FULL. 
+ data_in.pop_back(); + ASSERT_THAT(data_in, SizeIs(1)); + EXPECT_THAT( + serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_in.size())); + std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), + Eq(data_pushed.size() * + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); +} + +TEST(PostingListJoinDataSerializerTest, PrependDataArrayKeepPrepended) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 6 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_in; + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_pushed; + + // Add 3 data. The PL is in the empty state and should be able to fit all 3 + // data without issue, transitioning the PL from EMPTY -> NOT_FULL. 
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/10))); + EXPECT_THAT( + serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), + /*keep_prepended=*/true), + IsOkAndHolds(data_in.size())); + std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), + Eq(data_pushed.size() * + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); + + // Add 4 data. The PL should prepend 3 data and transition from NOT_FULL to + // FULL. 
+ data_in.clear(); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/3, + NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/0))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/4, NamespaceFingerprintIdentifier(/*namespace_id=*/0, + /*fingerprint=*/1234))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, NamespaceFingerprintIdentifier(/*namespace_id=*/2, + /*fingerprint=*/99))); + data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/6, NamespaceFingerprintIdentifier(/*namespace_id=*/1, + /*fingerprint=*/63))); + EXPECT_THAT( + serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(), + /*keep_prepended=*/true), + IsOkAndHolds(3)); + data_in.pop_back(); + ASSERT_THAT(data_in, SizeIs(3)); + std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), + Eq(data_pushed.size() * + sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>))); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); +} + +TEST(PostingListJoinDataSerializerTest, MoveFrom) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 3 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr1 = + {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5))}; + ASSERT_THAT( + 
serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_arr1.size())); + + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used2, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr2 = + {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/10)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/3, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/0)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/4, + NamespaceFingerprintIdentifier(/*namespace_id=*/0, + /*fingerprint=*/1234)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, + NamespaceFingerprintIdentifier(/*namespace_id=*/2, + /*fingerprint=*/99))}; + ASSERT_THAT( + serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_arr2.size())); + + EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1), + IsOk()); + EXPECT_THAT( + serializer.GetData(&pl_used2), + IsOkAndHolds(ElementsAreArray(data_arr1.rbegin(), data_arr1.rend()))); + EXPECT_THAT(serializer.GetData(&pl_used1), IsOkAndHolds(IsEmpty())); +} + +TEST(PostingListJoinDataSerializerTest, MoveToNullReturnsFailedPrecondition) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 3 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr = { + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2)), 
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5))}; + ASSERT_THAT( + serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_arr.size())); + + EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used, /*src=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend()))); + + EXPECT_THAT(serializer.MoveFrom(/*dst=*/nullptr, /*src=*/&pl_used), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend()))); +} + +TEST(PostingListJoinDataSerializerTest, MoveToPostingListTooSmall) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size1 = 3 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size1)); + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr1 = + {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/10)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/3, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/0)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/4, + NamespaceFingerprintIdentifier(/*namespace_id=*/0, + 
/*fingerprint=*/1234))}; + ASSERT_THAT( + serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_arr1.size())); + + int size2 = serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used2, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size2)); + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr2 = + {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, NamespaceFingerprintIdentifier( + /*namespace_id=*/2, /*fingerprint=*/99))}; + ASSERT_THAT( + serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_arr2.size())); + + EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + serializer.GetData(&pl_used1), + IsOkAndHolds(ElementsAreArray(data_arr1.rbegin(), data_arr1.rend()))); + EXPECT_THAT( + serializer.GetData(&pl_used2), + IsOkAndHolds(ElementsAreArray(data_arr2.rbegin(), data_arr2.rend()))); +} + +TEST(PostingListJoinDataSerializerTest, PopFrontData) { + PostingListJoinDataSerializer< + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> + serializer; + + int size = 2 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + + std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr = { + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/2)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/5)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, NamespaceFingerprintIdentifier( + /*namespace_id=*/1, 
/*fingerprint=*/10))}; + ASSERT_THAT( + serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(), + /*keep_prepended=*/false), + IsOkAndHolds(data_arr.size())); + ASSERT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend()))); + + // Now, pop the last data. The posting list should contain the first three + // data. + EXPECT_THAT(serializer.PopFrontData(&pl_used, /*num_data=*/1), IsOk()); + data_arr.pop_back(); + EXPECT_THAT( + serializer.GetData(&pl_used), + IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend()))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/join/qualified-id-type-joinable-index.cc b/icing/join/qualified-id-join-index-impl-v1.cc index a1df3d0..cdcb5a9 100644 --- a/icing/join/qualified-id-type-joinable-index.cc +++ b/icing/join/qualified-id-join-index-impl-v1.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id-join-index-impl-v1.h" #include <cstring> #include <memory> @@ -29,9 +29,11 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/join/doc-join-info.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/store/document-id.h" #include "icing/store/dynamic-trie-key-mapper.h" #include "icing/store/key-mapper.h" +#include "icing/store/namespace-id.h" #include "icing/store/persistent-hash-map-key-mapper.h" #include "icing/util/crc32.h" #include "icing/util/encode-util.h" @@ -43,6 +45,11 @@ namespace lib { namespace { +// Set 1M for max # of qualified id entries and 10 bytes for key-value bytes. +// This will take at most 23 MiB disk space and mmap for persistent hash map. 
+static constexpr int32_t kDocJoinInfoMapperMaxNumEntries = 1 << 20; +static constexpr int32_t kDocJoinInfoMapperAverageKVByteSize = 10; + static constexpr int32_t kDocJoinInfoMapperDynamicTrieMaxSize = 128 * 1024 * 1024; // 128 MiB @@ -70,18 +77,19 @@ std::string GetQualifiedIdStoragePath(std::string_view working_path) { } // namespace /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableIndex>> -QualifiedIdTypeJoinableIndex::Create(const Filesystem& filesystem, - std::string working_path, - bool pre_mapping_fbv, - bool use_persistent_hash_map) { + std::unique_ptr<QualifiedIdJoinIndexImplV1>> +QualifiedIdJoinIndexImplV1::Create(const Filesystem& filesystem, + std::string working_path, + bool pre_mapping_fbv, + bool use_persistent_hash_map) { if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) || !filesystem.DirectoryExists( GetDocJoinInfoMapperPath(working_path).c_str()) || !filesystem.FileExists(GetQualifiedIdStoragePath(working_path).c_str())) { // Discard working_path if any file/directory is missing, and reinitialize. 
if (filesystem.DirectoryExists(working_path.c_str())) { - ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); + ICING_RETURN_IF_ERROR( + QualifiedIdJoinIndex::Discard(filesystem, working_path)); } return InitializeNewFiles(filesystem, std::move(working_path), pre_mapping_fbv, use_persistent_hash_map); @@ -90,7 +98,7 @@ QualifiedIdTypeJoinableIndex::Create(const Filesystem& filesystem, pre_mapping_fbv, use_persistent_hash_map); } -QualifiedIdTypeJoinableIndex::~QualifiedIdTypeJoinableIndex() { +QualifiedIdJoinIndexImplV1::~QualifiedIdJoinIndexImplV1() { if (!PersistToDisk().ok()) { ICING_LOG(WARNING) << "Failed to persist qualified id type joinable index " "to disk while destructing " @@ -98,8 +106,10 @@ QualifiedIdTypeJoinableIndex::~QualifiedIdTypeJoinableIndex() { } } -libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Put( +libtextclassifier3::Status QualifiedIdJoinIndexImplV1::Put( const DocJoinInfo& doc_join_info, std::string_view ref_qualified_id_str) { + SetDirty(); + if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( "Cannot put data for an invalid DocJoinInfo"); @@ -123,8 +133,8 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Put( return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<std::string_view> -QualifiedIdTypeJoinableIndex::Get(const DocJoinInfo& doc_join_info) const { +libtextclassifier3::StatusOr<std::string_view> QualifiedIdJoinIndexImplV1::Get( + const DocJoinInfo& doc_join_info) const { if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( "Cannot get data for an invalid DocJoinInfo"); @@ -139,11 +149,13 @@ QualifiedIdTypeJoinableIndex::Get(const DocJoinInfo& doc_join_info) const { return std::string_view(data, strlen(data)); } -libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( +libtextclassifier3::Status QualifiedIdJoinIndexImplV1::Optimize( const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& 
namespace_id_old_to_new, DocumentId new_last_added_document_id) { std::string temp_working_path = working_path_ + "_temp"; - ICING_RETURN_IF_ERROR(Discard(filesystem_, temp_working_path)); + ICING_RETURN_IF_ERROR( + QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path)); DestructibleDirectory temp_working_path_ddir(&filesystem_, std::move(temp_working_path)); @@ -158,7 +170,7 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( // index. Also PersistToDisk and destruct the instance after finishing, so // we can safely swap directories later. ICING_ASSIGN_OR_RETURN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> new_index, + std::unique_ptr<QualifiedIdJoinIndexImplV1> new_index, Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_, use_persistent_hash_map_)); ICING_RETURN_IF_ERROR( @@ -190,7 +202,9 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( doc_join_info_mapper_, PersistentHashMapKeyMapper<int32_t>::Create( filesystem_, GetDocJoinInfoMapperPath(working_path_), - pre_mapping_fbv_)); + pre_mapping_fbv_, + /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries, + /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize)); } else { ICING_ASSIGN_OR_RETURN( doc_join_info_mapper_, @@ -210,7 +224,9 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Clear() { +libtextclassifier3::Status QualifiedIdJoinIndexImplV1::Clear() { + SetDirty(); + doc_join_info_mapper_.reset(); // Discard and reinitialize doc join info mapper. 
std::string doc_join_info_mapper_path = @@ -221,8 +237,9 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Clear() { ICING_ASSIGN_OR_RETURN( doc_join_info_mapper_, PersistentHashMapKeyMapper<int32_t>::Create( - filesystem_, std::move(doc_join_info_mapper_path), - pre_mapping_fbv_)); + filesystem_, std::move(doc_join_info_mapper_path), pre_mapping_fbv_, + /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries, + /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize)); } else { ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<int32_t>::Delete( filesystem_, doc_join_info_mapper_path)); @@ -244,11 +261,11 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Clear() { } /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableIndex>> -QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem, - std::string&& working_path, - bool pre_mapping_fbv, - bool use_persistent_hash_map) { + std::unique_ptr<QualifiedIdJoinIndexImplV1>> +QualifiedIdJoinIndexImplV1::InitializeNewFiles(const Filesystem& filesystem, + std::string&& working_path, + bool pre_mapping_fbv, + bool use_persistent_hash_map) { // Create working directory. if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) { return absl_ports::InternalError( @@ -262,8 +279,9 @@ QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem, ICING_ASSIGN_OR_RETURN( doc_join_info_mapper, PersistentHashMapKeyMapper<int32_t>::Create( - filesystem, GetDocJoinInfoMapperPath(working_path), - pre_mapping_fbv)); + filesystem, GetDocJoinInfoMapperPath(working_path), pre_mapping_fbv, + /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries, + /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize)); } else { ICING_ASSIGN_OR_RETURN( doc_join_info_mapper, @@ -282,8 +300,8 @@ QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem, /*pre_mapping_mmap_size=*/pre_mapping_fbv ? 1024 * 1024 : 0)); // Create instance. 
- auto new_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>( - new QualifiedIdTypeJoinableIndex( + auto new_index = std::unique_ptr<QualifiedIdJoinIndexImplV1>( + new QualifiedIdJoinIndexImplV1( filesystem, std::move(working_path), /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize), std::move(doc_join_info_mapper), std::move(qualified_id_storage), @@ -299,8 +317,8 @@ QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem, } /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableIndex>> -QualifiedIdTypeJoinableIndex::InitializeExistingFiles( + std::unique_ptr<QualifiedIdJoinIndexImplV1>> +QualifiedIdJoinIndexImplV1::InitializeExistingFiles( const Filesystem& filesystem, std::string&& working_path, bool pre_mapping_fbv, bool use_persistent_hash_map) { // PRead metadata file. @@ -328,8 +346,9 @@ QualifiedIdTypeJoinableIndex::InitializeExistingFiles( ICING_ASSIGN_OR_RETURN( doc_join_info_mapper, PersistentHashMapKeyMapper<int32_t>::Create( - filesystem, GetDocJoinInfoMapperPath(working_path), - pre_mapping_fbv)); + filesystem, GetDocJoinInfoMapperPath(working_path), pre_mapping_fbv, + /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries, + /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize)); } else { ICING_ASSIGN_OR_RETURN( doc_join_info_mapper, @@ -348,8 +367,8 @@ QualifiedIdTypeJoinableIndex::InitializeExistingFiles( /*pre_mapping_mmap_size=*/pre_mapping_fbv ? 1024 * 1024 : 0)); // Create instance. 
- auto type_joinable_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>( - new QualifiedIdTypeJoinableIndex( + auto type_joinable_index = std::unique_ptr<QualifiedIdJoinIndexImplV1>( + new QualifiedIdJoinIndexImplV1( filesystem, std::move(working_path), std::move(metadata_buffer), std::move(doc_join_info_mapper), std::move(qualified_id_storage), pre_mapping_fbv, use_persistent_hash_map)); @@ -364,9 +383,9 @@ QualifiedIdTypeJoinableIndex::InitializeExistingFiles( return type_joinable_index; } -libtextclassifier3::Status QualifiedIdTypeJoinableIndex::TransferIndex( +libtextclassifier3::Status QualifiedIdJoinIndexImplV1::TransferIndex( const std::vector<DocumentId>& document_id_old_to_new, - QualifiedIdTypeJoinableIndex* new_index) const { + QualifiedIdJoinIndexImplV1* new_index) const { std::unique_ptr<KeyMapper<int32_t>::Iterator> iter = doc_join_info_mapper_->GetIterator(); while (iter->Advance()) { @@ -394,8 +413,12 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::TransferIndex( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status -QualifiedIdTypeJoinableIndex::PersistMetadataToDisk() { +libtextclassifier3::Status QualifiedIdJoinIndexImplV1::PersistMetadataToDisk( + bool force) { + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + std::string metadata_file_path = GetMetadataFilePath(working_path_); ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); @@ -415,20 +438,32 @@ QualifiedIdTypeJoinableIndex::PersistMetadataToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status -QualifiedIdTypeJoinableIndex::PersistStoragesToDisk() { +libtextclassifier3::Status QualifiedIdJoinIndexImplV1::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + ICING_RETURN_IF_ERROR(doc_join_info_mapper_->PersistToDisk()); ICING_RETURN_IF_ERROR(qualified_id_storage_->PersistToDisk()); return 
libtextclassifier3::Status::OK; } libtextclassifier3::StatusOr<Crc32> -QualifiedIdTypeJoinableIndex::ComputeInfoChecksum() { +QualifiedIdJoinIndexImplV1::ComputeInfoChecksum(bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } libtextclassifier3::StatusOr<Crc32> -QualifiedIdTypeJoinableIndex::ComputeStoragesChecksum() { +QualifiedIdJoinIndexImplV1::ComputeStoragesChecksum(bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + ICING_ASSIGN_OR_RETURN(Crc32 doc_join_info_mapper_crc, doc_join_info_mapper_->ComputeChecksum()); ICING_ASSIGN_OR_RETURN(Crc32 qualified_id_storage_crc, diff --git a/icing/join/qualified-id-type-joinable-index.h b/icing/join/qualified-id-join-index-impl-v1.h index 4844433..9314602 100644 --- a/icing/join/qualified-id-type-joinable-index.h +++ b/icing/join/qualified-id-join-index-impl-v1.h @@ -12,31 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_ -#define ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_ +#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_ +#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_ #include <cstdint> #include <memory> #include <string> #include <string_view> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" #include "icing/join/doc-join-info.h" +#include "icing/join/qualified-id-join-index.h" +#include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" #include "icing/store/key-mapper.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" #include "icing/util/crc32.h" namespace icing { namespace lib { -// QualifiedIdTypeJoinableIndex: a class to maintain data mapping DocJoinInfo to +// QualifiedIdJoinIndexImplV1: a class to maintain data mapping DocJoinInfo to // joinable qualified ids and delete propagation info. -class QualifiedIdTypeJoinableIndex : public PersistentStorage { +class QualifiedIdJoinIndexImplV1 : public QualifiedIdJoinIndex { public: struct Info { static constexpr int32_t kMagic = 0x48cabdc6; @@ -58,17 +65,14 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); static_assert(kMetadataFileSize == 20, ""); - static constexpr WorkingPathType kWorkingPathType = - WorkingPathType::kDirectory; - - // Creates a QualifiedIdTypeJoinableIndex instance to store qualified ids for + // Creates a QualifiedIdJoinIndexImplV1 instance to store qualified ids for // future joining search. 
If any of the underlying file is missing, then // delete the whole working_path and (re)initialize with new ones. Otherwise // initialize and create the instance by existing files. // // filesystem: Object to make system level calls // working_path: Specifies the working path for PersistentStorage. - // QualifiedIdTypeJoinableIndex uses working path as working + // QualifiedIdJoinIndexImplV1 uses working path as working // directory and all related files will be stored under this // directory. It takes full ownership and of working_path_, // including creation/deletion. It is the caller's @@ -91,31 +95,35 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // - INTERNAL_ERROR on I/O errors // - Any KeyMapper errors static libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableIndex>> + std::unique_ptr<QualifiedIdJoinIndexImplV1>> Create(const Filesystem& filesystem, std::string working_path, bool pre_mapping_fbv, bool use_persistent_hash_map); - // Deletes QualifiedIdTypeJoinableIndex under working_path. - // - // Returns: - // - OK on success - // - INTERNAL_ERROR on I/O error - static libtextclassifier3::Status Discard(const Filesystem& filesystem, - const std::string& working_path) { - return PersistentStorage::Discard(filesystem, working_path, - kWorkingPathType); - } - // Delete copy and move constructor/assignment operator. 
- QualifiedIdTypeJoinableIndex(const QualifiedIdTypeJoinableIndex&) = delete; - QualifiedIdTypeJoinableIndex& operator=(const QualifiedIdTypeJoinableIndex&) = + QualifiedIdJoinIndexImplV1(const QualifiedIdJoinIndexImplV1&) = delete; + QualifiedIdJoinIndexImplV1& operator=(const QualifiedIdJoinIndexImplV1&) = delete; - QualifiedIdTypeJoinableIndex(QualifiedIdTypeJoinableIndex&&) = delete; - QualifiedIdTypeJoinableIndex& operator=(QualifiedIdTypeJoinableIndex&&) = - delete; + QualifiedIdJoinIndexImplV1(QualifiedIdJoinIndexImplV1&&) = delete; + QualifiedIdJoinIndexImplV1& operator=(QualifiedIdJoinIndexImplV1&&) = delete; + + ~QualifiedIdJoinIndexImplV1() override; - ~QualifiedIdTypeJoinableIndex() override; + // v2 only API. Returns UNIMPLEMENTED_ERROR. + libtextclassifier3::Status Put(SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id, + DocumentId document_id, + std::vector<NamespaceFingerprintIdentifier>&& + ref_namespace_fingerprint_ids) override { + return absl_ports::UnimplementedError("This API is not supported in V2"); + } + + // v2 only API. Returns UNIMPLEMENTED_ERROR. + libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> + GetIterator(SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) const override { + return absl_ports::UnimplementedError("This API is not supported in V2"); + } // Puts a new data into index: DocJoinInfo (DocumentId, JoinablePropertyId) // references to ref_qualified_id_str (the identifier of another document). 
@@ -126,8 +134,9 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // - OK on success // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid // - Any KeyMapper errors - libtextclassifier3::Status Put(const DocJoinInfo& doc_join_info, - std::string_view ref_qualified_id_str); + libtextclassifier3::Status Put( + const DocJoinInfo& doc_join_info, + std::string_view ref_qualified_id_str) override; // Gets the referenced document's qualified id string by DocJoinInfo. // @@ -138,7 +147,7 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // - NOT_FOUND_ERROR if doc_join_info doesn't exist // - Any KeyMapper errors libtextclassifier3::StatusOr<std::string_view> Get( - const DocJoinInfo& doc_join_info) const; + const DocJoinInfo& doc_join_info) const override; // Reduces internal file sizes by reclaiming space and ids of deleted // documents. Qualified id type joinable index will convert all entries to the @@ -146,6 +155,9 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // // - document_id_old_to_new: a map for converting old document id to new // document id. + // - namespace_id_old_to_new: a map for converting old namespace id to new + // namespace id. It is unused in this implementation since we store raw + // qualified id string (which contains raw namespace string). // - new_last_added_document_id: will be used to update the last added // document id in the qualified id type joinable // index. @@ -157,24 +169,29 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // and rebuild) libtextclassifier3::Status Optimize( const std::vector<DocumentId>& document_id_old_to_new, - DocumentId new_last_added_document_id); + const std::vector<NamespaceId>& namespace_id_old_to_new, + DocumentId new_last_added_document_id) override; // Clears all data and set last_added_document_id to kInvalidDocumentId. 
// // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status Clear(); + libtextclassifier3::Status Clear() override; - int32_t size() const { return doc_join_info_mapper_->num_keys(); } + bool is_v2() const override { return false; } - bool empty() const { return size() == 0; } + int32_t size() const override { return doc_join_info_mapper_->num_keys(); } - DocumentId last_added_document_id() const { + bool empty() const override { return size() == 0; } + + DocumentId last_added_document_id() const override { return info().last_added_document_id; } - void set_last_added_document_id(DocumentId document_id) { + void set_last_added_document_id(DocumentId document_id) override { + SetInfoDirty(); + Info& info_ref = info(); if (info_ref.last_added_document_id == kInvalidDocumentId || document_id > info_ref.last_added_document_id) { @@ -183,68 +200,70 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { } private: - explicit QualifiedIdTypeJoinableIndex( + explicit QualifiedIdJoinIndexImplV1( const Filesystem& filesystem, std::string&& working_path, std::unique_ptr<uint8_t[]> metadata_buffer, std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper, std::unique_ptr<FileBackedVector<char>> qualified_id_storage, bool pre_mapping_fbv, bool use_persistent_hash_map) - : PersistentStorage(filesystem, std::move(working_path), - kWorkingPathType), + : QualifiedIdJoinIndex(filesystem, std::move(working_path)), metadata_buffer_(std::move(metadata_buffer)), doc_join_info_mapper_(std::move(doc_join_info_mapper)), qualified_id_storage_(std::move(qualified_id_storage)), pre_mapping_fbv_(pre_mapping_fbv), - use_persistent_hash_map_(use_persistent_hash_map) {} + use_persistent_hash_map_(use_persistent_hash_map), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableIndex>> + std::unique_ptr<QualifiedIdJoinIndexImplV1>> InitializeNewFiles(const Filesystem& 
filesystem, std::string&& working_path, bool pre_mapping_fbv, bool use_persistent_hash_map); static libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableIndex>> + std::unique_ptr<QualifiedIdJoinIndexImplV1>> InitializeExistingFiles(const Filesystem& filesystem, std::string&& working_path, bool pre_mapping_fbv, bool use_persistent_hash_map); - // Transfers qualified id type joinable index data from the current to - // new_index and convert to new document id according to - // document_id_old_to_new. It is a helper function for Optimize. + // Transfers qualified id join index data from the current to new_index and + // convert to new document id according to document_id_old_to_new. It is a + // helper function for Optimize. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error libtextclassifier3::Status TransferIndex( const std::vector<DocumentId>& document_id_old_to_new, - QualifiedIdTypeJoinableIndex* new_index) const; + QualifiedIdJoinIndexImplV1* new_index) const; // Flushes contents of metadata file. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Flushes contents of all storages to underlying files. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. 
// // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; Crcs& crcs() override { return *reinterpret_cast<Crcs*>(metadata_buffer_.get() + @@ -266,6 +285,17 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { kInfoMetadataBufferOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + // Metadata buffer std::unique_ptr<uint8_t[]> metadata_buffer_; @@ -286,9 +316,12 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // Flag indicating whether use persistent hash map as the key mapper (if // false, then fall back to dynamic trie key mapper). bool use_persistent_hash_map_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib } // namespace icing -#endif // ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_ +#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_ diff --git a/icing/join/qualified-id-type-joinable-index_test.cc b/icing/join/qualified-id-join-index-impl-v1_test.cc index 8ef9167..a6e19bb 100644 --- a/icing/join/qualified-id-type-joinable-index_test.cc +++ b/icing/join/qualified-id-join-index-impl-v1_test.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id-join-index-impl-v1.h" +#include <cstdint> #include <memory> #include <string> #include <string_view> @@ -49,29 +50,29 @@ using ::testing::Pointee; using ::testing::SizeIs; using Crcs = PersistentStorage::Crcs; -using Info = QualifiedIdTypeJoinableIndex::Info; +using Info = QualifiedIdJoinIndexImplV1::Info; static constexpr int32_t kCorruptedValueOffset = 3; -struct QualifiedIdJoinIndexTestParam { +struct QualifiedIdJoinIndexImplV1TestParam { bool pre_mapping_fbv; bool use_persistent_hash_map; - explicit QualifiedIdJoinIndexTestParam(bool pre_mapping_fbv_in, - bool use_persistent_hash_map_in) + explicit QualifiedIdJoinIndexImplV1TestParam(bool pre_mapping_fbv_in, + bool use_persistent_hash_map_in) : pre_mapping_fbv(pre_mapping_fbv_in), use_persistent_hash_map(use_persistent_hash_map_in) {} }; -class QualifiedIdTypeJoinableIndexTest - : public ::testing::TestWithParam<QualifiedIdJoinIndexTestParam> { +class QualifiedIdJoinIndexImplV1Test + : public ::testing::TestWithParam<QualifiedIdJoinIndexImplV1TestParam> { protected: void SetUp() override { base_dir_ = GetTestTempDir() + "/icing"; ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), IsTrue()); - working_path_ = base_dir_ + "/qualified_id_type_joinable_index_test"; + working_path_ = base_dir_ + "/qualified_id_join_index_test"; } void TearDown() override { @@ -83,27 +84,26 @@ class QualifiedIdTypeJoinableIndexTest std::string working_path_; }; -TEST_P(QualifiedIdTypeJoinableIndexTest, InvalidWorkingPath) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, InvalidWorkingPath) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); - EXPECT_THAT( - QualifiedIdTypeJoinableIndex::Create( - filesystem_, "/dev/null/qualified_id_type_joinable_index_test", - param.pre_mapping_fbv, param.use_persistent_hash_map), - 
StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create( + filesystem_, "/dev/null/qualified_id_join_index_test", + param.pre_mapping_fbv, param.use_persistent_hash_map), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, InitializeNewFiles) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeNewFiles) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index, Pointee(IsEmpty())); ICING_ASSERT_OK(index->PersistToDisk()); @@ -114,24 +114,24 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, InitializeNewFiles) { const std::string metadata_file_path = absl_ports::StrCat(working_path_, "/metadata"); auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableIndex::kMetadataFileSize); + QualifiedIdJoinIndexImplV1::kMetadataFileSize); ASSERT_THAT( filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize, + QualifiedIdJoinIndexImplV1::kMetadataFileSize, /*offset=*/0), IsTrue()); // Check info section const Info* info = reinterpret_cast<const Info*>( metadata_buffer.get() + - QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset); + QualifiedIdJoinIndexImplV1::kInfoMetadataBufferOffset); EXPECT_THAT(info->magic, Eq(Info::kMagic)); EXPECT_THAT(info->last_added_document_id, 
Eq(kInvalidDocumentId)); // Check crcs section const Crcs* crcs = reinterpret_cast<const Crcs*>( metadata_buffer.get() + - QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset); + QualifiedIdJoinIndexImplV1::kCrcsMetadataBufferOffset); // There are some initial info in KeyMapper, so storages_crc should be // non-zero. EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0)); @@ -146,16 +146,16 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, InitializeNewFiles) { .Get())); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializationShouldFailWithoutPersistToDiskOrDestruction) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); // Insert some data. ICING_ASSERT_OK( @@ -171,24 +171,24 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. - EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map), + EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map), StatusIs(param.use_persistent_hash_map ? 
libtextclassifier3::StatusCode::FAILED_PRECONDITION : libtextclassifier3::StatusCode::INTERNAL)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializationShouldSucceedWithPersistToDisk) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index1, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index1, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); // Insert some data. ICING_ASSERT_OK( @@ -208,10 +208,10 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, ICING_EXPECT_OK(index1->PersistToDisk()); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index2, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index2, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index2, Pointee(SizeIs(3))); EXPECT_THAT( index2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)), @@ -224,17 +224,17 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriC")); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializationShouldSucceedAfterDestruction) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - 
std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); // Insert some data. ICING_ASSERT_OK( @@ -255,10 +255,10 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, // thus initializing another instance on the same files should succeed, and // we should be able to get the same contents. ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index, Pointee(SizeIs(3))); EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)), @@ -272,17 +272,17 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, } } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeExistingFilesWithDifferentMagicShouldFail) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), 
/*ref_qualified_id_str=*/"namespace#uriA")); @@ -298,49 +298,48 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableIndex::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize, - /*offset=*/0), - IsTrue()); + QualifiedIdJoinIndexImplV1::kMetadataFileSize); + ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV1::kMetadataFileSize, + /*offset=*/0), + IsTrue()); // Manually change magic and update checksums. Crcs* crcs = reinterpret_cast<Crcs*>( metadata_buffer.get() + - QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset); + QualifiedIdJoinIndexImplV1::kCrcsMetadataBufferOffset); Info* info = reinterpret_cast<Info*>( metadata_buffer.get() + - QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset); + QualifiedIdJoinIndexImplV1::kInfoMetadataBufferOffset); info->magic += kCorruptedValueOffset; crcs->component_crcs.info_crc = info->ComputeChecksum().Get(); crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get(); ASSERT_THAT(filesystem_.PWrite( metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize), + QualifiedIdJoinIndexImplV1::kMetadataFileSize), IsTrue()); } - // Attempt to create the qualified id type joinable index with different - // magic. This should fail. - EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map), + // Attempt to create the qualified id join index with different magic. This + // should fail. 
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, HasSubstr("Incorrect magic value"))); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeExistingFilesWithWrongAllCrcShouldFail) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), /*ref_qualified_id_str=*/"namespace#uriA")); @@ -355,45 +354,44 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableIndex::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize, - /*offset=*/0), - IsTrue()); + QualifiedIdJoinIndexImplV1::kMetadataFileSize); + ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV1::kMetadataFileSize, + /*offset=*/0), + IsTrue()); // Manually corrupt all_crc Crcs* crcs = reinterpret_cast<Crcs*>( metadata_buffer.get() + - QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset); + QualifiedIdJoinIndexImplV1::kCrcsMetadataBufferOffset); crcs->all_crc += kCorruptedValueOffset; ASSERT_THAT(filesystem_.PWrite( metadata_sfd.get(), 
/*offset=*/0, metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize), + QualifiedIdJoinIndexImplV1::kMetadataFileSize), IsTrue()); } - // Attempt to create the qualified id type joinable index with metadata - // containing corrupted all_crc. This should fail. - EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map), + // Attempt to create the qualified id join index with metadata containing + // corrupted all_crc. This should fail. + EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, HasSubstr("Invalid all crc"))); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeExistingFilesWithCorruptedInfoShouldFail) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), /*ref_qualified_id_str=*/"namespace#uriA")); @@ -408,46 +406,45 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableIndex::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize, - 
/*offset=*/0), - IsTrue()); + QualifiedIdJoinIndexImplV1::kMetadataFileSize); + ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV1::kMetadataFileSize, + /*offset=*/0), + IsTrue()); // Modify info, but don't update the checksum. This would be similar to // corruption of info. Info* info = reinterpret_cast<Info*>( metadata_buffer.get() + - QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset); + QualifiedIdJoinIndexImplV1::kInfoMetadataBufferOffset); info->last_added_document_id += kCorruptedValueOffset; ASSERT_THAT(filesystem_.PWrite( metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), - QualifiedIdTypeJoinableIndex::kMetadataFileSize), + QualifiedIdJoinIndexImplV1::kMetadataFileSize), IsTrue()); } - // Attempt to create the qualified id type joinable index with info that - // doesn't match its checksum. This should fail. - EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map), + // Attempt to create the qualified id join index with info that doesn't match + // its checksum. This should fail. 
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, HasSubstr("Invalid info crc"))); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeExistingFilesWithCorruptedDocJoinInfoMapperShouldFail) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), /*ref_qualified_id_str=*/"namespace#uriA")); @@ -478,26 +475,26 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, ASSERT_THAT(old_crc, Not(Eq(new_crc))); } - // Attempt to create the qualified id type joinable index with corrupted + // Attempt to create the qualified id join index with corrupted // doc_join_info_mapper. This should fail. 
- EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map), + EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, HasSubstr("Invalid storages crc"))); } -TEST_P(QualifiedIdTypeJoinableIndexTest, +TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeExistingFilesWithCorruptedQualifiedIdStorageShouldFail) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), /*ref_qualified_id_str=*/"namespace#uriA")); @@ -524,24 +521,24 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, ASSERT_THAT(old_crc, Not(Eq(new_crc))); } - // Attempt to create the qualified id type joinable index with corrupted + // Attempt to create the qualified id join index with corrupted // qualified_id_storage. This should fail. 
- EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map), + EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, HasSubstr("Invalid storages crc"))); } -TEST_P(QualifiedIdTypeJoinableIndexTest, InvalidPut) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, InvalidPut) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); DocJoinInfo default_invalid; EXPECT_THAT( @@ -549,23 +546,23 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, InvalidPut) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, InvalidGet) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, InvalidGet) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); DocJoinInfo 
default_invalid; EXPECT_THAT(index->Get(default_invalid), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, PutAndGet) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, PutAndGet) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20); std::string_view ref_qualified_id_str_a = "namespace#uriA"; @@ -577,12 +574,12 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, PutAndGet) { std::string_view ref_qualified_id_str_c = "namespace#uriC"; { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index->Put(target_info1, ref_qualified_id_str_a), IsOk()); EXPECT_THAT(index->Put(target_info2, ref_qualified_id_str_b), IsOk()); @@ -598,29 +595,28 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, PutAndGet) { // Verify we can get all of them after destructing and re-initializing. 
ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index, Pointee(SizeIs(3))); EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_qualified_id_str_a)); EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_qualified_id_str_b)); EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_qualified_id_str_c)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, - GetShouldReturnNotFoundErrorIfNotExist) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, GetShouldReturnNotFoundErrorIfNotExist) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20); std::string_view ref_qualified_id_str = "namespace#uriA"; - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); // Verify entry is not found in the beginning. 
EXPECT_THAT(index->Get(target_info), @@ -636,14 +632,14 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, SetLastAddedDocumentId) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, SetLastAddedDocumentId) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); @@ -657,15 +653,15 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, SetLastAddedDocumentId) { } TEST_P( - QualifiedIdTypeJoinableIndexTest, + QualifiedIdJoinIndexImplV1Test, SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); constexpr DocumentId kDocumentId = 123; index->set_last_added_document_id(kDocumentId); @@ -678,14 +674,14 @@ TEST_P( EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId)); } -TEST_P(QualifiedIdTypeJoinableIndexTest, Optimize) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, Optimize) { + const 
QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10), @@ -714,7 +710,8 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, Optimize) { DocumentId new_last_added_document_id = 2; EXPECT_THAT( - index->Optimize(document_id_old_to_new, new_last_added_document_id), + index->Optimize(document_id_old_to_new, /*namespace_id_old_to_new=*/{}, + new_last_added_document_id), IsOk()); EXPECT_THAT(index, Pointee(SizeIs(3))); EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); @@ -759,14 +756,14 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, Optimize) { IsOkAndHolds("namespace#uriD")); } -TEST_P(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, OptimizeOutOfRangeDocumentId) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/10), @@ -779,7 +776,7 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) { // There shouldn't be any error due to vector index. 
EXPECT_THAT( - index->Optimize(document_id_old_to_new, + index->Optimize(document_id_old_to_new, /*namespace_id_old_to_new=*/{}, /*new_last_added_document_id=*/kInvalidDocumentId), IsOk()); EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); @@ -788,14 +785,14 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) { EXPECT_THAT(index, Pointee(IsEmpty())); } -TEST_P(QualifiedIdTypeJoinableIndexTest, OptimizeDeleteAll) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, OptimizeDeleteAll) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10), @@ -818,7 +815,7 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, OptimizeDeleteAll) { std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); EXPECT_THAT( - index->Optimize(document_id_old_to_new, + index->Optimize(document_id_old_to_new, /*namespace_id_old_to_new=*/{}, /*new_last_added_document_id=*/kInvalidDocumentId), IsOk()); EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); @@ -827,19 +824,19 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, OptimizeDeleteAll) { EXPECT_THAT(index, Pointee(IsEmpty())); } -TEST_P(QualifiedIdTypeJoinableIndexTest, Clear) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, Clear) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20); DocJoinInfo 
target_info2(/*document_id=*/3, /*joinable_property_id=*/5); DocJoinInfo target_info3(/*document_id=*/6, /*joinable_property_id=*/13); - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(target_info1, /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( @@ -862,7 +859,7 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, Clear) { EXPECT_THAT(index->Get(target_info3), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - // Joinable index should be able to work normally after Clear(). + // Join index should be able to work normally after Clear(). DocJoinInfo target_info4(/*document_id=*/2, /*joinable_property_id=*/19); ICING_ASSERT_OK( index->Put(target_info4, /*ref_qualified_id_str=*/"namespace#uriD")); @@ -876,9 +873,9 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, Clear) { // Verify index after reconstructing. 
ICING_ASSERT_OK_AND_ASSIGN( - index, QualifiedIdTypeJoinableIndex::Create( - filesystem_, working_path_, param.pre_mapping_fbv, - param.use_persistent_hash_map)); + index, QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); EXPECT_THAT(index->last_added_document_id(), Eq(2)); EXPECT_THAT(index->Get(target_info1), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -889,16 +886,16 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, Clear) { EXPECT_THAT(index->Get(target_info4), IsOkAndHolds("namespace#uriD")); } -TEST_P(QualifiedIdTypeJoinableIndexTest, SwitchKeyMapperTypeShouldReturnError) { - const QualifiedIdJoinIndexTestParam& param = GetParam(); +TEST_P(QualifiedIdJoinIndexImplV1Test, SwitchKeyMapperTypeShouldReturnError) { + const QualifiedIdJoinIndexImplV1TestParam& param = GetParam(); { - // Create new qualified id type joinable index + // Create new qualified id join index ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableIndex> index, - QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - param.use_persistent_hash_map)); + std::unique_ptr<QualifiedIdJoinIndexImplV1> index, + QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + param.use_persistent_hash_map)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), /*ref_qualified_id_str=*/"namespace#uriA")); @@ -907,23 +904,26 @@ TEST_P(QualifiedIdTypeJoinableIndexTest, SwitchKeyMapperTypeShouldReturnError) { } bool switch_key_mapper_flag = !param.use_persistent_hash_map; - EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_, - param.pre_mapping_fbv, - switch_key_mapper_flag), + EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_, + param.pre_mapping_fbv, + switch_key_mapper_flag), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } 
INSTANTIATE_TEST_SUITE_P( - QualifiedIdTypeJoinableIndexTest, QualifiedIdTypeJoinableIndexTest, - testing::Values( - QualifiedIdJoinIndexTestParam(/*pre_mapping_fbv_in=*/true, - /*use_persistent_hash_map_in=*/true), - QualifiedIdJoinIndexTestParam(/*pre_mapping_fbv_in=*/true, - /*use_persistent_hash_map_in=*/false), - QualifiedIdJoinIndexTestParam(/*pre_mapping_fbv_in=*/false, - /*use_persistent_hash_map_in=*/true), - QualifiedIdJoinIndexTestParam(/*pre_mapping_fbv_in=*/false, - /*use_persistent_hash_map_in=*/false))); + QualifiedIdJoinIndexImplV1Test, QualifiedIdJoinIndexImplV1Test, + testing::Values(QualifiedIdJoinIndexImplV1TestParam( + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/true), + QualifiedIdJoinIndexImplV1TestParam( + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/false), + QualifiedIdJoinIndexImplV1TestParam( + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/true), + QualifiedIdJoinIndexImplV1TestParam( + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/false))); } // namespace diff --git a/icing/join/qualified-id-join-index-impl-v2.cc b/icing/join/qualified-id-join-index-impl-v2.cc new file mode 100644 index 0000000..70fd13c --- /dev/null +++ b/icing/join/qualified-id-join-index-impl-v2.cc @@ -0,0 +1,681 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/join/qualified-id-join-index-impl-v2.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/destructible-directory.h" +#include "icing/file/filesystem.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-accessor.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/join/posting-list-join-data-accessor.h" +#include "icing/join/posting-list-join-data-serializer.h" +#include "icing/join/qualified-id-join-index.h" +#include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" +#include "icing/store/document-id.h" +#include "icing/store/key-mapper.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" +#include "icing/store/persistent-hash-map-key-mapper.h" +#include "icing/util/crc32.h" +#include "icing/util/encode-util.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Set 1M for max # of qualified id entries and 10 bytes for key-value bytes. +// This will take at most 23 MiB disk space and mmap for persistent hash map. 
+static constexpr int32_t kSchemaJoinableIdToPostingListMapperMaxNumEntries = + 1 << 20; +static constexpr int32_t kSchemaJoinableIdToPostingListMapperAverageKVByteSize = + 10; + +inline DocumentId GetNewDocumentId( + const std::vector<DocumentId>& document_id_old_to_new, + DocumentId old_document_id) { + if (old_document_id >= document_id_old_to_new.size()) { + return kInvalidDocumentId; + } + return document_id_old_to_new[old_document_id]; +} + +inline NamespaceId GetNewNamespaceId( + const std::vector<NamespaceId>& namespace_id_old_to_new, + NamespaceId namespace_id) { + if (namespace_id >= namespace_id_old_to_new.size()) { + return kInvalidNamespaceId; + } + return namespace_id_old_to_new[namespace_id]; +} + +libtextclassifier3::StatusOr<PostingListIdentifier> GetPostingListIdentifier( + const KeyMapper<PostingListIdentifier>& + schema_joinable_id_to_posting_list_mapper, + const std::string& encoded_schema_type_joinable_property_id_str) { + auto posting_list_identifier_or = + schema_joinable_id_to_posting_list_mapper.Get( + encoded_schema_type_joinable_property_id_str); + if (!posting_list_identifier_or.ok()) { + if (absl_ports::IsNotFound(posting_list_identifier_or.status())) { + // Not found. Return invalid posting list id. + return PostingListIdentifier::kInvalid; + } + // Real error. + return posting_list_identifier_or; + } + return std::move(posting_list_identifier_or).ValueOrDie(); +} + +libtextclassifier3::StatusOr<std::string> EncodeSchemaTypeJoinablePropertyId( + SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id) { + if (schema_type_id < 0) { + return absl_ports::InvalidArgumentError("Invalid schema type id"); + } + + if (!IsJoinablePropertyIdValid(joinable_property_id)) { + return absl_ports::InvalidArgumentError("Invalid joinable property id"); + } + + static constexpr int kEncodedSchemaTypeIdLength = 3; + + // encoded_schema_type_id_str should be 1 to 3 bytes based on the value of + // schema_type_id. 
+ std::string encoded_schema_type_id_str = + encode_util::EncodeIntToCString(schema_type_id); + // Make encoded_schema_type_id_str to fixed kEncodedSchemaTypeIdLength bytes. + while (encoded_schema_type_id_str.size() < kEncodedSchemaTypeIdLength) { + // C string cannot contain 0 bytes, so we append it using 1, just like what + // we do in encode_util::EncodeIntToCString. + // + // The reason that this works is because DecodeIntToString decodes a byte + // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded + // schema type id that is less than 3 bytes, it means that the id contains + // unencoded leading 0x00. So here we're explicitly encoding those bytes as + // 0x01. + encoded_schema_type_id_str.push_back(1); + } + + return absl_ports::StrCat( + encoded_schema_type_id_str, + encode_util::EncodeIntToCString(joinable_property_id)); +} + +std::string GetMetadataFilePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/metadata"); +} + +std::string GetSchemaJoinableIdToPostingListMapperPath( + std::string_view working_path) { + return absl_ports::StrCat(working_path, + "/schema_joinable_id_to_posting_list_mapper"); +} + +std::string GetFlashIndexStorageFilePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/flash_index_storage"); +} + +} // namespace + +libtextclassifier3::Status +QualifiedIdJoinIndexImplV2::JoinDataIterator::Advance() { + if (pl_accessor_ == nullptr) { + return absl_ports::ResourceExhaustedError("End of iterator"); + } + + if (!should_retrieve_next_batch_) { + // In this case, cached_batch_join_data_ is not empty (contains some data + // fetched in the previous round), so move curr_ to the next position and + // check if we have to fetch the next batch. + // + // Note: in the 1st round, should_retrieve_next_batch_ is true, so this part + // will never be executed. 
+ ++curr_; + should_retrieve_next_batch_ = curr_ >= cached_batch_join_data_.cend(); + } + + if (should_retrieve_next_batch_) { + // Fetch next batch if needed. + ICING_RETURN_IF_ERROR(GetNextDataBatch()); + should_retrieve_next_batch_ = false; + } + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status +QualifiedIdJoinIndexImplV2::JoinDataIterator::GetNextDataBatch() { + auto cached_batch_join_data_or = pl_accessor_->GetNextDataBatch(); + if (!cached_batch_join_data_or.ok()) { + ICING_LOG(WARNING) + << "Fail to get next batch data from posting list due to: " + << cached_batch_join_data_or.status().error_message(); + return std::move(cached_batch_join_data_or).status(); + } + + cached_batch_join_data_ = std::move(cached_batch_join_data_or).ValueOrDie(); + curr_ = cached_batch_join_data_.cbegin(); + + if (cached_batch_join_data_.empty()) { + return absl_ports::ResourceExhaustedError("End of iterator"); + } + + return libtextclassifier3::Status::OK; +} + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> +QualifiedIdJoinIndexImplV2::Create(const Filesystem& filesystem, + std::string working_path, + bool pre_mapping_fbv) { + if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) || + !filesystem.DirectoryExists( + GetSchemaJoinableIdToPostingListMapperPath(working_path).c_str()) || + !filesystem.FileExists( + GetFlashIndexStorageFilePath(working_path).c_str())) { + // Discard working_path if any file/directory is missing, and reinitialize. 
+ if (filesystem.DirectoryExists(working_path.c_str())) { + ICING_RETURN_IF_ERROR( + QualifiedIdJoinIndex::Discard(filesystem, working_path)); + } + return InitializeNewFiles(filesystem, std::move(working_path), + pre_mapping_fbv); + } + return InitializeExistingFiles(filesystem, std::move(working_path), + pre_mapping_fbv); +} + +QualifiedIdJoinIndexImplV2::~QualifiedIdJoinIndexImplV2() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) << "Failed to persist qualified id join index (v2) to " + "disk while destructing " + << working_path_; + } +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Put( + SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id, + DocumentId document_id, + std::vector<NamespaceFingerprintIdentifier>&& + ref_namespace_fingerprint_ids) { + std::sort(ref_namespace_fingerprint_ids.begin(), + ref_namespace_fingerprint_ids.end()); + + // Dedupe. + auto last = std::unique(ref_namespace_fingerprint_ids.begin(), + ref_namespace_fingerprint_ids.end()); + ref_namespace_fingerprint_ids.erase(last, + ref_namespace_fingerprint_ids.end()); + if (ref_namespace_fingerprint_ids.empty()) { + return libtextclassifier3::Status::OK; + } + + SetDirty(); + ICING_ASSIGN_OR_RETURN( + std::string encoded_schema_type_joinable_property_id_str, + EncodeSchemaTypeJoinablePropertyId(schema_type_id, joinable_property_id)); + + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier posting_list_identifier, + GetPostingListIdentifier(*schema_joinable_id_to_posting_list_mapper_, + encoded_schema_type_joinable_property_id_str)); + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor; + if (posting_list_identifier.is_valid()) { + ICING_ASSIGN_OR_RETURN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_.get(), + posting_list_identifier)); + } else { + ICING_ASSIGN_OR_RETURN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + 
flash_index_storage_.get(), posting_list_serializer_.get())); + } + + // Prepend join data into posting list. + for (const NamespaceFingerprintIdentifier& ref_namespace_fingerprint_id : + ref_namespace_fingerprint_ids) { + ICING_RETURN_IF_ERROR(pl_accessor->PrependData( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + document_id, ref_namespace_fingerprint_id))); + } + + // Finalize the posting list and update mapper. + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + if (!result.id.is_valid()) { + return absl_ports::InternalError("Fail to flush data into posting list(s)"); + } + ICING_RETURN_IF_ERROR(schema_joinable_id_to_posting_list_mapper_->Put( + encoded_schema_type_joinable_property_id_str, result.id)); + + // Update info. + info().num_data += ref_namespace_fingerprint_ids.size(); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>> +QualifiedIdJoinIndexImplV2::GetIterator( + SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) const { + ICING_ASSIGN_OR_RETURN( + std::string encoded_schema_type_joinable_property_id_str, + EncodeSchemaTypeJoinablePropertyId(schema_type_id, joinable_property_id)); + + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier posting_list_identifier, + GetPostingListIdentifier(*schema_joinable_id_to_posting_list_mapper_, + encoded_schema_type_joinable_property_id_str)); + + if (!posting_list_identifier.is_valid()) { + return std::make_unique<JoinDataIterator>(nullptr); + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_.get(), + posting_list_identifier)); + + return std::make_unique<JoinDataIterator>(std::move(pl_accessor)); +} + +libtextclassifier3::Status 
QualifiedIdJoinIndexImplV2::Optimize( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + DocumentId new_last_added_document_id) { + std::string temp_working_path = working_path_ + "_temp"; + ICING_RETURN_IF_ERROR( + QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path)); + + DestructibleDirectory temp_working_path_ddir(&filesystem_, + std::move(temp_working_path)); + if (!temp_working_path_ddir.is_valid()) { + return absl_ports::InternalError( + "Unable to create temp directory to build new qualified id join index " + "(v2)"); + } + + { + // Transfer all data from the current to new qualified id join index. Also + // PersistToDisk and destruct the instance after finishing, so we can safely + // swap directories later. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> new_index, + Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_)); + ICING_RETURN_IF_ERROR(TransferIndex( + document_id_old_to_new, namespace_id_old_to_new, new_index.get())); + new_index->set_last_added_document_id(new_last_added_document_id); + ICING_RETURN_IF_ERROR(new_index->PersistToDisk()); + } + + // Destruct current index's storage instances to safely swap directories. + // TODO(b/268521214): handle delete propagation storage + schema_joinable_id_to_posting_list_mapper_.reset(); + flash_index_storage_.reset(); + + if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(), + working_path_.c_str())) { + return absl_ports::InternalError( + "Unable to apply new qualified id join index (v2) due to failed swap"); + } + + // Reinitialize qualified id join index. 
+ if (!filesystem_.PRead(GetMetadataFilePath(working_path_).c_str(), + metadata_buffer_.get(), kMetadataFileSize, + /*offset=*/0)) { + return absl_ports::InternalError("Fail to read metadata file"); + } + ICING_ASSIGN_OR_RETURN( + schema_joinable_id_to_posting_list_mapper_, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem_, + GetSchemaJoinableIdToPostingListMapperPath(working_path_), + pre_mapping_fbv_, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path_), + &filesystem_, posting_list_serializer_.get())); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Clear() { + SetDirty(); + + schema_joinable_id_to_posting_list_mapper_.reset(); + // Discard and reinitialize schema_joinable_id_to_posting_list_mapper. + std::string schema_joinable_id_to_posting_list_mapper_path = + GetSchemaJoinableIdToPostingListMapperPath(working_path_); + ICING_RETURN_IF_ERROR( + PersistentHashMapKeyMapper<PostingListIdentifier>::Delete( + filesystem_, schema_joinable_id_to_posting_list_mapper_path)); + ICING_ASSIGN_OR_RETURN( + schema_joinable_id_to_posting_list_mapper_, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem_, + std::move(schema_joinable_id_to_posting_list_mapper_path), + pre_mapping_fbv_, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + + // Discard and reinitialize flash_index_storage. 
+ flash_index_storage_.reset(); + if (!filesystem_.DeleteFile( + GetFlashIndexStorageFilePath(working_path_).c_str())) { + return absl_ports::InternalError("Fail to delete flash index storage file"); + } + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path_), + &filesystem_, posting_list_serializer_.get())); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)); + + // TODO(b/268521214): clear delete propagation storage + + info().num_data = 0; + info().last_added_document_id = kInvalidDocumentId; + return libtextclassifier3::Status::OK; +} + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> +QualifiedIdJoinIndexImplV2::InitializeNewFiles(const Filesystem& filesystem, + std::string&& working_path, + bool pre_mapping_fbv) { + // Create working directory. + if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to create directory: ", working_path)); + } + + // Initialize schema_joinable_id_to_posting_list_mapper + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<KeyMapper<PostingListIdentifier>> + schema_joinable_id_to_posting_list_mapper, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem, GetSchemaJoinableIdToPostingListMapperPath(working_path), + pre_mapping_fbv, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + + // Initialize flash_index_storage + auto posting_list_serializer = + std::make_unique<PostingListJoinDataSerializer<JoinDataType>>(); + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path), + &filesystem, posting_list_serializer.get())); + + // Create instance. 
+ auto new_join_index = std::unique_ptr<QualifiedIdJoinIndexImplV2>( + new QualifiedIdJoinIndexImplV2( + filesystem, std::move(working_path), + /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize), + std::move(schema_joinable_id_to_posting_list_mapper), + std::move(posting_list_serializer), + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)), + pre_mapping_fbv)); + // Initialize info content. + new_join_index->info().magic = Info::kMagic; + new_join_index->info().num_data = 0; + new_join_index->info().last_added_document_id = kInvalidDocumentId; + // Initialize new PersistentStorage. The initial checksums will be computed + // and set via InitializeNewStorage. + ICING_RETURN_IF_ERROR(new_join_index->InitializeNewStorage()); + + return new_join_index; +} + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> +QualifiedIdJoinIndexImplV2::InitializeExistingFiles( + const Filesystem& filesystem, std::string&& working_path, + bool pre_mapping_fbv) { + // PRead metadata file. 
+ auto metadata_buffer = std::make_unique<uint8_t[]>(kMetadataFileSize); + if (!filesystem.PRead(GetMetadataFilePath(working_path).c_str(), + metadata_buffer.get(), kMetadataFileSize, + /*offset=*/0)) { + return absl_ports::InternalError("Fail to read metadata file"); + } + + // Initialize schema_joinable_id_to_posting_list_mapper + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<KeyMapper<PostingListIdentifier>> + schema_joinable_id_to_posting_list_mapper, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem, GetSchemaJoinableIdToPostingListMapperPath(working_path), + pre_mapping_fbv, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + + // Initialize flash_index_storage + auto posting_list_serializer = + std::make_unique<PostingListJoinDataSerializer<JoinDataType>>(); + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path), + &filesystem, posting_list_serializer.get())); + + // Create instance. + auto join_index = std::unique_ptr<QualifiedIdJoinIndexImplV2>( + new QualifiedIdJoinIndexImplV2( + filesystem, std::move(working_path), std::move(metadata_buffer), + std::move(schema_joinable_id_to_posting_list_mapper), + std::move(posting_list_serializer), + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)), + pre_mapping_fbv)); + // Initialize existing PersistentStorage. Checksums will be validated. + ICING_RETURN_IF_ERROR(join_index->InitializeExistingStorage()); + + // Validate magic. 
+ if (join_index->info().magic != Info::kMagic) { + return absl_ports::FailedPreconditionError("Incorrect magic value"); + } + + return join_index; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + QualifiedIdJoinIndexImplV2* new_index) const { + std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> iter = + schema_joinable_id_to_posting_list_mapper_->GetIterator(); + + // Iterate through all (schema_type_id, joinable_property_id). + while (iter->Advance()) { + PostingListIdentifier old_pl_id = iter->GetValue(); + if (!old_pl_id.is_valid()) { + // Skip invalid posting list id. + continue; + } + + // Read all join data from old posting lists and convert to new join data + // with new document id, namespace id. + std::vector<JoinDataType> new_join_data_vec; + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> + old_pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_.get(), + old_pl_id)); + ICING_ASSIGN_OR_RETURN(std::vector<JoinDataType> batch_old_join_data, + old_pl_accessor->GetNextDataBatch()); + while (!batch_old_join_data.empty()) { + for (const JoinDataType& old_join_data : batch_old_join_data) { + DocumentId new_document_id = GetNewDocumentId( + document_id_old_to_new, old_join_data.document_id()); + NamespaceId new_ref_namespace_id = GetNewNamespaceId( + namespace_id_old_to_new, old_join_data.join_info().namespace_id()); + + // Transfer if the document and namespace are not deleted or outdated. + if (new_document_id != kInvalidDocumentId && + new_ref_namespace_id != kInvalidNamespaceId) { + // We can reuse the fingerprint from old_join_data, since document uri + // (and its fingerprint) will never change. 
+ new_join_data_vec.push_back(JoinDataType( + new_document_id, NamespaceFingerprintIdentifier( + new_ref_namespace_id, + old_join_data.join_info().fingerprint()))); + } + } + ICING_ASSIGN_OR_RETURN(batch_old_join_data, + old_pl_accessor->GetNextDataBatch()); + } + + if (new_join_data_vec.empty()) { + continue; + } + + // NamespaceId order may change, so we have to sort the vector. + std::sort(new_join_data_vec.begin(), new_join_data_vec.end()); + + // Create new posting list in new_index and prepend all new join data into + // it. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> + new_pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + new_index->flash_index_storage_.get(), + new_index->posting_list_serializer_.get())); + for (const JoinDataType& new_join_data : new_join_data_vec) { + ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(new_join_data)); + } + + // Finalize the posting list and update mapper of new_index. + PostingListAccessor::FinalizeResult result = + std::move(*new_pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + if (!result.id.is_valid()) { + return absl_ports::InternalError( + "Fail to flush data into posting list(s)"); + } + ICING_RETURN_IF_ERROR( + new_index->schema_joinable_id_to_posting_list_mapper_->Put( + iter->GetKey(), result.id)); + + // Update info. 
+ new_index->info().num_data += new_join_data_vec.size(); + } + + // TODO(b/268521214): transfer delete propagation storage + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::PersistMetadataToDisk( + bool force) { + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + + std::string metadata_file_path = GetMetadataFilePath(working_path_); + + ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + if (!sfd.is_valid()) { + return absl_ports::InternalError("Fail to open metadata file for write"); + } + + if (!filesystem_.PWrite(sfd.get(), /*offset=*/0, metadata_buffer_.get(), + kMetadataFileSize)) { + return absl_ports::InternalError("Fail to write metadata file"); + } + + if (!filesystem_.DataSync(sfd.get())) { + return absl_ports::InternalError("Fail to sync metadata to disk"); + } + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + + ICING_RETURN_IF_ERROR( + schema_joinable_id_to_posting_list_mapper_->PersistToDisk()); + if (!flash_index_storage_->PersistToDisk()) { + return absl_ports::InternalError( + "Fail to persist FlashIndexStorage to disk"); + } + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<Crc32> +QualifiedIdJoinIndexImplV2::ComputeInfoChecksum(bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + + return info().ComputeChecksum(); +} + +libtextclassifier3::StatusOr<Crc32> +QualifiedIdJoinIndexImplV2::ComputeStoragesChecksum(bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + + ICING_ASSIGN_OR_RETURN( + Crc32 schema_joinable_id_to_posting_list_mapper_crc, + schema_joinable_id_to_posting_list_mapper_->ComputeChecksum()); + + 
return Crc32(schema_joinable_id_to_posting_list_mapper_crc.Get()); +} + +} // namespace lib +} // namespace icing diff --git a/icing/join/qualified-id-join-index-impl-v2.h b/icing/join/qualified-id-join-index-impl-v2.h new file mode 100644 index 0000000..2b0bf3f --- /dev/null +++ b/icing/join/qualified-id-join-index-impl-v2.h @@ -0,0 +1,369 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ +#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/filesystem.h" +#include "icing/file/persistent-storage.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/join/doc-join-info.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/join/posting-list-join-data-accessor.h" +#include "icing/join/posting-list-join-data-serializer.h" +#include "icing/join/qualified-id-join-index.h" +#include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" +#include "icing/store/document-id.h" +#include "icing/store/key-mapper.h" +#include 
"icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" +#include "icing/util/crc32.h" + +namespace icing { +namespace lib { + +// QualifiedIdJoinIndexImplV2: a class to maintain join data (DocumentId to +// referenced NamespaceFingerprintIdentifier). It stores join data in posting +// lists and bucketizes them by (schema_type_id, joinable_property_id). +class QualifiedIdJoinIndexImplV2 : public QualifiedIdJoinIndex { + public: + using JoinDataType = DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>; + + class JoinDataIterator : public JoinDataIteratorBase { + public: + explicit JoinDataIterator( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor) + : pl_accessor_(std::move(pl_accessor)), + should_retrieve_next_batch_(true) {} + + ~JoinDataIterator() override = default; + + // Advances to the next data. + // + // Returns: + // - OK on success + // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant + // data) + // - Any other PostingListJoinDataAccessor errors + libtextclassifier3::Status Advance() override; + + const JoinDataType& GetCurrent() const override { return *curr_; } + + private: + // Gets next batch of data from the posting list chain, caches in + // cached_batch_integer_index_data_, and sets curr_ to the begin of the + // cache. 
+ libtextclassifier3::Status GetNextDataBatch(); + + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor_; + std::vector<JoinDataType> cached_batch_join_data_; + std::vector<JoinDataType>::const_iterator curr_; + bool should_retrieve_next_batch_; + }; + + struct Info { + static constexpr int32_t kMagic = 0x12d1c074; + + int32_t magic; + int32_t num_data; + DocumentId last_added_document_id; + + Crc32 ComputeChecksum() const { + return Crc32( + std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); + } + } __attribute__((packed)); + static_assert(sizeof(Info) == 12, ""); + + // Metadata file layout: <Crcs><Info> + static constexpr int32_t kCrcsMetadataBufferOffset = 0; + static constexpr int32_t kInfoMetadataBufferOffset = + static_cast<int32_t>(sizeof(Crcs)); + static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); + static_assert(kMetadataFileSize == 24, ""); + + static constexpr WorkingPathType kWorkingPathType = + WorkingPathType::kDirectory; + + // Creates a QualifiedIdJoinIndexImplV2 instance to store join data + // (DocumentId to referenced NamespaceFingerPrintIdentifier) for future + // joining search. If any of the underlying file is missing, then delete the + // whole working_path and (re)initialize with new ones. Otherwise initialize + // and create the instance by existing files. + // + // filesystem: Object to make system level calls + // working_path: Specifies the working path for PersistentStorage. + // QualifiedIdJoinIndexImplV2 uses working path as working + // directory and all related files will be stored under this + // directory. It takes full ownership and of working_path_, + // including creation/deletion. It is the caller's + // responsibility to specify correct working path and avoid + // mixing different persistent storages together under the same + // path. 
Also the caller has the ownership for the parent + // directory of working_path_, and it is responsible for parent + // directory creation/deletion. See PersistentStorage for more + // details about the concept of working_path. + // pre_mapping_fbv: flag indicating whether memory map max possible file size + // for underlying FileBackedVector before growing the actual + // file size. + // + // Returns: + // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored + // checksum + // - INTERNAL_ERROR on I/O errors + // - Any KeyMapper errors + static libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> + Create(const Filesystem& filesystem, std::string working_path, + bool pre_mapping_fbv); + + // Delete copy and move constructor/assignment operator. + QualifiedIdJoinIndexImplV2(const QualifiedIdJoinIndexImplV2&) = delete; + QualifiedIdJoinIndexImplV2& operator=(const QualifiedIdJoinIndexImplV2&) = + delete; + + QualifiedIdJoinIndexImplV2(QualifiedIdJoinIndexImplV2&&) = delete; + QualifiedIdJoinIndexImplV2& operator=(QualifiedIdJoinIndexImplV2&&) = delete; + + ~QualifiedIdJoinIndexImplV2() override; + + // v1 only API. Returns UNIMPLEMENTED_ERROR. + libtextclassifier3::Status Put( + const DocJoinInfo& doc_join_info, + std::string_view ref_qualified_id_str) override { + return absl_ports::UnimplementedError("This API is not supported in V2"); + } + + // v1 only API. Returns UNIMPLEMENTED_ERROR. + libtextclassifier3::StatusOr<std::string_view> Get( + const DocJoinInfo& doc_join_info) const override { + return absl_ports::UnimplementedError("This API is not supported in V2"); + } + + // Puts a list of referenced (parent) NamespaceFingerprintIdentifiers into + // the join index, given the (child) DocumentId, SchemaTypeId and + // JoinablePropertyId. 
+ // + // Returns: + // - OK on success + // - INVALID_ARGUMENT_ERROR if schema_type_id, joinable_property_id, or + // document_id is invalid + // - Any KeyMapper/FlashIndexStorage errors + libtextclassifier3::Status Put(SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id, + DocumentId document_id, + std::vector<NamespaceFingerprintIdentifier>&& + ref_namespace_fingerprint_ids) override; + + // Returns a JoinDataIterator for iterating through all join data of the + // specified (schema_type_id, joinable_property_id). + // + // Returns: + // - On success: a JoinDataIterator + // - INVALID_ARGUMENT_ERROR if schema_type_id or joinable_property_id is + // invalid + // - Any KeyMapper/FlashIndexStorage errors + libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> + GetIterator(SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) const override; + + // Reduces internal file sizes by reclaiming space and ids of deleted + // documents. Qualified id join index will convert all entries to the new + // document ids and namespace ids. + // + // - document_id_old_to_new: a map for converting old document id to new + // document id. + // - namespace_id_old_to_new: a map for converting old namespace id to new + // namespace id. + // - new_last_added_document_id: will be used to update the last added + // document id in the qualified id join index. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error. This could potentially leave the index in + // an invalid state and the caller should handle it properly (e.g. discard + // and rebuild) + libtextclassifier3::Status Optimize( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + DocumentId new_last_added_document_id) override; + + // Clears all data and set last_added_document_id to kInvalidDocumentId. 
+ // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status Clear() override; + + bool is_v2() const override { return true; } + + int32_t size() const override { return info().num_data; } + + bool empty() const override { return size() == 0; } + + DocumentId last_added_document_id() const override { + return info().last_added_document_id; + } + + void set_last_added_document_id(DocumentId document_id) override { + SetInfoDirty(); + + Info& info_ref = info(); + if (info_ref.last_added_document_id == kInvalidDocumentId || + document_id > info_ref.last_added_document_id) { + info_ref.last_added_document_id = document_id; + } + } + + private: + explicit QualifiedIdJoinIndexImplV2( + const Filesystem& filesystem, std::string&& working_path, + std::unique_ptr<uint8_t[]> metadata_buffer, + std::unique_ptr<KeyMapper<PostingListIdentifier>> + schema_joinable_id_to_posting_list_mapper, + std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> + posting_list_serializer, + std::unique_ptr<FlashIndexStorage> flash_index_storage, + bool pre_mapping_fbv) + : QualifiedIdJoinIndex(filesystem, std::move(working_path)), + metadata_buffer_(std::move(metadata_buffer)), + schema_joinable_id_to_posting_list_mapper_( + std::move(schema_joinable_id_to_posting_list_mapper)), + posting_list_serializer_(std::move(posting_list_serializer)), + flash_index_storage_(std::move(flash_index_storage)), + pre_mapping_fbv_(pre_mapping_fbv), + is_info_dirty_(false), + is_storage_dirty_(false) {} + + static libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> + InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, + bool pre_mapping_fbv); + + static libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> + InitializeExistingFiles(const Filesystem& filesystem, + std::string&& working_path, bool pre_mapping_fbv); + + // Transfers qualified id join index data from the current to new_index 
and + // convert to new document id according to document_id_old_to_new and + // namespace_id_old_to_new. It is a helper function for Optimize. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + QualifiedIdJoinIndexImplV2* new_index) const; + + // Flushes contents of metadata file. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; + + // Flushes contents of all storages to underlying files. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; + + // Computes and returns Info checksum. + // + // Returns: + // - Crc of the Info on success + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; + + // Computes and returns all storages checksum. + // + // Returns: + // - Crc of all storages on success + // - INTERNAL_ERROR if any data inconsistency + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; + + Crcs& crcs() override { + return *reinterpret_cast<Crcs*>(metadata_buffer_.get() + + kCrcsMetadataBufferOffset); + } + + const Crcs& crcs() const override { + return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() + + kCrcsMetadataBufferOffset); + } + + Info& info() { + return *reinterpret_cast<Info*>(metadata_buffer_.get() + + kInfoMetadataBufferOffset); + } + + const Info& info() const { + return *reinterpret_cast<const Info*>(metadata_buffer_.get() + + kInfoMetadataBufferOffset); + } + + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. 
+ void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + + // Metadata buffer + std::unique_ptr<uint8_t[]> metadata_buffer_; + + // Persistent KeyMapper for mapping (schema_type_id, joinable_property_id) to + // PostingListIdentifier. + std::unique_ptr<KeyMapper<PostingListIdentifier>> + schema_joinable_id_to_posting_list_mapper_; + + // Posting list related members. Use posting list to store join data + // (document id to referenced NamespaceFingerprintIdentifier). + std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> + posting_list_serializer_; + std::unique_ptr<FlashIndexStorage> flash_index_storage_; + + // TODO(b/268521214): add delete propagation storage + + // Flag indicating whether memory map max possible file size for underlying + // FileBackedVector before growing the actual file size. + bool pre_mapping_fbv_; + + bool is_info_dirty_; + bool is_storage_dirty_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ diff --git a/icing/join/qualified-id-join-index-impl-v2_test.cc b/icing/join/qualified-id-join-index-impl-v2_test.cc new file mode 100644 index 0000000..d73d6c2 --- /dev/null +++ b/icing/join/qualified-id-join-index-impl-v2_test.cc @@ -0,0 +1,1414 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/join/qualified-id-join-index-impl-v2.h" + +#include <cstdint> +#include <memory> +#include <numeric> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/filesystem.h" +#include "icing/file/persistent-storage.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/join/qualified-id-join-index.h" +#include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" +#include "icing/store/document-id.h" +#include "icing/store/key-mapper.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" +#include "icing/store/persistent-hash-map-key-mapper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::IsTrue; +using ::testing::Lt; +using ::testing::Ne; +using ::testing::Not; +using ::testing::Pointee; +using ::testing::SizeIs; + +using Crcs = PersistentStorage::Crcs; +using Info = QualifiedIdJoinIndexImplV2::Info; + +static constexpr int32_t kCorruptedValueOffset = 3; + +class QualifiedIdJoinIndexImplV2Test : public ::testing::Test { + protected: + void SetUp() override { + base_dir_ = GetTestTempDir() + "/icing"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + working_path_ = base_dir_ + "/qualified_id_join_index_impl_v2_test"; + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + 
Filesystem filesystem_; + std::string base_dir_; + std::string working_path_; +}; + +libtextclassifier3::StatusOr< + std::vector<QualifiedIdJoinIndexImplV2::JoinDataType>> +GetJoinData(const QualifiedIdJoinIndexImplV2& index, + SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase> iter, + index.GetIterator(schema_type_id, joinable_property_id)); + + std::vector<QualifiedIdJoinIndexImplV2::JoinDataType> result; + while (iter->Advance().ok()) { + result.push_back(iter->GetCurrent()); + } + + return result; +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, InvalidWorkingPath) { + EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create( + filesystem_, "/dev/null/qualified_id_join_index_impl_v2_test", + /*pre_mapping_fbv=*/false), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, InitializeNewFiles) { + { + // Create new qualified id join index + ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + EXPECT_THAT(index, Pointee(IsEmpty())); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + // Metadata file should be initialized correctly for both info and crcs + // sections. 
+ const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdJoinIndexImplV2::kMetadataFileSize); + ASSERT_THAT( + filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Check info section + const Info* info = reinterpret_cast<const Info*>( + metadata_buffer.get() + + QualifiedIdJoinIndexImplV2::kInfoMetadataBufferOffset); + EXPECT_THAT(info->magic, Eq(Info::kMagic)); + EXPECT_THAT(info->num_data, Eq(0)); + EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId)); + + // Check crcs section + const Crcs* crcs = reinterpret_cast<const Crcs*>( + metadata_buffer.get() + + QualifiedIdJoinIndexImplV2::kCrcsMetadataBufferOffset); + // There are some initial info in KeyMapper, so storages_crc should be + // non-zero. + EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0)); + EXPECT_THAT(crcs->component_crcs.info_crc, + Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info), + sizeof(Info))) + .Get())); + EXPECT_THAT(crcs->all_crc, + Eq(Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs->component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + InitializationShouldFailWithoutPersistToDiskOrDestruction) { + NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78); + + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + // Insert some data. 
+ ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id2, id1})); + ICING_ASSERT_OK(index->PersistToDisk()); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6, + /*ref_namespace_fingerprint_ids=*/{id3})); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id4})); + + // Without calling PersistToDisk, checksums will not be recomputed or synced + // to disk, so initializing another instance on the same files should fail. + EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + InitializationShouldSucceedWithPersistToDisk) { + NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78); + + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index1, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + // Insert some data. 
+ ICING_ASSERT_OK(index1->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id2, id1})); + ICING_ASSERT_OK(index1->Put( + /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6, + /*ref_namespace_fingerprint_ids=*/{id3})); + ICING_ASSERT_OK(index1->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id4})); + ASSERT_THAT(index1, Pointee(SizeIs(4))); + + // After calling PersistToDisk, all checksums should be recomputed and synced + // correctly to disk, so initializing another instance on the same files + // should succeed, and we should be able to get the same contents. + ICING_EXPECT_OK(index1->PersistToDisk()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index2, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + EXPECT_THAT(index2, Pointee(SizeIs(4))); + EXPECT_THAT( + GetJoinData(*index2, /*schema_type_id=*/2, /*joinable_property_id=*/1), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id4), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id2), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id1)))); + EXPECT_THAT( + GetJoinData(*index2, /*schema_type_id=*/3, /*joinable_property_id=*/10), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/6, /*join_info=*/id3)))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + InitializationShouldSucceedAfterDestruction) { + NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56); + NamespaceFingerprintIdentifier 
id4(/*namespace_id=*/1, /*fingerprint=*/78); + + { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + // Insert some data. + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id2, id1})); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6, + /*ref_namespace_fingerprint_ids=*/{id3})); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id4})); + ASSERT_THAT(index, Pointee(SizeIs(4))); + } + + { + // The previous instance went out of scope and was destructed. Although we + // didn't call PersistToDisk explicitly, the destructor should invoke it and + // thus initializing another instance on the same files should succeed, and + // we should be able to get the same contents. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT( + GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id4), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id2), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id1)))); + EXPECT_THAT( + GetJoinData(*index, /*schema_type_id=*/3, /*joinable_property_id=*/10), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/6, /*join_info=*/id3)))); + } +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + InitializeExistingFilesWithDifferentMagicShouldFail) { + { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/ + {NamespaceFingerprintIdentifier(/*namespace_id=*/1, + /*fingerprint=*/12)})); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); + + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdJoinIndexImplV2::kMetadataFileSize); + ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Manually change magic and update checksum + Crcs* crcs = 
reinterpret_cast<Crcs*>( + metadata_buffer.get() + + QualifiedIdJoinIndexImplV2::kCrcsMetadataBufferOffset); + Info* info = reinterpret_cast<Info*>( + metadata_buffer.get() + + QualifiedIdJoinIndexImplV2::kInfoMetadataBufferOffset); + info->magic += kCorruptedValueOffset; + crcs->component_crcs.info_crc = info->ComputeChecksum().Get(); + crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get(); + ASSERT_THAT(filesystem_.PWrite( + metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize), + IsTrue()); + } + + // Attempt to create the qualified id join index with different magic. This + // should fail. + EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Incorrect magic value"))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + InitializeExistingFilesWithWrongAllCrcShouldFail) { + { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/ + {NamespaceFingerprintIdentifier(/*namespace_id=*/1, + /*fingerprint=*/12)})); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); + + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdJoinIndexImplV2::kMetadataFileSize); + ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Manually corrupt all_crc + Crcs* crcs = 
reinterpret_cast<Crcs*>( + metadata_buffer.get() + + QualifiedIdJoinIndexImplV2::kCrcsMetadataBufferOffset); + crcs->all_crc += kCorruptedValueOffset; + + ASSERT_THAT(filesystem_.PWrite( + metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize), + IsTrue()); + } + + // Attempt to create the qualified id join index with metadata containing + // corrupted all_crc. This should fail. + EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid all crc"))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + InitializeExistingFilesWithCorruptedInfoShouldFail) { + { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/ + {NamespaceFingerprintIdentifier(/*namespace_id=*/1, + /*fingerprint=*/12)})); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); + + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdJoinIndexImplV2::kMetadataFileSize); + ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Modify info, but don't update the checksum. This would be similar to + // corruption of info. 
+ Info* info = reinterpret_cast<Info*>( + metadata_buffer.get() + + QualifiedIdJoinIndexImplV2::kInfoMetadataBufferOffset); + info->last_added_document_id += kCorruptedValueOffset; + + ASSERT_THAT(filesystem_.PWrite( + metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), + QualifiedIdJoinIndexImplV2::kMetadataFileSize), + IsTrue()); + } + + // Attempt to create the qualified id join index with info that doesn't match + // its checksum. This should fail. + EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid info crc"))); +} + +TEST_F( + QualifiedIdJoinIndexImplV2Test, + InitializeExistingFilesWithCorruptedSchemaJoinableIdToPostingListMapperShouldFail) { + { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/ + {NamespaceFingerprintIdentifier(/*namespace_id=*/1, + /*fingerprint=*/12)})); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + // Corrupt schema_joinable_id_to_posting_list_mapper manually. 
+ { + std::string mapper_working_path = absl_ports::StrCat( + working_path_, "/schema_joinable_id_to_posting_list_mapper"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<PostingListIdentifier>> mapper, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem_, std::move(mapper_working_path), + /*pre_mapping_fbv=*/false)); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum()); + ICING_ASSERT_OK(mapper->Put("foo", PostingListIdentifier::kInvalid)); + ICING_ASSERT_OK(mapper->PersistToDisk()); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum()); + ASSERT_THAT(old_crc, Not(Eq(new_crc))); + } + + // Attempt to create the qualified id join index with corrupted + // doc_join_info_mapper. This should fail. + EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid storages crc"))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, InvalidPut) { + NamespaceFingerprintIdentifier id(/*namespace_id=*/1, /*fingerprint=*/12); + + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + EXPECT_THAT( + index->Put(/*schema_type_id=*/-1, /*joinable_property_id=*/1, + /*document_id=*/5, /*ref_namespace_fingerprint_ids=*/{id}), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + index->Put(/*schema_type_id=*/2, /*joinable_property_id=*/-1, + /*document_id=*/5, /*ref_namespace_fingerprint_ids=*/{id}), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(index->Put(/*schema_type_id=*/2, /*joinable_property_id=*/1, + /*document_id=*/kInvalidDocumentId, + /*ref_namespace_fingerprint_ids=*/{id}), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + 
+TEST_F(QualifiedIdJoinIndexImplV2Test, InvalidGetIterator) { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + EXPECT_THAT( + index->GetIterator(/*schema_type_id=*/-1, /*joinable_property_id=*/1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + index->GetIterator(/*schema_type_id=*/2, /*joinable_property_id=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + PutEmptyRefNamespaceFingerprintIdsShouldReturnOk) { + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{}), + IsOk()); + EXPECT_THAT(index, Pointee(IsEmpty())); + + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id + 1, joinable_property_id), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id + 1), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + PutAndGetSingleSchemaTypeAndJoinableProperty) { + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/3, /*fingerprint=*/12); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/56); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/78); + + { + // Create new 
qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id2, id1}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/6, + /*ref_namespace_fingerprint_ids=*/{id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(4))); + + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id4), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/6, /*join_info=*/id3), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id1), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id2)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id + 1, joinable_property_id), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id + 1), + IsOkAndHolds(IsEmpty())); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + // Verify we can get all of them after destructing and re-initializing. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id4), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/6, /*join_info=*/id3), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id1), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id2)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id + 1, joinable_property_id), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id + 1), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, + PutAndGetMultipleSchemaTypesAndJoinableProperties) { + SchemaTypeId schema_type_id1 = 2; + SchemaTypeId schema_type_id2 = 4; + + JoinablePropertyId joinable_property_id1 = 1; + JoinablePropertyId joinable_property_id2 = 10; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/3, /*fingerprint=*/12); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/56); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/78); + + { + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + EXPECT_THAT( + index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id1}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id1, joinable_property_id2, /*document_id=*/5, + 
/*ref_namespace_fingerprint_ids=*/{id2}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id2, joinable_property_id1, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id2, joinable_property_id2, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(4))); + + EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id1), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id1)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id2), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id2)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id1), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id3)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id2), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id4)))); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + // Verify we can get all of them after destructing and re-initializing. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id1), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id1)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id2), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/5, /*join_info=*/id2)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id1), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id3)))); + EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id2), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/12, /*join_info=*/id4)))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, SetLastAddedDocumentId) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + constexpr DocumentId kDocumentId = 100; + index->set_last_added_document_id(kDocumentId); + EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId)); + + constexpr DocumentId kNextDocumentId = 123; + index->set_last_added_document_id(kNextDocumentId); + EXPECT_THAT(index->last_added_document_id(), Eq(kNextDocumentId)); +} + +TEST_F( + QualifiedIdJoinIndexImplV2Test, + SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + 
constexpr DocumentId kDocumentId = 123; + index->set_last_added_document_id(kDocumentId); + ASSERT_THAT(index->last_added_document_id(), Eq(kDocumentId)); + + constexpr DocumentId kNextDocumentId = 100; + ASSERT_THAT(kNextDocumentId, Lt(kDocumentId)); + index->set_last_added_document_id(kNextDocumentId); + // last_added_document_id() should remain unchanged. + EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId)); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, Optimize) { + // General test for Optimize(). + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id1 = 2; + SchemaTypeId schema_type_id2 = 5; + + JoinablePropertyId joinable_property_id1 = 11; + JoinablePropertyId joinable_property_id2 = 15; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/2, /*fingerprint=*/101); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/3, /*fingerprint=*/102); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/4, /*fingerprint=*/103); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/104); + NamespaceFingerprintIdentifier id5(/*namespace_id=*/0, /*fingerprint=*/105); + NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106); + NamespaceFingerprintIdentifier id7(/*namespace_id=*/3, /*fingerprint=*/107); + NamespaceFingerprintIdentifier id8(/*namespace_id=*/2, /*fingerprint=*/108); + + EXPECT_THAT( + index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/3, + /*ref_namespace_fingerprint_ids=*/{id1, id2, id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id2, joinable_property_id2, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id2, joinable_property_id2, /*document_id=*/8, + /*ref_namespace_fingerprint_ids=*/{id5, id6}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id1, 
joinable_property_id1, /*document_id=*/13, + /*ref_namespace_fingerprint_ids=*/{id7}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/21, + /*ref_namespace_fingerprint_ids=*/{id8}), + IsOk()); + index->set_last_added_document_id(21); + + ASSERT_THAT(index, Pointee(SizeIs(8))); + + // Delete doc id = 5, 13, compress and keep the rest. + std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); + document_id_old_to_new[3] = 0; + document_id_old_to_new[8] = 1; + document_id_old_to_new[21] = 2; + + // Delete namespace id 1, 2 (and invalidate id1, id6, id8). Reorder namespace + // ids [0, 3, 4] to [1, 2, 0]. + std::vector<NamespaceId> namespace_id_old_to_new(5, kInvalidNamespaceId); + namespace_id_old_to_new[0] = 1; + namespace_id_old_to_new[3] = 2; + namespace_id_old_to_new[4] = 0; + + DocumentId new_last_added_document_id = 2; + EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(3))); + EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); + + // Verify GetIterator API should work normally after Optimize(). + // 1) schema_type_id1, joinable_property_id1: + // - old_doc_id=21, old_ref_namespace_id=2: NOT FOUND + // - old_doc_id=13, old_ref_namespace_id=3: NOT FOUND + // - old_doc_id=3, old_ref_namespace_id=4: + // become new_doc_id=0, new_ref_namespace_id=0 + // - old_doc_id=3, old_ref_namespace_id=3: + // become new_doc_id=0, new_ref_namespace_id=2 + // - old_doc_id=3, old_ref_namespace_id=2: NOT FOUND + // + // For new_doc_id=0, it should reorder due to posting list restriction. 
+ EXPECT_THAT( + GetJoinData(*index, schema_type_id1, joinable_property_id1), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/2, /*fingerprint=*/102)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/103))))); + + // 2) schema_type_id2, joinable_property_id2: + // - old_doc_id=8, old_ref_namespace_id=1: NOT FOUND + // - old_doc_id=8, old_ref_namespace_id=0: + // become new_doc_id=1, new_ref_namespace_id=1 + // - old_doc_id=5, old_ref_namespace_id=0: NOT FOUND + EXPECT_THAT( + GetJoinData(*index, schema_type_id2, joinable_property_id2), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/105))))); + + // Verify Put API should work normally after Optimize(). 
+ NamespaceFingerprintIdentifier id9(/*namespace_id=*/1, /*fingerprint=*/109); + EXPECT_THAT( + index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/99, + /*ref_namespace_fingerprint_ids=*/{id9}), + IsOk()); + index->set_last_added_document_id(99); + + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(index->last_added_document_id(), Eq(99)); + EXPECT_THAT( + GetJoinData(*index, schema_type_id1, joinable_property_id1), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/99, /*join_info=*/id9), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/2, /*fingerprint=*/102)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/103))))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeDocumentIdChange) { + // Specific test for Optimize(): document id compaction. 
+ + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/101); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/103); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/104); + NamespaceFingerprintIdentifier id5(/*namespace_id=*/1, /*fingerprint=*/105); + NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/3, + /*ref_namespace_fingerprint_ids=*/{id1, id2}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/8, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/13, + /*ref_namespace_fingerprint_ids=*/{id5}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/21, + /*ref_namespace_fingerprint_ids=*/{id6}), + IsOk()); + index->set_last_added_document_id(21); + + ASSERT_THAT(index, Pointee(SizeIs(6))); + + // Delete doc id = 5, 8, compress and keep the rest. + std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); + document_id_old_to_new[3] = 0; + document_id_old_to_new[13] = 1; + document_id_old_to_new[21] = 2; + + // No change for namespace id. 
+ std::vector<NamespaceId> namespace_id_old_to_new = {0, 1}; + + DocumentId new_last_added_document_id = 2; + EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); + + // Verify GetIterator API should work normally after Optimize(). + // - old_doc_id=21, join_info=id6: become doc_id=2, join_info=id6 + // - old_doc_id=13, join_info=id5: become doc_id=1, join_info=id5 + // - old_doc_id=8, join_info=id4: NOT FOUND + // - old_doc_id=5, join_info=id3: NOT FOUND + // - old_doc_id=3, join_info=id2: become doc_id=0, join_info=id2 + // - old_doc_id=3, join_info=id1: become doc_id=0, join_info=id1 + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, /*join_info=*/id6), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, /*join_info=*/id5), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/id2), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/id1)))); + + // Verify Put API should work normally after Optimize(). 
+ NamespaceFingerprintIdentifier id7(/*namespace_id=*/1, /*fingerprint=*/107); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/99, + /*ref_namespace_fingerprint_ids=*/{id7}), + IsOk()); + index->set_last_added_document_id(99); + + EXPECT_THAT(index, Pointee(SizeIs(5))); + EXPECT_THAT(index->last_added_document_id(), Eq(99)); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/99, /*join_info=*/id7), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, /*join_info=*/id6), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, /*join_info=*/id5), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/id2), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/id1)))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeOutOfRangeDocumentId) { + // Specific test for Optimize() for out of range document id. + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + NamespaceFingerprintIdentifier id(/*namespace_id=*/1, /*fingerprint=*/101); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/99, + /*ref_namespace_fingerprint_ids=*/{id}), + IsOk()); + index->set_last_added_document_id(99); + + // Create document_id_old_to_new with size = 1. Optimize should handle out of + // range DocumentId properly. + std::vector<DocumentId> document_id_old_to_new = {kInvalidDocumentId}; + std::vector<NamespaceId> namespace_id_old_to_new = {0, 1}; + + // There shouldn't be any error due to vector index. 
+ EXPECT_THAT( + index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + /*new_last_added_document_id=*/kInvalidDocumentId), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + // Verify all data are discarded after Optimize(). + EXPECT_THAT(index, Pointee(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeDeleteAllDocuments) { + // Specific test for Optimize(): delete all document ids. + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/101); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/103); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/104); + NamespaceFingerprintIdentifier id5(/*namespace_id=*/1, /*fingerprint=*/105); + NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/3, + /*ref_namespace_fingerprint_ids=*/{id1, id2}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/8, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/13, + /*ref_namespace_fingerprint_ids=*/{id5}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/21, + /*ref_namespace_fingerprint_ids=*/{id6}), + 
IsOk()); + index->set_last_added_document_id(21); + + ASSERT_THAT(index, Pointee(SizeIs(6))); + + // Delete all documents. + std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); + + // No change for namespace id. + std::vector<NamespaceId> namespace_id_old_to_new = {0, 1}; + + EXPECT_THAT( + index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + /*new_last_added_document_id=*/kInvalidDocumentId), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + // Verify all data are discarded after Optimize(). + EXPECT_THAT(index, Pointee(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeNamespaceIdChange) { + // Specific test for Optimize(): referenced namespace id compaction. + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/3, /*fingerprint=*/101); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/5, /*fingerprint=*/102); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/4, /*fingerprint=*/103); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/104); + NamespaceFingerprintIdentifier id5(/*namespace_id=*/2, /*fingerprint=*/105); + NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/2, + /*ref_namespace_fingerprint_ids=*/{id1}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/3, + /*ref_namespace_fingerprint_ids=*/{id2}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/5, + 
/*ref_namespace_fingerprint_ids=*/{id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/8, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/13, + /*ref_namespace_fingerprint_ids=*/{id5}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/21, + /*ref_namespace_fingerprint_ids=*/{id6}), + IsOk()); + index->set_last_added_document_id(21); + + ASSERT_THAT(index, Pointee(SizeIs(6))); + + // No change for document id. + std::vector<DocumentId> document_id_old_to_new(22); + std::iota(document_id_old_to_new.begin(), document_id_old_to_new.end(), 0); + + // Delete namespace id 2, 4. Reorder namespace id [0, 1, 3, 5] to [2, 3, 1, + // 0]. + std::vector<NamespaceId> namespace_id_old_to_new(6, kInvalidNamespaceId); + namespace_id_old_to_new[0] = 2; + namespace_id_old_to_new[1] = 3; + namespace_id_old_to_new[3] = 1; + namespace_id_old_to_new[5] = 0; + + DocumentId new_last_added_document_id = 21; + EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); + + // Verify GetIterator API should work normally after Optimize(). 
+ // - id6 (old_namespace_id=1): new_namespace_id=3 (document_id = 21) + // - id5 (old_namespace_id=2): NOT FOUND + // - id4 (old_namespace_id=0): new_namespace_id=2 (document_id = 8) + // - id3 (old_namespace_id=4): NOT FOUND + // - id2 (old_namespace_id=5): new_namespace_id=0 (document_id = 3) + // - id1 (old_namespace_id=3): new_namespace_id=1 (document_id = 2) + EXPECT_THAT( + GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/21, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/3, /*fingerprint=*/106)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/8, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/2, /*fingerprint=*/104)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/3, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/102)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/101))))); + + // Verify Put API should work normally after Optimize(). 
+ NamespaceFingerprintIdentifier id7(/*namespace_id=*/1, /*fingerprint=*/107); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/99, + /*ref_namespace_fingerprint_ids=*/{id7}), + IsOk()); + index->set_last_added_document_id(99); + + EXPECT_THAT(index, Pointee(SizeIs(5))); + EXPECT_THAT(index->last_added_document_id(), Eq(99)); + EXPECT_THAT( + GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/99, /*join_info=*/id7), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/21, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/3, /*fingerprint=*/106)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/8, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/2, /*fingerprint=*/104)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/3, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/102)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/2, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/101))))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeNamespaceIdChangeShouldReorder) { + // Specific test for Optimize(): referenced namespace id reorder. 
+ + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/0, /*fingerprint=*/101); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/103); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/104); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/0, + /*ref_namespace_fingerprint_ids=*/{id1, id2, id3}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/1, + /*ref_namespace_fingerprint_ids=*/{id4}), + IsOk()); + index->set_last_added_document_id(1); + + ASSERT_THAT(index, Pointee(SizeIs(4))); + + // No change for document id. + std::vector<DocumentId> document_id_old_to_new = {0, 1}; + + // Reorder namespace id [0, 1, 2] to [2, 0, 1]. + std::vector<NamespaceId> namespace_id_old_to_new = {2, 0, 1}; + + DocumentId new_last_added_document_id = 1; + EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); + + // Verify GetIterator API should work normally after Optimize(). + // - id4 (old_namespace_id=1): new_namespace_id=0 (document_id = 1) + // - id3 (old_namespace_id=2): new_namespace_id=1 (document_id = 0) + // - id2 (old_namespace_id=1): new_namespace_id=0 (document_id = 0) + // - id1 (old_namespace_id=0): new_namespace_id=2 (document_id = 0) + // + // Should reorder to [id4, id1, id3, id2] due to posting list restriction. 
+ EXPECT_THAT( + GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(ElementsAre( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/1, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/104)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/2, /*fingerprint=*/101)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/1, /*fingerprint=*/103)), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier( + /*namespace_id=*/0, /*fingerprint=*/102))))); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeOutOfRangeNamespaceId) { + // Specific test for Optimize(): out of range referenced namespace id. + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + NamespaceFingerprintIdentifier id(/*namespace_id=*/99, /*fingerprint=*/101); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/0, + /*ref_namespace_fingerprint_ids=*/{id}), + IsOk()); + index->set_last_added_document_id(0); + + // Create namespace_id_old_to_new with size = 1. Optimize should handle out of + // range NamespaceId properly. + std::vector<DocumentId> document_id_old_to_new = {0}; + std::vector<NamespaceId> namespace_id_old_to_new = {kInvalidNamespaceId}; + + // There shouldn't be any error due to vector index. 
+ EXPECT_THAT( + index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + /*new_last_added_document_id=*/kInvalidDocumentId), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + // Verify all data are discarded after Optimize(). + EXPECT_THAT(index, Pointee(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeDeleteAllNamespaces) { + // Specific test for Optimize(): delete all referenced namespace ids. + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + + SchemaTypeId schema_type_id = 2; + JoinablePropertyId joinable_property_id = 1; + + NamespaceFingerprintIdentifier id1(/*namespace_id=*/0, /*fingerprint=*/101); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/103); + + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/0, + /*ref_namespace_fingerprint_ids=*/{id1}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/1, + /*ref_namespace_fingerprint_ids=*/{id2}), + IsOk()); + EXPECT_THAT( + index->Put(schema_type_id, joinable_property_id, /*document_id=*/2, + /*ref_namespace_fingerprint_ids=*/{id3}), + IsOk()); + index->set_last_added_document_id(3); + + ASSERT_THAT(index, Pointee(SizeIs(3))); + + // No change for document id. + std::vector<DocumentId> document_id_old_to_new = {0, 1, 2}; + + // Delete all namespaces. 
+ std::vector<NamespaceId> namespace_id_old_to_new(3, kInvalidNamespaceId); + + EXPECT_THAT( + index->Optimize(document_id_old_to_new, namespace_id_old_to_new, + /*new_last_added_document_id=*/kInvalidDocumentId), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + // Verify all data are discarded after Optimize(). + EXPECT_THAT(index, Pointee(IsEmpty())); + EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexImplV2Test, Clear) { + NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12); + NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34); + NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56); + NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78); + + // Create new qualified id join index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> index, + QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + // Insert some data. + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5, + /*ref_namespace_fingerprint_ids=*/{id2, id1})); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6, + /*ref_namespace_fingerprint_ids=*/{id3})); + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12, + /*ref_namespace_fingerprint_ids=*/{id4})); + ASSERT_THAT(index, Pointee(SizeIs(4))); + index->set_last_added_document_id(12); + ASSERT_THAT(index->last_added_document_id(), Eq(12)); + + // After Clear(), last_added_document_id should be set to kInvalidDocumentId, + // and the previous added data should be deleted. 
+ EXPECT_THAT(index->Clear(), IsOk()); + EXPECT_THAT(index, Pointee(IsEmpty())); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + EXPECT_THAT( + GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT( + GetJoinData(*index, /*schema_type_id=*/3, /*joinable_property_id=*/10), + IsOkAndHolds(IsEmpty())); + + // Join index should be able to work normally after Clear(). + ICING_ASSERT_OK(index->Put( + /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/20, + /*ref_namespace_fingerprint_ids=*/{id4, id2, id1, id3})); + index->set_last_added_document_id(20); + + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(index->last_added_document_id(), Eq(20)); + EXPECT_THAT( + GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id4), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id3), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id2), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id1)))); + + ICING_ASSERT_OK(index->PersistToDisk()); + index.reset(); + + // Verify index after reconstructing. 
+ ICING_ASSERT_OK_AND_ASSIGN( + index, QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_, + /*pre_mapping_fbv=*/false)); + EXPECT_THAT(index->last_added_document_id(), Eq(20)); + EXPECT_THAT( + GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id4), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id3), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id2), + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/20, /*join_info=*/id1)))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/join/qualified-id-join-index.h b/icing/join/qualified-id-join-index.h new file mode 100644 index 0000000..4e487f9 --- /dev/null +++ b/icing/join/qualified-id-join-index.h @@ -0,0 +1,187 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_H_ +#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/filesystem.h" +#include "icing/file/persistent-storage.h" +#include "icing/join/doc-join-info.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" +#include "icing/store/document-id.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" +#include "icing/util/crc32.h" + +namespace icing { +namespace lib { + +// QualifiedIdJoinIndex: an abstract class to maintain data for qualified id +// joining. +class QualifiedIdJoinIndex : public PersistentStorage { + public: + class JoinDataIteratorBase { + public: + virtual ~JoinDataIteratorBase() = default; + + virtual libtextclassifier3::Status Advance() = 0; + + virtual const DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>& + GetCurrent() const = 0; + }; + + static constexpr WorkingPathType kWorkingPathType = + WorkingPathType::kDirectory; + + // Deletes QualifiedIdJoinIndex under working_path. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + static libtextclassifier3::Status Discard(const Filesystem& filesystem, + const std::string& working_path) { + return PersistentStorage::Discard(filesystem, working_path, + kWorkingPathType); + } + + virtual ~QualifiedIdJoinIndex() override = default; + + // (v1 only) Puts a new data into index: DocJoinInfo (DocumentId, + // JoinablePropertyId) references to ref_qualified_id_str (the identifier of + // another document). + // + // REQUIRES: ref_qualified_id_str contains no '\0'. 
+ // + // Returns: + // - OK on success + // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid + // - Any KeyMapper errors + virtual libtextclassifier3::Status Put( + const DocJoinInfo& doc_join_info, + std::string_view ref_qualified_id_str) = 0; + + // (v2 only) Puts a list of referenced NamespaceFingerprintIdentifier into + // index, given the DocumentId, SchemaTypeId and JoinablePropertyId. + // + // Returns: + // - OK on success + // - INVALID_ARGUMENT_ERROR if schema_type_id, joinable_property_id, or + // document_id is invalid + // - Any KeyMapper/FlashIndexStorage errors + virtual libtextclassifier3::Status Put( + SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id, + DocumentId document_id, + std::vector<NamespaceFingerprintIdentifier>&& + ref_namespace_fingerprint_ids) = 0; + + // (v1 only) Gets the referenced document's qualified id string by + // DocJoinInfo. + // + // Returns: + // - A qualified id string referenced by the given DocJoinInfo (DocumentId, + // JoinablePropertyId) on success + // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid + // - NOT_FOUND_ERROR if doc_join_info doesn't exist + // - Any KeyMapper errors + virtual libtextclassifier3::StatusOr<std::string_view> Get( + const DocJoinInfo& doc_join_info) const = 0; + + // (v2 only) Returns a JoinDataIterator for iterating through all join data of + // the specified (schema_type_id, joinable_property_id). + // + // Returns: + // - On success: a JoinDataIterator + // - INVALID_ARGUMENT_ERROR if schema_type_id or joinable_property_id is + // invalid + // - Any KeyMapper/FlashIndexStorage errors + virtual libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> + GetIterator(SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) const = 0; + + // Reduces internal file sizes by reclaiming space and ids of deleted + // documents. Qualified id type joinable index will convert all entries to the + // new document ids. 
+ // + // - document_id_old_to_new: a map for converting old document id to new + // document id. + // - namespace_id_old_to_new: a map for converting old namespace id to new + // namespace id. + // - new_last_added_document_id: will be used to update the last added + // document id in the qualified id type joinable + // index. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error. This could potentially leave the index in + // an invalid state and the caller should handle it properly (e.g. discard + // and rebuild) + virtual libtextclassifier3::Status Optimize( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + DocumentId new_last_added_document_id) = 0; + + // Clears all data and set last_added_document_id to kInvalidDocumentId. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + virtual libtextclassifier3::Status Clear() = 0; + + virtual bool is_v2() const = 0; + + virtual int32_t size() const = 0; + + virtual bool empty() const = 0; + + virtual DocumentId last_added_document_id() const = 0; + + virtual void set_last_added_document_id(DocumentId document_id) = 0; + + protected: + explicit QualifiedIdJoinIndex(const Filesystem& filesystem, + std::string&& working_path) + : PersistentStorage(filesystem, std::move(working_path), + kWorkingPathType) {} + + virtual libtextclassifier3::Status PersistStoragesToDisk( + bool force) override = 0; + + virtual libtextclassifier3::Status PersistMetadataToDisk( + bool force) override = 0; + + virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum( + bool force) override = 0; + + virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override = 0; + + virtual Crcs& crcs() override = 0; + virtual const Crcs& crcs() const override = 0; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_H_ diff --git 
a/icing/join/qualified-id-join-indexing-handler-v1_test.cc b/icing/join/qualified-id-join-indexing-handler-v1_test.cc new file mode 100644 index 0000000..9700132 --- /dev/null +++ b/icing/join/qualified-id-join-indexing-handler-v1_test.cc @@ -0,0 +1,558 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> +#include <string> +#include <string_view> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/join/qualified-id-join-index-impl-v1.h" +#include "icing/join/qualified-id-join-index.h" +#include "icing/join/qualified-id-join-indexing-handler.h" +#include "icing/join/qualified-id.h" +#include "icing/portable/platform.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/joinable-property.h" +#include "icing/schema/schema-store.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/test-data.h" +#include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include 
"icing/tokenization/language-segmenter.h" +#include "icing/util/tokenized-document.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::IsTrue; + +// Schema type for referenced documents: ReferencedType +static constexpr std::string_view kReferencedType = "ReferencedType"; +static constexpr std::string_view kPropertyName = "name"; + +// Joinable properties and joinable property id. Joinable property id is +// determined by the lexicographical order of joinable property path. +// Schema type with joinable property: FakeType +static constexpr std::string_view kFakeType = "FakeType"; +static constexpr std::string_view kPropertyQualifiedId = "qualifiedId"; + +static constexpr JoinablePropertyId kQualifiedIdJoinablePropertyId = 0; + +// Schema type with nested joinable properties: NestedType +static constexpr std::string_view kNestedType = "NestedType"; +static constexpr std::string_view kPropertyNestedDoc = "nested"; +static constexpr std::string_view kPropertyQualifiedId2 = "qualifiedId2"; + +static constexpr JoinablePropertyId kNestedQualifiedIdJoinablePropertyId = 0; +static constexpr JoinablePropertyId kQualifiedId2JoinablePropertyId = 1; + +static constexpr DocumentId kDefaultDocumentId = 3; + +// TODO(b/275121148): remove this test after deprecating +// QualifiedIdJoinIndexImplV1. +class QualifiedIdJoinIndexingHandlerV1Test : public ::testing::Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. 
+ icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + base_dir_ = GetTestTempDir() + "/icing_test"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index"; + schema_store_dir_ = base_dir_ + "/schema_store"; + doc_store_dir_ = base_dir_ + "/doc_store"; + + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdJoinIndexImplV1::Create( + filesystem_, qualified_id_join_index_dir_, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false)); + + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); + + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kReferencedType) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType(kFakeType).AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyQualifiedId) + .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kNestedType) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyNestedDoc) + .SetDataTypeDocument( + kFakeType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyQualifiedId2) + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + 
ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + doc_store_ = std::move(create_result.document_store); + } + + void TearDown() override { + doc_store_.reset(); + schema_store_.reset(); + lang_segmenter_.reset(); + qualified_id_join_index_.reset(); + + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + FakeClock fake_clock_; + std::string base_dir_; + std::string qualified_id_join_index_dir_; + std::string schema_store_dir_; + std::string doc_store_dir_; + + std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_; + std::unique_ptr<LanguageSegmenter> lang_segmenter_; + std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> doc_store_; +}; + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, + CreationWithNullPointerShouldFail) { + EXPECT_THAT( + QualifiedIdJoinIndexingHandler::Create( + /*clock=*/nullptr, doc_store_.get(), qualified_id_join_index_.get()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + + EXPECT_THAT( + QualifiedIdJoinIndexingHandler::Create( + &fake_clock_, /*doc_store=*/nullptr, qualified_id_join_index_.get()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + + EXPECT_THAT( + QualifiedIdJoinIndexingHandler::Create( + &fake_clock_, doc_store_.get(), /*qualified_id_join_index=*/nullptr), + 
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, HandleJoinableProperty) { + DocumentProto referenced_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kInvalidDocumentId)); + // Handle document. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + IsOk()); + + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/1")); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, HandleNestedJoinableProperty) { + DocumentProto referenced_document1 = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + DocumentProto referenced_document2 = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/2") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "two") + .Build(); + + DocumentProto nested_document = + DocumentBuilder() + 
.SetKey("pkg$db/ns", "nested_type/1") + .SetSchema(std::string(kNestedType)) + .AddDocumentProperty( + std::string(kPropertyNestedDoc), + DocumentBuilder() + .SetKey("pkg$db/ns", "nested_fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/2") + .Build()) + .AddStringProperty(std::string(kPropertyQualifiedId2), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + nested_document)); + + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kInvalidDocumentId)); + // Handle nested_document. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + EXPECT_THAT(handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kNestedQualifiedIdJoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/2")); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedId2JoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/1")); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, + HandleShouldSkipInvalidFormatQualifiedId) { + static constexpr std::string_view kInvalidFormatQualifiedId = + "invalid_format_qualified_id"; + ASSERT_THAT(QualifiedId::Parse(kInvalidFormatQualifiedId), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + 
std::string(kInvalidFormatQualifiedId)) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kInvalidDocumentId)); + // Handle document. Should ignore invalid format qualified id. + // Index data should remain unchanged since there is no valid qualified id, + // but last_added_document_id should be updated. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, HandleShouldSkipEmptyQualifiedId) { + // Create a document without any qualified id. + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + ASSERT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); + + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kInvalidDocumentId)); + // Handle document. Index data should remain unchanged since there is no + // qualified id, but last_added_document_id should be updated. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, + HandleInvalidDocumentIdShouldReturnInvalidArgumentError) { + DocumentProto referenced_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + + // Handling document with kInvalidDocumentId should cause a failure, and both + // index data and last_added_document_id should remain unchanged. 
+ EXPECT_THAT( + handler->Handle(tokenized_document, kInvalidDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kInvalidDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Recovery mode should get the same result. + EXPECT_THAT( + handler->Handle(tokenized_document, kInvalidDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kInvalidDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, + HandleOutOfOrderDocumentIdShouldReturnInvalidArgumentError) { + DocumentProto referenced_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + 
qualified_id_join_index_.get())); + + // Handling document with document_id < last_added_document_id should cause a + // failure, and both index data and last_added_document_id should remain + // unchanged. + ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId - 1), IsTrue()); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId - 1, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Handling document with document_id == last_added_document_id should cause a + // failure, and both index data and last_added_document_id should remain + // unchanged. + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(QualifiedIdJoinIndexingHandlerV1Test, + HandleRecoveryModeShouldIgnoreDocsLELastAddedDocId) { + DocumentProto referenced_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + 
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + + // Handle document with document_id < last_added_document_id in recovery mode. + // We should not get any error, but the handler should ignore the document, so + // both index data and last_added_document_id should remain unchanged. + ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId - 1), IsTrue()); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId - 1, + /*recovery_mode=*/true, /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Handle document with document_id == last_added_document_id in recovery + // mode. We should not get any error, but the handler should ignore the + // document, so both index data and last_added_document_id should remain + // unchanged. + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/true, /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Handle document with document_id > last_added_document_id in recovery mode. + // The handler should index this document and update last_added_document_id. 
+ ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId + 1), IsTrue()); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId + 1, + /*recovery_mode=*/true, /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId + 1)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId + 1, kQualifiedIdJoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/1")); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/join/qualified-id-join-indexing-handler.cc b/icing/join/qualified-id-join-indexing-handler.cc index 86af043..df86cba 100644 --- a/icing/join/qualified-id-join-indexing-handler.cc +++ b/icing/join/qualified-id-join-indexing-handler.cc @@ -14,19 +14,28 @@ #include "icing/join/qualified-id-join-indexing-handler.h" +#include <cstdint> +#include <limits> #include <memory> +#include <optional> #include <string_view> +#include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/join/doc-join-info.h" -#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/proto/logging.pb.h" #include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" #include "icing/util/clock.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -38,12 +47,15 @@ namespace lib { /* static */ libtextclassifier3::StatusOr< std::unique_ptr<QualifiedIdJoinIndexingHandler>> QualifiedIdJoinIndexingHandler::Create( - const Clock* clock, 
QualifiedIdTypeJoinableIndex* qualified_id_join_index) { + const Clock* clock, const DocumentStore* doc_store, + QualifiedIdJoinIndex* qualified_id_join_index) { ICING_RETURN_ERROR_IF_NULL(clock); + ICING_RETURN_ERROR_IF_NULL(doc_store); ICING_RETURN_ERROR_IF_NULL(qualified_id_join_index); return std::unique_ptr<QualifiedIdJoinIndexingHandler>( - new QualifiedIdJoinIndexingHandler(clock, qualified_id_join_index)); + new QualifiedIdJoinIndexingHandler(clock, doc_store, + qualified_id_join_index)); } libtextclassifier3::Status QualifiedIdJoinIndexingHandler::Handle( @@ -69,30 +81,89 @@ libtextclassifier3::Status QualifiedIdJoinIndexingHandler::Handle( } qualified_id_join_index_.set_last_added_document_id(document_id); - for (const JoinableProperty<std::string_view>& qualified_id_property : - tokenized_document.qualified_id_join_properties()) { - if (qualified_id_property.values.empty()) { - continue; + if (qualified_id_join_index_.is_v2()) { + // v2 + std::optional<DocumentFilterData> filter_data = + doc_store_.GetAliveDocumentFilterData( + document_id, + /*current_time_ms=*/std::numeric_limits<int64_t>::min()); + if (!filter_data) { + // This should not happen. + return absl_ports::InternalError( + "Failed to get alive document filter data when indexing"); } - DocJoinInfo info(document_id, qualified_id_property.metadata.id); - // Currently we only support single (non-repeated) joinable value under a - // property. - std::string_view ref_qualified_id_str = qualified_id_property.values[0]; - - // Attempt to parse qualified id string to make sure the format is correct. - if (!QualifiedId::Parse(ref_qualified_id_str).ok()) { - // Skip incorrect format of qualified id string to save disk space. - continue; + for (const JoinableProperty<std::string_view>& qualified_id_property : + tokenized_document.qualified_id_join_properties()) { + // Parse all qualified id strings and convert them to + // NamespaceFingerprintIdentifier. 
+ std::vector<NamespaceFingerprintIdentifier> ref_doc_ns_fingerprint_ids; + for (std::string_view ref_qualified_id_str : + qualified_id_property.values) { + // Attempt to parse qualified id string to make sure the format is + // correct. + auto ref_qualified_id_or = QualifiedId::Parse(ref_qualified_id_str); + if (!ref_qualified_id_or.ok()) { + // Skip incorrect format of qualified id string. + continue; + } + + QualifiedId ref_qualified_id = + std::move(ref_qualified_id_or).ValueOrDie(); + auto ref_namespace_id_or = + doc_store_.GetNamespaceId(ref_qualified_id.name_space()); + if (!ref_namespace_id_or.ok()) { + // Skip invalid namespace id. + continue; + } + NamespaceId ref_namespace_id = + std::move(ref_namespace_id_or).ValueOrDie(); + + ref_doc_ns_fingerprint_ids.push_back(NamespaceFingerprintIdentifier( + ref_namespace_id, ref_qualified_id.uri())); + } + + // Batch add all join data of this (schema_type_id, joinable_property_id) + // into to the index. + libtextclassifier3::Status status = qualified_id_join_index_.Put( + filter_data->schema_type_id(), qualified_id_property.metadata.id, + document_id, std::move(ref_doc_ns_fingerprint_ids)); + if (!status.ok()) { + ICING_LOG(WARNING) + << "Failed to add data into qualified id join index v2 due to: " + << status.error_message(); + return status; + } } - - libtextclassifier3::Status status = - qualified_id_join_index_.Put(info, ref_qualified_id_str); - if (!status.ok()) { - ICING_LOG(WARNING) - << "Failed to add data into qualified id join index due to: " - << status.error_message(); - return status; + } else { + // v1 + // TODO(b/275121148): deprecate this part after rollout v2. 
+ for (const JoinableProperty<std::string_view>& qualified_id_property : + tokenized_document.qualified_id_join_properties()) { + if (qualified_id_property.values.empty()) { + continue; + } + + DocJoinInfo info(document_id, qualified_id_property.metadata.id); + // Currently we only support single (non-repeated) joinable value under a + // property. + std::string_view ref_qualified_id_str = qualified_id_property.values[0]; + + // Attempt to parse qualified id string to make sure the format is + // correct. + if (!QualifiedId::Parse(ref_qualified_id_str).ok()) { + // Skip incorrect format of qualified id string to save disk space. + continue; + } + + libtextclassifier3::Status status = + qualified_id_join_index_.Put(info, ref_qualified_id_str); + if (!status.ok()) { + ICING_LOG(WARNING) + << "Failed to add data into qualified id join index due to: " + << status.error_message(); + return status; + } } } diff --git a/icing/join/qualified-id-join-indexing-handler.h b/icing/join/qualified-id-join-indexing-handler.h index 434403e..8a11bf9 100644 --- a/icing/join/qualified-id-join-indexing-handler.h +++ b/icing/join/qualified-id-join-indexing-handler.h @@ -15,11 +15,15 @@ #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEXING_HANDLER_H_ #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEXING_HANDLER_H_ +#include <memory> + #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/index/data-indexing-handler.h" -#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/proto/logging.pb.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" #include "icing/util/clock.h" #include "icing/util/tokenized-document.h" @@ -37,13 +41,13 @@ class QualifiedIdJoinIndexingHandler : public DataIndexingHandler { // - FAILED_PRECONDITION_ERROR if any of the input pointer is null static libtextclassifier3::StatusOr< 
std::unique_ptr<QualifiedIdJoinIndexingHandler>> - Create(const Clock* clock, - QualifiedIdTypeJoinableIndex* qualified_id_join_index); + Create(const Clock* clock, const DocumentStore* doc_store, + QualifiedIdJoinIndex* qualified_id_join_index); ~QualifiedIdJoinIndexingHandler() override = default; // Handles the joinable qualified id data indexing process: add data into the - // qualified id type joinable cache. + // qualified id join index. // /// Returns: // - OK on success. @@ -51,18 +55,21 @@ class QualifiedIdJoinIndexingHandler : public DataIndexingHandler { // than or equal to the document_id of a previously indexed document in // non recovery mode. // - INTERNAL_ERROR if any other errors occur. - // - Any QualifiedIdTypeJoinableIndex errors. + // - Any QualifiedIdJoinIndex errors. libtextclassifier3::Status Handle( const TokenizedDocument& tokenized_document, DocumentId document_id, bool recovery_mode, PutDocumentStatsProto* put_document_stats) override; private: explicit QualifiedIdJoinIndexingHandler( - const Clock* clock, QualifiedIdTypeJoinableIndex* qualified_id_join_index) + const Clock* clock, const DocumentStore* doc_store, + QualifiedIdJoinIndex* qualified_id_join_index) : DataIndexingHandler(clock), + doc_store_(*doc_store), qualified_id_join_index_(*qualified_id_join_index) {} - QualifiedIdTypeJoinableIndex& qualified_id_join_index_; // Does not own. + const DocumentStore& doc_store_; // Does not own. + QualifiedIdJoinIndex& qualified_id_join_index_; // Does not own. 
}; } // namespace lib diff --git a/icing/join/qualified-id-join-indexing-handler_test.cc b/icing/join/qualified-id-join-indexing-handler_test.cc index e48dc33..53d35c7 100644 --- a/icing/join/qualified-id-join-indexing-handler_test.cc +++ b/icing/join/qualified-id-join-indexing-handler_test.cc @@ -17,13 +17,20 @@ #include <memory> #include <string> #include <string_view> +#include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/join/qualified-id-join-index-impl-v2.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -31,7 +38,11 @@ #include "icing/schema-builder.h" #include "icing/schema/joinable-property.h" #include "icing/schema/schema-store.h" +#include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" @@ -39,6 +50,7 @@ #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" #include "unicode/uloc.h" @@ -47,9 +59,11 @@ namespace lib { namespace { +using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; using ::testing::IsTrue; +using 
::testing::NotNull; // Schema type for referenced documents: ReferencedType static constexpr std::string_view kReferencedType = "ReferencedType"; @@ -61,18 +75,11 @@ static constexpr std::string_view kPropertyName = "name"; static constexpr std::string_view kFakeType = "FakeType"; static constexpr std::string_view kPropertyQualifiedId = "qualifiedId"; -static constexpr JoinablePropertyId kQualifiedIdJoinablePropertyId = 0; - // Schema type with nested joinable properties: NestedType static constexpr std::string_view kNestedType = "NestedType"; static constexpr std::string_view kPropertyNestedDoc = "nested"; static constexpr std::string_view kPropertyQualifiedId2 = "qualifiedId2"; -static constexpr JoinablePropertyId kNestedQualifiedIdJoinablePropertyId = 0; -static constexpr JoinablePropertyId kQualifiedId2JoinablePropertyId = 1; - -static constexpr DocumentId kDefaultDocumentId = 3; - class QualifiedIdJoinIndexingHandlerTest : public ::testing::Test { protected: void SetUp() override { @@ -89,12 +96,12 @@ class QualifiedIdJoinIndexingHandlerTest : public ::testing::Test { qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index"; schema_store_dir_ = base_dir_ + "/schema_store"; + doc_store_dir_ = base_dir_ + "/doc_store"; - ICING_ASSERT_OK_AND_ASSIGN( - qualified_id_join_index_, - QualifiedIdTypeJoinableIndex::Create( - filesystem_, qualified_id_join_index_dir_, - /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/false)); + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdJoinIndexImplV2::Create( + filesystem_, qualified_id_join_index_dir_, + /*pre_mapping_fbv=*/false)); language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); ICING_ASSERT_OK_AND_ASSIGN( @@ -140,9 +147,52 @@ class QualifiedIdJoinIndexingHandlerTest : public ::testing::Test { ICING_ASSERT_OK(schema_store_->SetSchema( schema, /*ignore_errors_and_delete_documents=*/false, /*allow_circular_schema_definitions=*/false)); + + 
ASSERT_THAT(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/true, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + doc_store_ = std::move(create_result.document_store); + + // Get FakeType related ids. + ICING_ASSERT_OK_AND_ASSIGN(fake_type_id_, + schema_store_->GetSchemaTypeId(kFakeType)); + ICING_ASSERT_OK_AND_ASSIGN( + const JoinablePropertyMetadata* metadata1, + schema_store_->GetJoinablePropertyMetadata( + fake_type_id_, std::string(kPropertyQualifiedId))); + ASSERT_THAT(metadata1, NotNull()); + fake_type_joinable_property_id_ = metadata1->id; + + // Get NestedType related ids. + ICING_ASSERT_OK_AND_ASSIGN(nested_type_id_, + schema_store_->GetSchemaTypeId(kNestedType)); + ICING_ASSERT_OK_AND_ASSIGN( + const JoinablePropertyMetadata* metadata2, + schema_store_->GetJoinablePropertyMetadata( + nested_type_id_, + absl_ports::StrCat(kPropertyNestedDoc, ".", kPropertyQualifiedId))); + ASSERT_THAT(metadata2, NotNull()); + nested_type_nested_joinable_property_id_ = metadata2->id; + ICING_ASSERT_OK_AND_ASSIGN( + const JoinablePropertyMetadata* metadata3, + schema_store_->GetJoinablePropertyMetadata( + nested_type_id_, std::string(kPropertyQualifiedId2))); + ASSERT_THAT(metadata3, NotNull()); + nested_type_joinable_property_id_ = metadata3->id; } void TearDown() override { + doc_store_.reset(); schema_store_.reset(); lang_segmenter_.reset(); qualified_id_join_index_.reset(); @@ -155,30 +205,77 @@ class QualifiedIdJoinIndexingHandlerTest : public ::testing::Test { std::string base_dir_; std::string qualified_id_join_index_dir_; std::string schema_store_dir_; + 
std::string doc_store_dir_; - std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; + std::unique_ptr<QualifiedIdJoinIndexImplV2> qualified_id_join_index_; std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> doc_store_; + + // FakeType related ids. + SchemaTypeId fake_type_id_; + JoinablePropertyId fake_type_joinable_property_id_; + + // NestedType related ids. + SchemaTypeId nested_type_id_; + JoinablePropertyId nested_type_nested_joinable_property_id_; + JoinablePropertyId nested_type_joinable_property_id_; }; +libtextclassifier3::StatusOr< + std::vector<QualifiedIdJoinIndexImplV2::JoinDataType>> +GetJoinData(const QualifiedIdJoinIndexImplV2& index, + SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase> iter, + index.GetIterator(schema_type_id, joinable_property_id)); + + std::vector<QualifiedIdJoinIndexImplV2::JoinDataType> result; + while (iter->Advance().ok()) { + result.push_back(iter->GetCurrent()); + } + + return result; +} + TEST_F(QualifiedIdJoinIndexingHandlerTest, CreationWithNullPointerShouldFail) { - EXPECT_THAT(QualifiedIdJoinIndexingHandler::Create( - /*clock=*/nullptr, qualified_id_join_index_.get()), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + QualifiedIdJoinIndexingHandler::Create( + /*clock=*/nullptr, doc_store_.get(), qualified_id_join_index_.get()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - EXPECT_THAT(QualifiedIdJoinIndexingHandler::Create( - &fake_clock_, /*qualified_id_join_index=*/nullptr), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + QualifiedIdJoinIndexingHandler::Create( + &fake_clock_, /*doc_store=*/nullptr, qualified_id_join_index_.get()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + + EXPECT_THAT( + 
QualifiedIdJoinIndexingHandler::Create( + &fake_clock_, doc_store_.get(), /*qualified_id_join_index=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleJoinableProperty) { + // Create and put referenced (parent) document. Get its document id and + // namespace id. DocumentProto referenced_document = DocumentBuilder() .SetKey("pkg$db/ns", "ref_type/1") .SetSchema(std::string(kReferencedType)) .AddStringProperty(std::string(kPropertyName), "one") .Build(); - + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id, + doc_store_->Put(referenced_document)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id, + doc_store_->GetNamespaceId(referenced_document.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id( + /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id), + IsOkAndHolds(ref_doc_id)); + + // Create and put (child) document. Also tokenize it. DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -186,44 +283,81 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleJoinableProperty) { .AddStringProperty(std::string(kPropertyQualifiedId), "pkg$db/ns#ref_type/1") .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); + std::move(document))); + // Handle document. ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(kInvalidDocumentId)); - // Handle document. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId, - /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), IsOk()); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - IsOkAndHolds("pkg$db/ns#ref_type/1")); + // Verify the state of qualified_id_join_index_ after Handle(). + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain + // [(doc_id, ref_doc_ns_fingerprint_id)]. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/doc_id, + /*join_info=*/ref_doc_ns_fingerprint_id)))); } TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleNestedJoinableProperty) { + // Create and put referenced (parent) document1. Get its document id and + // namespace id. 
DocumentProto referenced_document1 = DocumentBuilder() .SetKey("pkg$db/ns", "ref_type/1") .SetSchema(std::string(kReferencedType)) .AddStringProperty(std::string(kPropertyName), "one") .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id1, + doc_store_->Put(referenced_document1)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id1, + doc_store_->GetNamespaceId(referenced_document1.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id1( + /*namespace_id=*/ref_doc_ns_id1, + /*target_str=*/referenced_document1.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id1), + IsOkAndHolds(ref_doc_id1)); + + // Create and put referenced (parent) document2. Get its document id and + // namespace id. DocumentProto referenced_document2 = DocumentBuilder() .SetKey("pkg$db/ns", "ref_type/2") .SetSchema(std::string(kReferencedType)) .AddStringProperty(std::string(kPropertyName), "two") .Build(); - + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id2, + doc_store_->Put(referenced_document2)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id2, + doc_store_->GetNamespaceId(referenced_document2.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id2( + /*namespace_id=*/ref_doc_ns_id2, + /*target_str=*/referenced_document2.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id2), + IsOkAndHolds(ref_doc_id2)); + + // Create and put (child) document: + // - kPropertyNestedDoc.kPropertyQualifiedId refers to referenced_document2. + // - kPropertyQualifiedId2 refers to referenced_document1. + // + // Also tokenize it. 
DocumentProto nested_document = DocumentBuilder() .SetKey("pkg$db/ns", "nested_type/1") @@ -239,31 +373,51 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleNestedJoinableProperty) { .AddStringProperty(std::string(kPropertyQualifiedId2), "pkg$db/ns#ref_type/1") .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, + doc_store_->Put(nested_document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), nested_document)); + // Handle nested_document. ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(kInvalidDocumentId)); - // Handle nested_document. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); - EXPECT_THAT(handler->Handle(tokenized_document, kDefaultDocumentId, - /*recovery_mode=*/false, - /*put_document_stats=*/nullptr), - IsOk()); + EXPECT_THAT( + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kNestedQualifiedIdJoinablePropertyId)), - IsOkAndHolds("pkg$db/ns#ref_type/2")); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedId2JoinablePropertyId)), - IsOkAndHolds("pkg$db/ns#ref_type/1")); + // Verify the state of qualified_id_join_index_ after Handle(). + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. 
+ EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); + // (kNestedType, kPropertyNestedDoc.kPropertyQualifiedId) should contain + // [(doc_id, ref_doc_ns_fingerprint_id2)]. + EXPECT_THAT( + GetJoinData( + *qualified_id_join_index_, /*schema_type_id=*/nested_type_id_, + /*joinable_property_id=*/nested_type_nested_joinable_property_id_), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/doc_id, + /*join_info=*/ref_doc_ns_fingerprint_id2)))); + // (kNestedType, kPropertyQualifiedId2) should contain + // [(doc_id, ref_doc_ns_fingerprint_id1)]. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/nested_type_id_, + /*joinable_property_id=*/nested_type_joinable_property_id_), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/doc_id, + /*join_info=*/ref_doc_ns_fingerprint_id1)))); } TEST_F(QualifiedIdJoinIndexingHandlerTest, @@ -273,6 +427,8 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, ASSERT_THAT(QualifiedId::Parse(kInvalidFormatQualifiedId), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + // Create and put (child) document with an invalid format referenced qualified + // id. Also tokenize it. DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -280,71 +436,133 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, .AddStringProperty(std::string(kPropertyQualifiedId), std::string(kInvalidFormatQualifiedId)) .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); + // Handle document. Should ignore invalid format qualified id. 
ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(kInvalidDocumentId)); - // Handle document. Should ignore invalid format qualified id. - // Index data should remain unchanged since there is no valid qualified id, - // but last_added_document_id should be updated. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId, - /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), IsOk()); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Verify the state of qualified_id_join_index_ after Handle(). Index data + // should remain unchanged since there is no valid qualified id, but + // last_added_document_id should be updated. + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(QualifiedIdJoinIndexingHandlerTest, + HandleShouldSkipNonExistingNamespace) { + static constexpr std::string_view kUnknownNamespace = "UnknownNamespace"; + // Create and put (child) document which references to a parent qualified id + // with an unknown namespace. Also tokenize it. 
+ DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty( + std::string(kPropertyQualifiedId), + absl_ports::StrCat(kUnknownNamespace, "#", "ref_type/1")) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document))); + + // Handle document. + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kInvalidDocumentId)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + + // Verify the state of qualified_id_join_index_ after Handle(). + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should be empty since + // "UnknownNamespace#ref_type/1" should be skipped. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); } TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleShouldSkipEmptyQualifiedId) { - // Create a document without any qualified id. + // Create and put (child) document without any qualified id. Also tokenize it. 
DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); ASSERT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); + // Handle document. ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(kInvalidDocumentId)); - // Handle document. Index data should remain unchanged since there is no - // qualified id, but last_added_document_id should be updated. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId, - /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), IsOk()); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Verify the state of qualified_id_join_index_ after Handle(). Index data + // should remain unchanged since there is no qualified id, but + // last_added_document_id should be updated. + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. 
+ EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); } TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleInvalidDocumentIdShouldReturnInvalidArgumentError) { + // Create and put referenced (parent) document. Get its document id and + // namespace id. DocumentProto referenced_document = DocumentBuilder() .SetKey("pkg$db/ns", "ref_type/1") .SetSchema(std::string(kReferencedType)) .AddStringProperty(std::string(kPropertyName), "one") .Build(); - + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id, + doc_store_->Put(referenced_document)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id, + doc_store_->GetNamespaceId(referenced_document.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id( + /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id), + IsOkAndHolds(ref_doc_id)); + + // Create and put (child) document. Also tokenize it. 
DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -352,31 +570,35 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, .AddStringProperty(std::string(kPropertyQualifiedId), "pkg$db/ns#ref_type/1") .Build(); + ICING_ASSERT_OK(doc_store_->Put(document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); + std::move(document))); - qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId); + qualified_id_join_index_->set_last_added_document_id(ref_doc_id); ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); + Eq(ref_doc_id)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); - // Handling document with kInvalidDocumentId should cause a failure, and both - // index data and last_added_document_id should remain unchanged. + // Handling document with kInvalidDocumentId should cause a failure. EXPECT_THAT( handler->Handle(tokenized_document, kInvalidDocumentId, /*recovery_mode=*/false, /*put_document_stats=*/nullptr), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + // Verify the state of qualified_id_join_index_ after Handle(). Both index + // data and last_added_document_id should remain unchanged. EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kInvalidDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + Eq(ref_doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. 
+ EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); // Recovery mode should get the same result. EXPECT_THAT( @@ -384,21 +606,35 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, /*recovery_mode=*/false, /*put_document_stats=*/nullptr), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kInvalidDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + Eq(ref_doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); } TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleOutOfOrderDocumentIdShouldReturnInvalidArgumentError) { + // Create and put referenced (parent) document. Get its document id and + // namespace id. DocumentProto referenced_document = DocumentBuilder() .SetKey("pkg$db/ns", "ref_type/1") .SetSchema(std::string(kReferencedType)) .AddStringProperty(std::string(kPropertyName), "one") .Build(); - + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id, + doc_store_->Put(referenced_document)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id, + doc_store_->GetNamespaceId(referenced_document.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id( + /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id), + IsOkAndHolds(ref_doc_id)); + + // Create and put (child) document. Also tokenize it. 
DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -406,57 +642,75 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, .AddStringProperty(std::string(kPropertyQualifiedId), "pkg$db/ns#ref_type/1") .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); - - qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId); - ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); + std::move(document))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); - // Handling document with document_id < last_added_document_id should cause a - // failure, and both index data and last_added_document_id should remain - // unchanged. - ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId - 1), IsTrue()); + // Handling document with document_id == last_added_document_id should cause a + // failure. 
+ qualified_id_join_index_->set_last_added_document_id(doc_id); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId - 1, - /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + // Verify the state of qualified_id_join_index_ after Handle(). Both index + // data and last_added_document_id should remain unchanged. + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); - // Handling document with document_id == last_added_document_id should cause a - // failure, and both index data and last_added_document_id should remain - // unchanged. + // Handling document with document_id < last_added_document_id should cause a + // failure. + qualified_id_join_index_->set_last_added_document_id(doc_id + 1); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(doc_id + 1)); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId, - /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + // Verify the state of qualified_id_join_index_ after Handle(). 
Both index + // data and last_added_document_id should remain unchanged. EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + Eq(doc_id + 1)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); } TEST_F(QualifiedIdJoinIndexingHandlerTest, - HandleRecoveryModeShouldIgnoreDocsLELastAddedDocId) { + HandleRecoveryModeShouldIndexDocsGtLastAddedDocId) { + // Create and put referenced (parent) document. Get its document id and + // namespace id. DocumentProto referenced_document = DocumentBuilder() .SetKey("pkg$db/ns", "ref_type/1") .SetSchema(std::string(kReferencedType)) .AddStringProperty(std::string(kPropertyName), "one") .Build(); - + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id, + doc_store_->Put(referenced_document)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id, + doc_store_->GetNamespaceId(referenced_document.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id( + /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id), + IsOkAndHolds(ref_doc_id)); + + // Create and put (child) document. Also tokenize it. 
DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -464,60 +718,109 @@ TEST_F(QualifiedIdJoinIndexingHandlerTest, .AddStringProperty(std::string(kPropertyQualifiedId), "pkg$db/ns#ref_type/1") .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); - - qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId); - ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); + std::move(document))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, - QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), qualified_id_join_index_.get())); - // Handle document with document_id < last_added_document_id in recovery mode. - // We should not get any error, but the handler should ignore the document, so - // both index data and last_added_document_id should remain unchanged. - ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId - 1), IsTrue()); + // Handle document with document_id > last_added_document_id in recovery mode. + // The handler should index this document and update last_added_document_id. 
+ qualified_id_join_index_->set_last_added_document_id(doc_id - 1); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(doc_id - 1)); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId - 1, - /*recovery_mode=*/true, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/true, + /*put_document_stats=*/nullptr), IsOk()); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds( + ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + /*document_id=*/doc_id, + /*join_info=*/ref_doc_ns_fingerprint_id)))); +} + +TEST_F(QualifiedIdJoinIndexingHandlerTest, + HandleRecoveryModeShouldIgnoreDocsLeLastAddedDocId) { + // Create and put referenced (parent) document. Get its document id and + // namespace id. + DocumentProto referenced_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id, + doc_store_->Put(referenced_document)); + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId ref_doc_ns_id, + doc_store_->GetNamespaceId(referenced_document.namespace_())); + NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id( + /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri()); + ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id), + IsOkAndHolds(ref_doc_id)); + + // Create and put (child) document. Also tokenize it. 
+ DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document)); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document))); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinIndexingHandler> handler, + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(), + qualified_id_join_index_.get())); // Handle document with document_id == last_added_document_id in recovery // mode. We should not get any error, but the handler should ignore the // document, so both index data and last_added_document_id should remain // unchanged. + qualified_id_join_index_->set_last_added_document_id(doc_id); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId, - /*recovery_mode=*/true, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/true, + /*put_document_stats=*/nullptr), IsOk()); - EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); - // Handle document with document_id > last_added_document_id in recovery mode. 
- // The handler should index this document and update last_added_document_id. - ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId + 1), IsTrue()); + // Handle document with document_id < last_added_document_id in recovery mode. + // We should not get any error, but the handler should ignore the document, so + // both index data and last_added_document_id should remain unchanged. + qualified_id_join_index_->set_last_added_document_id(doc_id + 1); + ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(doc_id + 1)); EXPECT_THAT( - handler->Handle(tokenized_document, kDefaultDocumentId + 1, - /*recovery_mode=*/true, /*put_document_stats=*/nullptr), + handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/true, + /*put_document_stats=*/nullptr), IsOk()); EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), - Eq(kDefaultDocumentId + 1)); - EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( - kDefaultDocumentId + 1, kQualifiedIdJoinablePropertyId)), - IsOkAndHolds("pkg$db/ns#ref_type/1")); + Eq(doc_id + 1)); + // (kFakeType, kPropertyQualifiedId) should contain nothing. + EXPECT_THAT( + GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_, + /*joinable_property_id=*/fake_type_joinable_property_id_), + IsOkAndHolds(IsEmpty())); } } // namespace diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc index dd63784..ec7e277 100644 --- a/icing/legacy/index/icing-dynamic-trie_test.cc +++ b/icing/legacy/index/icing-dynamic-trie_test.cc @@ -716,7 +716,7 @@ TEST_F(IcingDynamicTrieTest, Properties) { static const uint32_t kOne = 1; uint32_t val_idx; - trie.Insert("abcd", &kOne, &val_idx, false); + ICING_ASSERT_OK(trie.Insert("abcd", &kOne, &val_idx, false)); trie.SetProperty(val_idx, 0); trie.SetProperty(val_idx, 3); @@ -736,7 +736,7 @@ TEST_F(IcingDynamicTrieTest, Properties) { } // Persist after sync. 
- trie.Insert("abcd", &kOne, &val_idx, false); + ICING_ASSERT_OK(trie.Insert("abcd", &kOne, &val_idx, false)); trie.SetProperty(val_idx, 1); ASSERT_TRUE(trie.Sync()); trie.Close(); @@ -770,15 +770,15 @@ TEST_F(IcingDynamicTrieTest, ClearSingleProperty) { static const uint32_t kOne = 1; uint32_t val_idx[3]; - trie.Insert("abcd", &kOne, &val_idx[0], false); + ICING_ASSERT_OK(trie.Insert("abcd", &kOne, &val_idx[0], false)); trie.SetProperty(val_idx[0], 0); trie.SetProperty(val_idx[0], 3); - trie.Insert("efgh", &kOne, &val_idx[1], false); + ICING_ASSERT_OK(trie.Insert("efgh", &kOne, &val_idx[1], false)); trie.SetProperty(val_idx[1], 0); trie.SetProperty(val_idx[1], 3); - trie.Insert("ijkl", &kOne, &val_idx[2], false); + ICING_ASSERT_OK(trie.Insert("ijkl", &kOne, &val_idx[2], false)); trie.SetProperty(val_idx[2], 0); trie.SetProperty(val_idx[2], 3); diff --git a/icing/monkey_test/icing-monkey-test-runner.cc b/icing/monkey_test/icing-monkey-test-runner.cc index 558da1c..76e41ce 100644 --- a/icing/monkey_test/icing-monkey-test-runner.cc +++ b/icing/monkey_test/icing-monkey-test-runner.cc @@ -15,16 +15,33 @@ #include "icing/monkey_test/icing-monkey-test-runner.h" #include <algorithm> +#include <array> #include <cstdint> #include <functional> +#include <memory> +#include <random> #include <string> +#include <utility> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/destructible-directory.h" +#include "icing/icing-search-engine.h" #include "icing/monkey_test/in-memory-icing-search-engine.h" #include "icing/monkey_test/monkey-test-generators.h" +#include "icing/monkey_test/monkey-test-util.h" +#include "icing/monkey_test/monkey-tokenized-document.h" #include "icing/portable/equals-proto.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/initialize.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/scoring.pb.h" +#include "icing/proto/search.pb.h" +#include 
"icing/proto/status.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/result/result-state-manager.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" #include "icing/util/logging.h" @@ -37,17 +54,10 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::Eq; using ::testing::Le; +using ::testing::Not; using ::testing::SizeIs; using ::testing::UnorderedElementsAreArray; -SchemaProto GenerateRandomSchema( - const IcingMonkeyTestRunnerConfiguration& config, - MonkeyTestRandomEngine* random) { - MonkeySchemaGenerator schema_generator(random); - return schema_generator.GenerateSchema(config.num_types, - config.possible_num_properties); -} - SearchSpecProto GenerateRandomSearchSpecProto( MonkeyTestRandomEngine* random, MonkeyDocumentGenerator* document_generator) { @@ -164,20 +174,13 @@ void SortDocuments(std::vector<DocumentProto>& documents) { } // namespace IcingMonkeyTestRunner::IcingMonkeyTestRunner( - const IcingMonkeyTestRunnerConfiguration& config) - : config_(config), random_(config.seed), in_memory_icing_() { + IcingMonkeyTestRunnerConfiguration config) + : config_(std::move(config)), + random_(config_.seed), + in_memory_icing_(std::make_unique<InMemoryIcingSearchEngine>(&random_)), + schema_generator_( + std::make_unique<MonkeySchemaGenerator>(&random_, &config_)) { ICING_LOG(INFO) << "Monkey test runner started with seed: " << config_.seed; - - SchemaProto schema = GenerateRandomSchema(config_, &random_); - ICING_LOG(DBG) << "Schema Generated: " << schema.DebugString(); - - in_memory_icing_ = - std::make_unique<InMemoryIcingSearchEngine>(&random_, std::move(schema)); - - document_generator_ = std::make_unique<MonkeyDocumentGenerator>( - &random_, in_memory_icing_->GetSchema(), config_.possible_num_tokens_, - config_.num_namespaces, config_.num_uris); - std::string dir = GetTestTempDir() + "/icing/monkey"; filesystem_.DeleteDirectoryRecursively(dir.c_str()); icing_dir_ = 
std::make_unique<DestructibleDirectory>(&filesystem_, dir); @@ -186,7 +189,7 @@ IcingMonkeyTestRunner::IcingMonkeyTestRunner( void IcingMonkeyTestRunner::Run(uint32_t num) { ASSERT_TRUE(icing_ != nullptr) << "Icing search engine has not yet been created. Please call " - "CreateIcingSearchEngineWithSchema() first"; + "Initialize() first"; uint32_t frequency_sum = 0; for (const auto& schedule : config_.monkey_api_schedules) { @@ -208,10 +211,55 @@ void IcingMonkeyTestRunner::Run(uint32_t num) { } } -void IcingMonkeyTestRunner::CreateIcingSearchEngineWithSchema() { +SetSchemaResultProto IcingMonkeyTestRunner::SetSchema(SchemaProto&& schema) { + in_memory_icing_->SetSchema(std::move(schema)); + document_generator_ = std::make_unique<MonkeyDocumentGenerator>( + &random_, in_memory_icing_->GetSchema(), &config_); + return icing_->SetSchema(*in_memory_icing_->GetSchema(), + /*ignore_errors_and_delete_documents=*/true); +} + +void IcingMonkeyTestRunner::Initialize() { ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine()); - ASSERT_THAT(icing_->SetSchema(*in_memory_icing_->GetSchema()).status(), - ProtoIsOk()); + + SchemaProto schema = schema_generator_->GenerateSchema(); + ICING_LOG(DBG) << "Schema Generated: " << schema.DebugString(); + + ASSERT_THAT(SetSchema(std::move(schema)).status(), ProtoIsOk()); +} + +void IcingMonkeyTestRunner::DoUpdateSchema() { + ICING_LOG(INFO) << "Monkey updating schema"; + + MonkeySchemaGenerator::UpdateSchemaResult result = + schema_generator_->UpdateSchema(*in_memory_icing_->GetSchema()); + if (result.is_invalid_schema) { + SetSchemaResultProto set_schema_result = + icing_->SetSchema(result.schema, + /*ignore_errors_and_delete_documents=*/true); + ASSERT_THAT(set_schema_result.status(), Not(ProtoIsOk())); + return; + } + ICING_LOG(DBG) << "Updating schema to: " << result.schema.DebugString(); + SetSchemaResultProto icing_set_schema_result = + SetSchema(std::move(result.schema)); + ASSERT_THAT(icing_set_schema_result.status(), ProtoIsOk()); + 
ASSERT_THAT(icing_set_schema_result.deleted_schema_types(), + UnorderedElementsAreArray(result.schema_types_deleted)); + ASSERT_THAT(icing_set_schema_result.incompatible_schema_types(), + UnorderedElementsAreArray(result.schema_types_incompatible)); + ASSERT_THAT( + icing_set_schema_result.index_incompatible_changed_schema_types(), + UnorderedElementsAreArray(result.schema_types_index_incompatible)); + + // Update in-memory icing + for (const std::string& deleted_type : result.schema_types_deleted) { + ICING_ASSERT_OK(in_memory_icing_->DeleteBySchemaType(deleted_type)); + } + for (const std::string& incompatible_type : + result.schema_types_incompatible) { + ICING_ASSERT_OK(in_memory_icing_->DeleteBySchemaType(incompatible_type)); + } } void IcingMonkeyTestRunner::DoGet() { @@ -266,10 +314,11 @@ void IcingMonkeyTestRunner::DoDelete() { /*p_other=*/0.1); ICING_LOG(INFO) << "Monkey deleting namespace: " << document.name_space << ", uri: " << document.uri; - in_memory_icing_->Delete(document.name_space, document.uri); DeleteResultProto delete_result = icing_->Delete(document.name_space, document.uri); if (document.document.has_value()) { + ICING_ASSERT_OK( + in_memory_icing_->Delete(document.name_space, document.uri)); ASSERT_THAT(delete_result.status(), ProtoIsOk()) << "Cannot delete an existing document."; } else { @@ -383,8 +432,8 @@ void IcingMonkeyTestRunner::DoSearch() { ICING_VLOG(1) << "scoring_spec:\n" << scoring_spec->DebugString(); ICING_VLOG(1) << "result_spec:\n" << result_spec->DebugString(); - std::vector<DocumentProto> exp_documents = - in_memory_icing_->Search(*search_spec); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentProto> exp_documents, + in_memory_icing_->Search(*search_spec)); SearchResultProto search_result = icing_->Search(*search_spec, *scoring_spec, *result_spec); diff --git a/icing/monkey_test/icing-monkey-test-runner.h b/icing/monkey_test/icing-monkey-test-runner.h index fbaaaaa..10be60c 100644 --- 
a/icing/monkey_test/icing-monkey-test-runner.h +++ b/icing/monkey_test/icing-monkey-test-runner.h @@ -16,63 +16,36 @@ #define ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_ #include <cstdint> -#include <random> +#include <memory> #include "icing/file/destructible-directory.h" +#include "icing/file/filesystem.h" #include "icing/icing-search-engine.h" #include "icing/monkey_test/in-memory-icing-search-engine.h" #include "icing/monkey_test/monkey-test-generators.h" +#include "icing/monkey_test/monkey-test-util.h" +#include "icing/proto/schema.pb.h" namespace icing { namespace lib { -class IcingMonkeyTestRunner; - -struct IcingMonkeyTestRunnerConfiguration { - explicit IcingMonkeyTestRunnerConfiguration(uint32_t seed, int num_types, - int num_namespaces, int num_uris, - int index_merge_size) - : seed(seed), - num_types(num_types), - num_namespaces(num_namespaces), - num_uris(num_uris), - index_merge_size(index_merge_size) {} - - uint32_t seed; - int num_types; - int num_namespaces; - int num_uris; - int index_merge_size; - - // The possible number of properties that may appear in generated schema - // types. - std::vector<int> possible_num_properties; - - // The possible number of tokens that may appear in generated documents, with - // a noise factor from 0.5 to 1 applied. - std::vector<int> possible_num_tokens_; - - // An array of pairs of monkey test APIs with frequencies. - // If f_sum is the sum of all the frequencies, an operation with frequency f - // means for every f_sum iterations, the operation is expected to run f times. 
- std::vector<std::pair<std::function<void(IcingMonkeyTestRunner*)>, uint32_t>> - monkey_api_schedules; -}; - class IcingMonkeyTestRunner { public: - IcingMonkeyTestRunner(const IcingMonkeyTestRunnerConfiguration& config); + IcingMonkeyTestRunner(IcingMonkeyTestRunnerConfiguration config); IcingMonkeyTestRunner(const IcingMonkeyTestRunner&) = delete; IcingMonkeyTestRunner& operator=(const IcingMonkeyTestRunner&) = delete; + SetSchemaResultProto SetSchema(SchemaProto&& schema); + // This function must and should only be called before running the monkey // test. - void CreateIcingSearchEngineWithSchema(); + void Initialize(); // Run the monkey test with num operations. void Run(uint32_t num); // APIs supported in icing search engine. + void DoUpdateSchema(); void DoGet(); void DoGetAllNamespaces(); void DoPut(); @@ -94,6 +67,7 @@ class IcingMonkeyTestRunner { std::unique_ptr<InMemoryIcingSearchEngine> in_memory_icing_; std::unique_ptr<IcingSearchEngine> icing_; + std::unique_ptr<MonkeySchemaGenerator> schema_generator_; std::unique_ptr<MonkeyDocumentGenerator> document_generator_; void CreateIcingSearchEngine(); diff --git a/icing/monkey_test/icing-search-engine_monkey_test.cc b/icing/monkey_test/icing-search-engine_monkey_test.cc index a24e57f..436e27b 100644 --- a/icing/monkey_test/icing-search-engine_monkey_test.cc +++ b/icing/monkey_test/icing-search-engine_monkey_test.cc @@ -12,9 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <cstdint> +#include <random> +#include <utility> + #include "gtest/gtest.h" #include "icing/monkey_test/icing-monkey-test-runner.h" +#include "icing/monkey_test/monkey-test-util.h" #include "icing/portable/platform.h" +#include "icing/proto/debug.pb.h" +#include "icing/schema/section.h" +#include "icing/util/logging.h" namespace icing { namespace lib { @@ -44,13 +52,14 @@ TEST(IcingSearchEngineMonkeyTest, MonkeyTest) { {&IcingMonkeyTestRunner::DoGetAllNamespaces, 50}, {&IcingMonkeyTestRunner::DoDelete, 50}, {&IcingMonkeyTestRunner::DoDeleteByNamespace, 50}, - {&IcingMonkeyTestRunner::DoDeleteBySchemaType, 50}, + {&IcingMonkeyTestRunner::DoDeleteBySchemaType, 45}, {&IcingMonkeyTestRunner::DoDeleteByQuery, 20}, {&IcingMonkeyTestRunner::DoOptimize, 5}, + {&IcingMonkeyTestRunner::DoUpdateSchema, 5}, {&IcingMonkeyTestRunner::ReloadFromDisk, 5}}; uint32_t num_iterations = IsAndroidArm() ? 1000 : 5000; - IcingMonkeyTestRunner runner(config); - ASSERT_NO_FATAL_FAILURE(runner.CreateIcingSearchEngineWithSchema()); + IcingMonkeyTestRunner runner(std::move(config)); + ASSERT_NO_FATAL_FAILURE(runner.Initialize()); ASSERT_NO_FATAL_FAILURE(runner.Run(num_iterations)); } @@ -75,8 +84,8 @@ TEST(DISABLED_IcingSearchEngineMonkeyTest, MonkeyManyDocTest) { {&IcingMonkeyTestRunner::DoGetAllNamespaces, 50}, {&IcingMonkeyTestRunner::DoOptimize, 5}, {&IcingMonkeyTestRunner::ReloadFromDisk, 5}}; - IcingMonkeyTestRunner runner(config); - ASSERT_NO_FATAL_FAILURE(runner.CreateIcingSearchEngineWithSchema()); + IcingMonkeyTestRunner runner(std::move(config)); + ASSERT_NO_FATAL_FAILURE(runner.Initialize()); // Pre-fill with 4 million documents SetLoggingLevel(LogSeverity::WARNING); for (int i = 0; i < 4000000; i++) { diff --git a/icing/monkey_test/in-memory-icing-search-engine.cc b/icing/monkey_test/in-memory-icing-search-engine.cc index 405a7b0..7baa06e 100644 --- a/icing/monkey_test/in-memory-icing-search-engine.cc +++ b/icing/monkey_test/in-memory-icing-search-engine.cc @@ -14,15 
+14,27 @@ #include "icing/monkey_test/in-memory-icing-search-engine.h" +#include <algorithm> #include <cstdint> +#include <memory> +#include <random> +#include <string> #include <string_view> #include <unordered_set> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/absl_ports/str_join.h" +#include "icing/monkey_test/monkey-tokenized-document.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/search.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/store/document-id.h" #include "icing/util/status-macros.h" namespace icing { @@ -38,9 +50,80 @@ bool IsPrefix(std::string_view s1, std::string_view s2) { return s1 == s2.substr(0, s1.length()); } -bool DoesDocumentMatchQuery(const MonkeyTokenizedDocument &document, - const std::string &query, - TermMatchType::Code term_match_type) { +} // namespace + +libtextclassifier3::StatusOr<const PropertyConfigProto *> +InMemoryIcingSearchEngine::GetPropertyConfig( + const std::string &schema_type, const std::string &property_name) const { + auto schema_iter = property_config_map_.find(schema_type); + if (schema_iter == property_config_map_.end()) { + return absl_ports::NotFoundError( + absl_ports::StrCat("Schema type: ", schema_type, " is not found.")); + } + auto property_iter = schema_iter->second.find(property_name); + if (property_iter == schema_iter->second.end()) { + return absl_ports::NotFoundError( + absl_ports::StrCat("Property: ", property_name, " is not found.")); + } + return &property_iter->second; +} + +libtextclassifier3::StatusOr<TermMatchType::Code> +InMemoryIcingSearchEngine::GetTermMatchType( + const std::string &schema_type, + const MonkeyTokenizedSection §ion) const { + bool in_indexable_properties_list = false; + bool all_indexable_from_top = true; + 
+ std::vector<std::string_view> properties_in_path = + absl_ports::StrSplit(section.path, "."); + if (properties_in_path.empty()) { + return absl_ports::InvalidArgumentError("Got empty path."); + } + std::string curr_schema_type = schema_type; + for (int i = 0; i < properties_in_path.size(); ++i) { + ICING_ASSIGN_OR_RETURN( + const PropertyConfigProto *prop, + GetPropertyConfig(curr_schema_type, + std::string(properties_in_path[i]))); + if (prop->data_type() == PropertyConfigProto::DataType::STRING) { + return prop->string_indexing_config().term_match_type(); + } + + if (prop->data_type() != PropertyConfigProto::DataType::DOCUMENT) { + return TermMatchType::Code::TermMatchType_Code_UNKNOWN; + } + + bool old_all_indexable_from_top = all_indexable_from_top; + all_indexable_from_top &= + prop->document_indexing_config().index_nested_properties(); + if (!all_indexable_from_top && !in_indexable_properties_list) { + // Only try to update in_indexable_properties_list if this is the first + // level with index_nested_properties=false. + if (old_all_indexable_from_top) { + auto &indexable_properties = + prop->document_indexing_config().indexable_nested_properties_list(); + std::string relative_path = + absl_ports::StrCatPieces(std::vector<std::string_view>( + properties_in_path.begin() + i + 1, properties_in_path.end())); + in_indexable_properties_list = + std::find(indexable_properties.begin(), indexable_properties.end(), + relative_path) != indexable_properties.end(); + } + // Check in_indexable_properties_list again. 
+ if (!in_indexable_properties_list) { + return TermMatchType::Code::TermMatchType_Code_UNKNOWN; + } + } + curr_schema_type = prop->document_indexing_config().GetTypeName(); + } + return TermMatchType::Code::TermMatchType_Code_UNKNOWN; +} + +libtextclassifier3::StatusOr<bool> +InMemoryIcingSearchEngine::DoesDocumentMatchQuery( + const MonkeyTokenizedDocument &document, const std::string &query, + TermMatchType::Code term_match_type) const { std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":"); std::string_view query_term; std::string_view section_restrict; @@ -54,8 +137,15 @@ bool DoesDocumentMatchQuery(const MonkeyTokenizedDocument &document, if (!section_restrict.empty() && section.path != section_restrict) { continue; } + ICING_ASSIGN_OR_RETURN( + TermMatchType::Code section_term_match_type, + GetTermMatchType(document.document.schema(), section)); + if (section_term_match_type == TermMatchType::UNKNOWN) { + // Skip non-indexable property. + continue; + } for (const std::string &token : section.token_sequence) { - if (section.term_match_type == TermMatchType::EXACT_ONLY || + if (section_term_match_type == TermMatchType::EXACT_ONLY || term_match_type == TermMatchType::EXACT_ONLY) { if (token == query_term) { return true; @@ -68,7 +158,18 @@ bool DoesDocumentMatchQuery(const MonkeyTokenizedDocument &document, return false; } -} // namespace +void InMemoryIcingSearchEngine::SetSchema(SchemaProto &&schema) { + schema_ = std::make_unique<SchemaProto>(std::move(schema)); + property_config_map_.clear(); + for (const SchemaTypeConfigProto &type_config : schema_->types()) { + auto &curr_property_map = property_config_map_[type_config.schema_type()]; + for (const PropertyConfigProto &property_config : + type_config.properties()) { + curr_property_map.insert( + {property_config.property_name(), property_config}); + } + } +} InMemoryIcingSearchEngine::PickDocumentResult InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all, @@ -121,7 
+222,7 @@ InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all, void InMemoryIcingSearchEngine::Put(const MonkeyTokenizedDocument &document) { // Delete the old one if existing. - Delete(document.document.namespace_(), document.document.uri()); + Delete(document.document.namespace_(), document.document.uri()).IgnoreError(); existing_doc_ids_.push_back(documents_.size()); namespace_uri_docid_map[document.document.namespace_()] [document.document.uri()] = documents_.size(); @@ -192,7 +293,8 @@ InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) { libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery( const SearchSpecProto &search_spec) { - std::vector<DocumentId> doc_ids_to_delete = InternalSearch(search_spec); + ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> doc_ids_to_delete, + InternalSearch(search_spec)); for (DocumentId doc_id : doc_ids_to_delete) { const DocumentProto &document = documents_[doc_id].document; if (!Delete(document.namespace_(), document.uri()).ok()) { @@ -204,9 +306,10 @@ libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery( return doc_ids_to_delete.size(); } -std::vector<DocumentProto> InMemoryIcingSearchEngine::Search( - const SearchSpecProto &search_spec) const { - std::vector<DocumentId> matched_doc_ids = InternalSearch(search_spec); +libtextclassifier3::StatusOr<std::vector<DocumentProto>> +InMemoryIcingSearchEngine::Search(const SearchSpecProto &search_spec) const { + ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> matched_doc_ids, + InternalSearch(search_spec)); std::vector<DocumentProto> result; result.reserve(matched_doc_ids.size()); for (DocumentId doc_id : matched_doc_ids) { @@ -229,12 +332,16 @@ libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet( " is not found by InMemoryIcingSearchEngine::InternalGet.")); } -std::vector<DocumentId> InMemoryIcingSearchEngine::InternalSearch( 
+libtextclassifier3::StatusOr<std::vector<DocumentId>> +InMemoryIcingSearchEngine::InternalSearch( const SearchSpecProto &search_spec) const { std::vector<DocumentId> matched_doc_ids; for (DocumentId doc_id : existing_doc_ids_) { - if (DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(), - search_spec.term_match_type())) { + ICING_ASSIGN_OR_RETURN( + bool match, + DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(), + search_spec.term_match_type())); + if (match) { matched_doc_ids.push_back(doc_id); } } diff --git a/icing/monkey_test/in-memory-icing-search-engine.h b/icing/monkey_test/in-memory-icing-search-engine.h index a5d8872..98e7e4c 100644 --- a/icing/monkey_test/in-memory-icing-search-engine.h +++ b/icing/monkey_test/in-memory-icing-search-engine.h @@ -16,18 +16,21 @@ #define ICING_MONKEY_TEST_IN_MEMORY_ICING_SEARCH_ENGINE_H_ #include <cstdint> +#include <memory> #include <optional> #include <string> #include <unordered_map> #include <unordered_set> #include <vector> +#include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/monkey_test/monkey-test-generators.h" +#include "icing/monkey_test/monkey-test-util.h" #include "icing/monkey_test/monkey-tokenized-document.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/search.pb.h" +#include "icing/proto/term.pb.h" #include "icing/store/document-id.h" namespace icing { @@ -43,15 +46,14 @@ class InMemoryIcingSearchEngine { std::optional<DocumentProto> document; }; - InMemoryIcingSearchEngine(MonkeyTestRandomEngine *random, - SchemaProto &&schema) - : random_(random), - schema_(std::make_unique<SchemaProto>(std::move(schema))) {} + InMemoryIcingSearchEngine(MonkeyTestRandomEngine *random) : random_(random) {} uint32_t GetNumAliveDocuments() const { return existing_doc_ids_.size(); } const SchemaProto *GetSchema() const { return schema_.get(); } + void SetSchema(SchemaProto 
&&schema); + // Randomly pick a document from the in-memory Icing for monkey testing. // // p_alive: chance of getting an alive document. @@ -112,7 +114,8 @@ class InMemoryIcingSearchEngine { // Currently, only the "query" and "term_match_type" fields are recognized by // the in-memory Icing, and only single term queries with possible section // restrictions are supported. - std::vector<DocumentProto> Search(const SearchSpecProto &search_spec) const; + libtextclassifier3::StatusOr<std::vector<DocumentProto>> Search( + const SearchSpecProto &search_spec) const; private: // Does not own. @@ -126,6 +129,11 @@ class InMemoryIcingSearchEngine { namespace_uri_docid_map; std::unique_ptr<SchemaProto> schema_; + // A map that maps from (schema_type, property_name) to the corresponding + // PropertyConfigProto. + std::unordered_map< + std::string, std::unordered_map<std::string, const PropertyConfigProto &>> + property_config_map_; // Finds and returns the internal document id for the document identified by // the given key (namespace, uri) @@ -138,8 +146,19 @@ class InMemoryIcingSearchEngine { // A helper method for DeleteByQuery and Search to get matched internal doc // ids. 
- std::vector<DocumentId> InternalSearch( + libtextclassifier3::StatusOr<std::vector<DocumentId>> InternalSearch( const SearchSpecProto &search_spec) const; + + libtextclassifier3::StatusOr<const PropertyConfigProto *> GetPropertyConfig( + const std::string &schema_type, const std::string &property_name) const; + + libtextclassifier3::StatusOr<TermMatchType::Code> GetTermMatchType( + const std::string &schema_type, + const MonkeyTokenizedSection §ion) const; + + libtextclassifier3::StatusOr<bool> DoesDocumentMatchQuery( + const MonkeyTokenizedDocument &document, const std::string &query, + TermMatchType::Code term_match_type) const; }; } // namespace lib diff --git a/icing/monkey_test/monkey-test-generators.cc b/icing/monkey_test/monkey-test-generators.cc index 7b2ff56..0d5ad73 100644 --- a/icing/monkey_test/monkey-test-generators.cc +++ b/icing/monkey_test/monkey-test-generators.cc @@ -14,79 +14,269 @@ #include "icing/monkey_test/monkey-test-generators.h" +#include <array> +#include <cstdint> +#include <random> +#include <string> +#include <string_view> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "icing/absl_ports/str_cat.h" +#include "icing/absl_ports/str_join.h" +#include "icing/document-builder.h" +#include "icing/monkey_test/monkey-test-util.h" +#include "icing/monkey_test/monkey-tokenized-document.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/schema/section.h" + namespace icing { namespace lib { -SchemaProto MonkeySchemaGenerator::GenerateSchema( - int num_types, const std::vector<int>& possible_num_properties) const { +namespace { + +constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities = + {PropertyConfigProto::Cardinality::REPEATED, + PropertyConfigProto::Cardinality::OPTIONAL, + PropertyConfigProto::Cardinality::REQUIRED}; + +constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = { + TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, 
TermMatchType::PREFIX}; + +PropertyConfigProto::Cardinality::Code GetRandomCardinality( + MonkeyTestRandomEngine* random) { + std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1); + return kCardinalities[dist(*random)]; +} + +TermMatchType::Code GetRandomTermMatchType(MonkeyTestRandomEngine* random) { + std::uniform_int_distribution<> dist(0, kTermMatchTypes.size() - 1); + return kTermMatchTypes[dist(*random)]; +} + +// TODO: Update this function when supporting document_indexing_config. +bool IsIndexableProperty(const PropertyConfigProto& property) { + return property.string_indexing_config().term_match_type() != + TermMatchType::UNKNOWN; +} + +void SetStringIndexingConfig(PropertyConfigProto& property, + TermMatchType::Code term_match_type) { + if (term_match_type != TermMatchType::UNKNOWN) { + StringIndexingConfig* string_indexing_config = + property.mutable_string_indexing_config(); + string_indexing_config->set_term_match_type(term_match_type); + // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are + // the remaining candidates to consider. + string_indexing_config->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + } else { + property.clear_string_indexing_config(); + } +} + +} // namespace + +SchemaProto MonkeySchemaGenerator::GenerateSchema() { SchemaProto schema; - std::uniform_int_distribution<> dist(0, possible_num_properties.size() - 1); - while (num_types > 0) { - int num_properties = possible_num_properties[dist(*random_)]; - *schema.add_types() = GenerateType( - "MonkeyTestType" + std::to_string(num_types), num_properties); - --num_types; + for (int i = 0; i < config_->num_types; ++i) { + *schema.add_types() = GenerateType(); } return schema; } +MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema( + const SchemaProto& schema) { + UpdateSchemaResult result = {std::move(schema)}; + SchemaProto& new_schema = result.schema; + + // Delete up to 2 existing types. 
+ std::uniform_int_distribution<> num_types_to_delete_dist(0, 2); + for (int num_types_to_delete = num_types_to_delete_dist(*random_); + num_types_to_delete >= 0; --num_types_to_delete) { + if (new_schema.types_size() > 0) { + std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1); + int index_to_delete = dist(*random_); + result.schema_types_deleted.insert( + new_schema.types(index_to_delete).schema_type()); + new_schema.mutable_types()->SwapElements(index_to_delete, + new_schema.types_size() - 1); + new_schema.mutable_types()->RemoveLast(); + } + } + + // Updating about 1/3 of existing types. + for (int i = 0; i < new_schema.types_size(); ++i) { + std::uniform_int_distribution<> dist(0, 2); + if (dist(*random_) == 0) { + UpdateType(*new_schema.mutable_types(i), result); + } + } + + // Add up to 2 new types. + std::uniform_int_distribution<> num_types_to_add_dist(0, 2); + for (int num_types_to_add = num_types_to_add_dist(*random_); + num_types_to_add >= 0; --num_types_to_add) { + *new_schema.add_types() = GenerateType(); + } + + return result; +} + PropertyConfigProto MonkeySchemaGenerator::GenerateProperty( - std::string_view name, TermMatchType::Code term_match_type) const { + const SchemaTypeConfigProto& type_config, + PropertyConfigProto::Cardinality::Code cardinality, + TermMatchType::Code term_match_type) { PropertyConfigProto prop; - prop.set_property_name(name.data(), name.length()); + prop.set_property_name( + "MonkeyTestProp" + + std::to_string(num_properties_generated_[type_config.schema_type()]++)); // TODO: Perhaps in future iterations we will want to generate more than just // string properties. 
prop.set_data_type(PropertyConfigProto::DataType::STRING); + prop.set_cardinality(cardinality); + SetStringIndexingConfig(prop, term_match_type); + return prop; +} - constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> - cardinalities = {PropertyConfigProto::Cardinality::REPEATED, - PropertyConfigProto::Cardinality::OPTIONAL, - PropertyConfigProto::Cardinality::REQUIRED}; - std::uniform_int_distribution<> dist(0, cardinalities.size() - 1); - prop.set_cardinality(cardinalities[dist(*random_)]); +void MonkeySchemaGenerator::UpdateProperty( + const SchemaTypeConfigProto& type_config, PropertyConfigProto& property, + UpdateSchemaResult& result) { + PropertyConfigProto::Cardinality::Code new_cardinality = + GetRandomCardinality(random_); + if (new_cardinality != property.cardinality()) { + // Only do compatible cardinality update for now, otherwise it would be hard + // to track which documents will be invalid after updating the schema. + // + // The following type of updates are not allowed: + // - optional -> required + // - repeated -> optional + // - repeated -> required + if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL && + new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) { + return; + } + if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED && + (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL || + new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) { + return; + } + property.set_cardinality(new_cardinality); + } - if (term_match_type != TermMatchType::UNKNOWN) { - StringIndexingConfig* string_indexing_config = - prop.mutable_string_indexing_config(); - string_indexing_config->set_term_match_type(term_match_type); - string_indexing_config->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + if (property.data_type() == PropertyConfigProto::DataType::STRING) { + TermMatchType::Code new_term_match_type = GetRandomTermMatchType(random_); + if 
(new_term_match_type != + property.string_indexing_config().term_match_type()) { + SetStringIndexingConfig(property, new_term_match_type); + result.schema_types_index_incompatible.insert(type_config.schema_type()); + } } - return prop; } -SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType( - std::string_view name, int num_properties) const { +SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() { SchemaTypeConfigProto type_config; - type_config.set_schema_type(name.data(), name.length()); + type_config.set_schema_type("MonkeyTestType" + + std::to_string(num_types_generated_++)); + std::uniform_int_distribution<> possible_num_properties_dist( + 0, config_->possible_num_properties.size() - 1); + int total_num_properties = + config_->possible_num_properties[possible_num_properties_dist(*random_)]; + int num_indexed_properties = 0; - constexpr std::array<TermMatchType::Code, 3> term_match_types = { - TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX}; - std::uniform_int_distribution<> dist(0, term_match_types.size() - 1); - while (--num_properties >= 0) { - std::string prop_name = "MonkeyTestProp" + std::to_string(num_properties); + for (int i = 0; i < total_num_properties; ++i) { TermMatchType::Code term_match_type = TermMatchType::UNKNOWN; if (num_indexed_properties < kTotalNumSections) { - term_match_type = term_match_types[dist(*random_)]; + term_match_type = GetRandomTermMatchType(random_); } if (term_match_type != TermMatchType::UNKNOWN) { num_indexed_properties += 1; } - (*type_config.add_properties()) = - GenerateProperty(prop_name, term_match_type); + (*type_config.add_properties()) = GenerateProperty( + type_config, GetRandomCardinality(random_), term_match_type); } return type_config; } +void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config, + UpdateSchemaResult& result) { + // Delete up to 4 existing property. 
+ std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4); + for (int num_properties_to_delete = num_properties_to_delete_dist(*random_); + num_properties_to_delete >= 0; --num_properties_to_delete) { + if (type_config.properties_size() > 0) { + std::uniform_int_distribution<> dist(0, + type_config.properties_size() - 1); + int index_to_delete = dist(*random_); + // Only delete a required property for now, otherwise it would be hard + // to track which documents will be invalid after updating the schema. + if (type_config.properties(index_to_delete).cardinality() != + PropertyConfigProto::Cardinality::REQUIRED) { + continue; + } + if (IsIndexableProperty(type_config.properties(index_to_delete))) { + result.schema_types_index_incompatible.insert( + type_config.schema_type()); + } + // Removing a property will cause the type to be considered as + // incompatible. + result.schema_types_incompatible.insert(type_config.schema_type()); + + type_config.mutable_properties()->SwapElements( + index_to_delete, type_config.properties_size() - 1); + type_config.mutable_properties()->RemoveLast(); + } + } + + // Updating about 1/3 of existing properties. + for (int i = 0; i < type_config.properties_size(); ++i) { + std::uniform_int_distribution<> dist(0, 2); + if (dist(*random_) == 0) { + UpdateProperty(type_config, *type_config.mutable_properties(i), result); + } + } + + // Add up to 4 new properties. + std::uniform_int_distribution<> num_types_to_add_dist(0, 4); + for (int num_types_to_add = num_types_to_add_dist(*random_); + num_types_to_add >= 0; --num_types_to_add) { + PropertyConfigProto::Cardinality::Code new_cardinality = + GetRandomCardinality(random_); + // Adding a required property will make all document of this type invalid. 
+ if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) { + result.schema_types_incompatible.insert(type_config.schema_type()); + } + PropertyConfigProto new_property = GenerateProperty( + type_config, new_cardinality, GetRandomTermMatchType(random_)); + if (IsIndexableProperty(new_property)) { + result.schema_types_index_incompatible.insert(type_config.schema_type()); + } + (*type_config.add_properties()) = std::move(new_property); + } + + int num_indexed_properties = 0; + for (int i = 0; i < type_config.properties_size(); ++i) { + if (IsIndexableProperty(type_config.properties(i))) { + ++num_indexed_properties; + } + } + + if (num_indexed_properties > kTotalNumSections) { + result.is_invalid_schema = true; + } +} + std::string MonkeyDocumentGenerator::GetNamespace() const { uint32_t name_space; // When num_namespaces is 0, all documents generated get different namespaces. // Otherwise, namespaces will be randomly picked from a set with // num_namespaces elements. - if (num_namespaces_ == 0) { + if (config_->num_namespaces == 0) { name_space = num_docs_generated_; } else { - std::uniform_int_distribution<> dist(0, num_namespaces_ - 1); + std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1); name_space = dist(*random_); } return absl_ports::StrCat("namespace", std::to_string(name_space)); @@ -96,18 +286,19 @@ std::string MonkeyDocumentGenerator::GetUri() const { uint32_t uri; // When num_uris is 0, all documents generated get different URIs. Otherwise, // URIs will be randomly picked from a set with num_uris elements. 
- if (num_uris_ == 0) { + if (config_->num_uris == 0) { uri = num_docs_generated_; } else { - std::uniform_int_distribution<> dist(0, num_uris_ - 1); + std::uniform_int_distribution<> dist(0, config_->num_uris - 1); uri = dist(*random_); } return absl_ports::StrCat("uri", std::to_string(uri)); } int MonkeyDocumentGenerator::GetNumTokens() const { - std::uniform_int_distribution<> dist(0, possible_num_tokens_.size() - 1); - int n = possible_num_tokens_[dist(*random_)]; + std::uniform_int_distribution<> dist( + 0, config_->possible_num_tokens_.size() - 1); + int n = config_->possible_num_tokens_[dist(*random_)]; // Add some noise std::uniform_real_distribution<> real_dist(0.5, 1); float p = real_dist(*random_); @@ -138,15 +329,13 @@ MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() { std::vector<std::string> prop_content = GetPropertyContent(); doc_builder.AddStringProperty(prop.property_name(), absl_ports::StrJoin(prop_content, " ")); - // Create a tokenized section if the current property is indexable. - if (prop.data_type() == PropertyConfigProto::DataType::STRING && - prop.string_indexing_config().term_match_type() != - TermMatchType::UNKNOWN) { - MonkeyTokenizedSection section = { - prop.property_name(), prop.string_indexing_config().term_match_type(), - std::move(prop_content)}; - document.tokenized_sections.push_back(std::move(section)); - } + // No matter whether the property is indexable currently, we have to create + // a section for it since a non-indexable property can become indexable + // after a schema type change. The in-memory icing will automatically skip + // sections that are non-indexable at the time of search requests. 
+ MonkeyTokenizedSection section = {prop.property_name(), + std::move(prop_content)}; + document.tokenized_sections.push_back(std::move(section)); } document.document = doc_builder.Build(); ++num_docs_generated_; diff --git a/icing/monkey_test/monkey-test-generators.h b/icing/monkey_test/monkey-test-generators.h index 6349918..72a4723 100644 --- a/icing/monkey_test/monkey-test-generators.h +++ b/icing/monkey_test/monkey-test-generators.h @@ -15,51 +15,66 @@ #ifndef ICING_MONKEY_TEST_MONKEY_TEST_GENERATORS_H_ #define ICING_MONKEY_TEST_MONKEY_TEST_GENERATORS_H_ -#include <algorithm> #include <cstdint> #include <random> #include <string> #include <string_view> +#include <unordered_map> +#include <unordered_set> #include <vector> -#include "icing/absl_ports/str_cat.h" -#include "icing/absl_ports/str_join.h" -#include "icing/document-builder.h" #include "icing/monkey_test/monkey-test-common-words.h" +#include "icing/monkey_test/monkey-test-util.h" #include "icing/monkey_test/monkey-tokenized-document.h" -#include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" -#include "icing/schema/section.h" +#include "icing/proto/term.pb.h" #include "icing/util/clock.h" namespace icing { namespace lib { -using MonkeyTestRandomEngine = std::mt19937; - // A random schema generator used for monkey testing. 
class MonkeySchemaGenerator { public: - explicit MonkeySchemaGenerator(MonkeyTestRandomEngine* random) - : random_(random) {} + struct UpdateSchemaResult { + SchemaProto schema; + bool is_invalid_schema; + std::unordered_set<std::string> schema_types_deleted; + std::unordered_set<std::string> schema_types_incompatible; + std::unordered_set<std::string> schema_types_index_incompatible; + }; + + explicit MonkeySchemaGenerator( + MonkeyTestRandomEngine* random, + const IcingMonkeyTestRunnerConfiguration* config) + : random_(random), config_(config) {} - // To ensure that the random schema is generated with the best quality, the - // number of properties for each type will only be randomly picked from the - // list of possible_num_properties, instead of picking it from a range. - // For example, a vector of [1, 2, 3, 4] means each generated types have a 25% - // chance of getting 1 property, 2 properties, 3 properties and 4 properties. - SchemaProto GenerateSchema( - int num_types, const std::vector<int>& possible_num_properties) const; + SchemaProto GenerateSchema(); + + UpdateSchemaResult UpdateSchema(const SchemaProto& schema); private: PropertyConfigProto GenerateProperty( - std::string_view name, TermMatchType::Code term_match_type) const; + const SchemaTypeConfigProto& type_config, + PropertyConfigProto::Cardinality::Code cardinality, + TermMatchType::Code term_match_type); + + void UpdateProperty(const SchemaTypeConfigProto& type_config, + PropertyConfigProto& property, + UpdateSchemaResult& result); - SchemaTypeConfigProto GenerateType(std::string_view name, - int num_properties) const; + SchemaTypeConfigProto GenerateType(); - // Does not own. - MonkeyTestRandomEngine* random_; + void UpdateType(SchemaTypeConfigProto& type_config, + UpdateSchemaResult& result); + + int num_types_generated_ = 0; + // A map from type name to the number of properties generated in the + // corresponding types. 
+ std::unordered_map<std::string, int> num_properties_generated_; + + MonkeyTestRandomEngine* random_; // Does not own. + const IcingMonkeyTestRunnerConfiguration* config_; // Does not own. }; // A random document generator used for monkey testing. @@ -68,16 +83,10 @@ class MonkeySchemaGenerator { // Same for num_namespaces. class MonkeyDocumentGenerator { public: - explicit MonkeyDocumentGenerator(MonkeyTestRandomEngine* random, - const SchemaProto* schema, - std::vector<int> possible_num_tokens, - uint32_t num_namespaces, - uint32_t num_uris = 0) - : random_(random), - schema_(schema), - possible_num_tokens_(std::move(possible_num_tokens)), - num_namespaces_(num_namespaces), - num_uris_(num_uris) {} + explicit MonkeyDocumentGenerator( + MonkeyTestRandomEngine* random, const SchemaProto* schema, + const IcingMonkeyTestRunnerConfiguration* config) + : random_(random), schema_(schema), config_(config) {} const SchemaTypeConfigProto& GetType() const { std::uniform_int_distribution<> dist(0, schema_->types_size() - 1); @@ -104,15 +113,10 @@ class MonkeyDocumentGenerator { MonkeyTokenizedDocument GenerateDocument(); private: - MonkeyTestRandomEngine* random_; // Does not own. - const SchemaProto* schema_; // Does not own. - - // The possible number of tokens that may appear in generated documents, with - // a noise factor from 0.5 to 1 applied. - std::vector<int> possible_num_tokens_; + MonkeyTestRandomEngine* random_; // Does not own. + const SchemaProto* schema_; // Does not own. + const IcingMonkeyTestRunnerConfiguration* config_; // Does not own. 
- uint32_t num_namespaces_; - uint32_t num_uris_; uint32_t num_docs_generated_ = 0; Clock clock_; }; diff --git a/icing/monkey_test/monkey-test-util.h b/icing/monkey_test/monkey-test-util.h new file mode 100644 index 0000000..d6053d8 --- /dev/null +++ b/icing/monkey_test/monkey-test-util.h @@ -0,0 +1,68 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_MONKEY_TEST_MONKEY_TEST_UTIL_H_ +#define ICING_MONKEY_TEST_MONKEY_TEST_UTIL_H_ + +#include <cstdint> +#include <functional> +#include <random> +#include <utility> +#include <vector> + +namespace icing { +namespace lib { + +using MonkeyTestRandomEngine = std::mt19937; + +class IcingMonkeyTestRunner; + +struct IcingMonkeyTestRunnerConfiguration { + explicit IcingMonkeyTestRunnerConfiguration(uint32_t seed, int num_types, + int num_namespaces, int num_uris, + int index_merge_size) + : seed(seed), + num_types(num_types), + num_namespaces(num_namespaces), + num_uris(num_uris), + index_merge_size(index_merge_size) {} + + uint32_t seed; + int num_types; + int num_namespaces; + int num_uris; + int index_merge_size; + + // To ensure that the random schema is generated with the best quality, the + // number of properties for each type will only be randomly picked from this + // list, instead of picking it from a range. 
For example, a vector of + // [1, 2, 3, 4] means each generated types have a 25% chance of getting 1 + // property, 2 properties, 3 properties and 4 properties. + std::vector<int> possible_num_properties; + + // The possible number of tokens that may appear in generated documents, with + // a noise factor from 0.5 to 1 applied. + std::vector<int> possible_num_tokens_; + + // An array of pairs of monkey test APIs with frequencies. + // If f_sum is the sum of all the frequencies, an operation with frequency f + // means for every f_sum iterations, the operation is expected to run f times. + std::vector<std::pair<std::function<void(IcingMonkeyTestRunner*)>, uint32_t>> + monkey_api_schedules; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_MONKEY_TEST_MONKEY_TEST_UTIL_H_ diff --git a/icing/monkey_test/monkey-tokenized-document.h b/icing/monkey_test/monkey-tokenized-document.h index a0b38c2..87b77bb 100644 --- a/icing/monkey_test/monkey-tokenized-document.h +++ b/icing/monkey_test/monkey-tokenized-document.h @@ -16,16 +16,15 @@ #define ICING_MONKEY_TEST_MONKEY_TOKENIZED_DOCUMENT_H_ #include <string> +#include <vector> #include "icing/proto/document.pb.h" -#include "icing/proto/term.pb.h" namespace icing { namespace lib { struct MonkeyTokenizedSection { std::string path; - TermMatchType::Code term_match_type; std::vector<std::string> token_sequence; }; diff --git a/icing/performance-configuration.cc b/icing/performance-configuration.cc index 07ff9bc..1518381 100644 --- a/icing/performance-configuration.cc +++ b/icing/performance-configuration.cc @@ -38,20 +38,17 @@ namespace { // rendering 2 frames. // // With the information above, we then try to choose default values for -// query_length and num_to_score so that the overall time can comfortably fit -// in with our goal. +// query_length so that the overall time can comfortably fit in with our goal +// (note that num_to_score will be decided by the client, which is specified in +// ResultSpecProto). 
// 1. Set query_length to 23000 so that any query can be executed by // QueryProcessor within 15 ms on a Pixel 3 XL according to results of // //icing/query:query-processor_benchmark. -// 2. Set num_to_score to 30000 so that results can be scored and ranked within -// 3 ms on a Pixel 3 XL according to results of -// //icing/scoring:score-and-rank_benchmark. // // In the worse-case scenario, we still have [33 ms - 15 ms - 3 ms] = 15 ms left // for all the other things like proto parsing, document fetching, and even // Android Binder calls if Icing search engine runs in a separate process. constexpr int kMaxQueryLength = 23000; -constexpr int kDefaultNumToScore = 30000; // New Android devices nowadays all allow more than 16 MB memory per app. Using // that as a guideline and being more conservative, we set 4 MB as the safe @@ -67,8 +64,7 @@ constexpr int kMaxNumTotalHits = kSafeMemoryUsage / sizeof(ScoredDocumentHit); } // namespace PerformanceConfiguration::PerformanceConfiguration() - : PerformanceConfiguration(kMaxQueryLength, kDefaultNumToScore, - kMaxNumTotalHits) {} + : PerformanceConfiguration(kMaxQueryLength, kMaxNumTotalHits) {} } // namespace lib } // namespace icing diff --git a/icing/performance-configuration.h b/icing/performance-configuration.h index b9282ca..3ec67f3 100644 --- a/icing/performance-configuration.h +++ b/icing/performance-configuration.h @@ -23,10 +23,8 @@ struct PerformanceConfiguration { // Loads default configuration. PerformanceConfiguration(); - PerformanceConfiguration(int max_query_length_in, int num_to_score_in, - int max_num_total_hits) + PerformanceConfiguration(int max_query_length_in, int max_num_total_hits) : max_query_length(max_query_length_in), - num_to_score(num_to_score_in), max_num_total_hits(max_num_total_hits) {} // Search performance @@ -34,9 +32,6 @@ struct PerformanceConfiguration { // Maximum length of query to execute in IndexProcessor. 
int max_query_length; - // Number of results to score in ScoringProcessor for every query. - int num_to_score; - // Memory // Maximum number of ScoredDocumentHits to cache in the ResultStateManager at diff --git a/icing/portable/platform.h b/icing/portable/platform.h index 4c115e1..6d8c668 100644 --- a/icing/portable/platform.h +++ b/icing/portable/platform.h @@ -15,7 +15,10 @@ #ifndef ICING_PORTABLE_PLATFORM_H_ #define ICING_PORTABLE_PLATFORM_H_ -#include "unicode/uversion.h" +#include "unicode/uconfig.h" // IWYU pragma: keep +// clang-format: do not reorder the above include. + +#include "unicode/uvernum.h" namespace icing { namespace lib { @@ -40,13 +43,8 @@ inline bool IsIcuTokenization() { return !IsReverseJniTokenization() && !IsCfStringTokenization(); } -inline bool IsIcu72PlusTokenization() { - if (!IsIcuTokenization()) { - return false; - } - UVersionInfo version_array; - u_getVersion(version_array); - return version_array[0] >= 72; +inline int GetIcuTokenizationVersion() { + return IsIcuTokenization() ? U_ICU_VERSION_MAJOR_NUM : 0; } // Whether we're running on android_x86 diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc index fd74561..82576a1 100644 --- a/icing/query/advanced_query_parser/parser.cc +++ b/icing/query/advanced_query_parser/parser.cc @@ -116,7 +116,7 @@ Parser::ConsumeMember() { // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR` // at this point. So check for 'STAR' to differentiate the two cases. 
if (Match(Lexer::TokenType::STAR)) { - Consume(Lexer::TokenType::STAR); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::STAR)); std::string_view raw_text = text_node->raw_value(); std::string text = std::move(*text_node).value(); text_node = std::make_unique<TextNode>(std::move(text), raw_text, @@ -125,7 +125,7 @@ Parser::ConsumeMember() { } else { children.push_back(std::move(text_node)); while (Match(Lexer::TokenType::DOT)) { - Consume(Lexer::TokenType::DOT); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DOT)); if (MatchFunction()) { ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node, ConsumeFunction()); @@ -201,7 +201,7 @@ Parser::ConsumeArgs() { ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression()); args.push_back(std::move(arg)); while (Match(Lexer::TokenType::COMMA)) { - Consume(Lexer::TokenType::COMMA); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::COMMA)); ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression()); args.push_back(std::move(arg)); } @@ -223,7 +223,7 @@ Parser::ConsumeRestriction() { bool has_minus = Match(Lexer::TokenType::MINUS); if (has_minus) { - Consume(Lexer::TokenType::MINUS); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS)); } std::unique_ptr<Node> arg; @@ -276,10 +276,10 @@ libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() { operator_text = "MINUS"; } else { if (Match(Lexer::TokenType::NOT)) { - Consume(Lexer::TokenType::NOT); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::NOT)); operator_text = "NOT"; } else { - Consume(Lexer::TokenType::MINUS); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS)); operator_text = "MINUS"; } } @@ -296,7 +296,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() { terms.push_back(std::move(term)); while (Match(Lexer::TokenType::OR)) { - Consume(Lexer::TokenType::OR); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::OR)); ICING_ASSIGN_OR_RETURN(term, ConsumeTerm()); terms.push_back(std::move(term)); } 
@@ -330,7 +330,7 @@ Parser::ConsumeQueryExpression() { sequences.push_back(std::move(sequence)); while (Match(Lexer::TokenType::AND)) { - Consume(Lexer::TokenType::AND); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::AND)); ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence()); sequences.push_back(std::move(sequence)); } @@ -348,7 +348,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() { while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) { while (Match(Lexer::TokenType::TIMES)) { - Consume(Lexer::TokenType::TIMES); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::TIMES)); ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); stack.push_back(std::move(node)); } @@ -357,7 +357,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() { stack.push_back(std::move(node)); while (Match(Lexer::TokenType::DIV)) { - Consume(Lexer::TokenType::DIV); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DIV)); ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); stack.push_back(std::move(node)); } @@ -380,7 +380,7 @@ Parser::ConsumeScoringExpression() { while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) { while (Match(Lexer::TokenType::PLUS)) { - Consume(Lexer::TokenType::PLUS); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::PLUS)); ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); stack.push_back(std::move(node)); } @@ -389,7 +389,7 @@ Parser::ConsumeScoringExpression() { stack.push_back(std::move(node)); while (Match(Lexer::TokenType::MINUS)) { - Consume(Lexer::TokenType::MINUS); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS)); ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); stack.push_back(std::move(node)); } diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc index d75a550..31da959 100644 --- a/icing/query/advanced_query_parser/query-visitor.cc +++ b/icing/query/advanced_query_parser/query-visitor.cc @@ -33,9 +33,11 @@ 
#include "icing/index/iterator/doc-hit-info-iterator-none.h" #include "icing/index/iterator/doc-hit-info-iterator-not.h" #include "icing/index/iterator/doc-hit-info-iterator-or.h" +#include "icing/index/iterator/doc-hit-info-iterator-property-in-document.h" #include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h" #include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/property-existence-indexing-handler.h" #include "icing/query/advanced_query_parser/lexer.h" #include "icing/query/advanced_query_parser/param.h" #include "icing/query/advanced_query_parser/parser.h" @@ -222,13 +224,23 @@ void QueryVisitor::RegisterFunctions() { auto property_defined = [this](std::vector<PendingValue>&& args) { return this->PropertyDefinedFunction(std::move(args)); }; - Function property_defined_function = Function::Create(DataType::kDocumentIterator, "propertyDefined", {Param(DataType::kString)}, std::move(property_defined)) .ValueOrDie(); registered_functions_.insert( {property_defined_function.name(), std::move(property_defined_function)}); + + // DocHitInfoIterator hasProperty(std::string); + auto has_property = [this](std::vector<PendingValue>&& args) { + return this->HasPropertyFunction(std::move(args)); + }; + Function has_property_function = + Function::Create(DataType::kDocumentIterator, "hasProperty", + {Param(DataType::kString)}, std::move(has_property)) + .ValueOrDie(); + registered_functions_.insert( + {has_property_function.name(), std::move(has_property_function)}); } libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction( @@ -279,7 +291,7 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction( // Update members based on results of processing the query. 
if (args.size() == 2 && pending_property_restricts_.has_active_property_restricts()) { - iterator = std::make_unique<DocHitInfoIteratorSectionRestrict>( + iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions( std::move(iterator), &document_store_, &schema_store_, pending_property_restricts_.active_property_restricts(), current_time_ms_); @@ -322,6 +334,31 @@ QueryVisitor::PropertyDefinedFunction(std::vector<PendingValue>&& args) { return PendingValue(std::move(property_in_schema_iterator)); } +libtextclassifier3::StatusOr<PendingValue> QueryVisitor::HasPropertyFunction( + std::vector<PendingValue>&& args) { + // The first arg is guaranteed to be a STRING at this point. It should be safe + // to call ValueOrDie. + const std::string& property_path = args.at(0).string_val().ValueOrDie()->term; + + // Perform an exact search for the property existence metadata token. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> meta_hit_iterator, + index_.GetIterator( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path), + /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY, + /*need_hit_term_frequency=*/false)); + + std::unique_ptr<DocHitInfoIterator> property_in_document_iterator = + std::make_unique<DocHitInfoIteratorPropertyInDocument>( + std::move(meta_hit_iterator)); + + features_.insert(kHasPropertyFunctionFeature); + + return PendingValue(std::move(property_in_document_iterator)); +} + libtextclassifier3::StatusOr<int64_t> QueryVisitor::PopPendingIntValue() { if (pending_values_.empty()) { return absl_ports::InvalidArgumentError("Unable to retrieve int value."); @@ -647,7 +684,7 @@ libtextclassifier3::Status QueryVisitor::ProcessHasOperator( std::set<std::string> property_restricts = {std::move(text_value.term)}; pending_values_.push( - PendingValue(std::make_unique<DocHitInfoIteratorSectionRestrict>( + PendingValue(DocHitInfoIteratorSectionRestrict::ApplyRestrictions( 
std::move(delegate), &document_store_, &schema_store_, std::move(property_restricts), current_time_ms_))); return libtextclassifier3::Status::OK; diff --git a/icing/query/advanced_query_parser/query-visitor.h b/icing/query/advanced_query_parser/query-visitor.h index 38864f8..d090b3c 100644 --- a/icing/query/advanced_query_parser/query-visitor.h +++ b/icing/query/advanced_query_parser/query-visitor.h @@ -247,13 +247,23 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { libtextclassifier3::StatusOr<PendingValue> SearchFunction( std::vector<PendingValue>&& args); - // Implementation of the propertyDefined(member) custom function. + // Implementation of the propertyDefined(property_path) custom function. // Returns: - // - a Pending Value holding a DocHitIterator to be implemented. + // - a Pending Value holding a DocHitIterator that returns hits for all + // documents whose schema types have defined the property specified by + // property_path. // - any errors returned by Lexer::ExtractTokens libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction( std::vector<PendingValue>&& args); + // Implementation of the hasProperty(property_path) custom function. + // Returns: + // - a Pending Value holding a DocHitIterator that returns hits for all + // documents that have the property specified by property_path. + // - any errors returned by Lexer::ExtractTokens + libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction( + std::vector<PendingValue>&& args); + // Handles a NaryOperatorNode where the operator is HAS (':') and pushes an // iterator with the proper section filter applied. 
If the current property // restriction represented by pending_property_restricts and the first child diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc index 0d7ba6d..9455baa 100644 --- a/icing/query/advanced_query_parser/query-visitor_test.cc +++ b/icing/query/advanced_query_parser/query-visitor_test.cc @@ -17,17 +17,28 @@ #include <cstdint> #include <limits> #include <memory> +#include <string> #include <string_view> +#include <unordered_map> +#include <utility> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" #include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/index/hit/hit.h" #include "icing/index/index.h" +#include "icing/index/iterator/doc-hit-info-iterator-filter.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/dummy-numeric-index.h" #include "icing/index/numeric/numeric-index.h" +#include "icing/index/property-existence-indexing-handler.h" #include "icing/jni/jni-cache.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/portable/platform.h" @@ -35,7 +46,13 @@ #include "icing/query/advanced_query_parser/lexer.h" #include "icing/query/advanced_query_parser/parser.h" #include "icing/query/query-features.h" +#include "icing/query/query-results.h" #include "icing/schema-builder.h" +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" 
@@ -47,6 +64,8 @@ #include "icing/tokenization/tokenizer.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "icing/util/clock.h" +#include "icing/util/status-macros.h" #include "unicode/uloc.h" namespace icing { @@ -114,17 +133,20 @@ class QueryVisitorTest : public ::testing::TestWithParam<QueryType> { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, store_dir_, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, store_dir_, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_.c_str(), - /*index_merge_size=*/1024 * 1024); + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -215,16 +237,16 @@ TEST_P(QueryVisitorTest, SimpleLessThan) { // respectively. 
std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price < 2"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -258,16 +280,16 @@ TEST_P(QueryVisitorTest, SimpleLessThanEq) { // respectively. std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price <= 1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -301,16 +323,16 @@ TEST_P(QueryVisitorTest, SimpleEqual) { // respectively. 
std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price == 2"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -344,16 +366,16 @@ TEST_P(QueryVisitorTest, SimpleGreaterThanEq) { // respectively. std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price >= 1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -387,16 +409,16 @@ TEST_P(QueryVisitorTest, SimpleGreaterThan) { // respectively. 
std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price > 1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -431,16 +453,16 @@ TEST_P(QueryVisitorTest, IntMinLessThanEqual) { int64_t int_min = std::numeric_limits<int64_t>::min(); std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(int_min); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_min)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(std::numeric_limits<int64_t>::max()); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::max())); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(int_min + 1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_min + 1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price <= " + std::to_string(int_min)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ 
-475,16 +497,16 @@ TEST_P(QueryVisitorTest, IntMaxGreaterThanEqual) { int64_t int_max = std::numeric_limits<int64_t>::max(); std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(std::numeric_limits<int64_t>::min()); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::min())); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(int_max); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_max)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(int_max - 1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_max - 1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price >= " + std::to_string(int_max)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -518,18 +540,18 @@ TEST_P(QueryVisitorTest, NestedPropertyLessThan) { // respectively. 
std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("subscription.price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("subscription.price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("subscription.price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("subscription.price < 2"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -591,18 +613,18 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { // respectively. 
std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("subscription.price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("subscription.price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("subscription.price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); // Create an invalid AST for the query '3 < subscription.price 25' where '<' // has three operands @@ -666,18 +688,18 @@ TEST_P(QueryVisitorTest, LessThanNonExistentPropertyNotFound) { // respectively. 
std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("subscription.price", kDocumentId0, kSectionId0); - editor->BufferKey(0); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(0)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("subscription.price", kDocumentId1, kSectionId1); - editor->BufferKey(1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("subscription.price", kDocumentId2, kSectionId2); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("time < 25"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -719,16 +741,16 @@ TEST_P(QueryVisitorTest, IntMinLessThanInvalid) { int64_t int_min = std::numeric_limits<int64_t>::min(); std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(int_min); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_min)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(std::numeric_limits<int64_t>::max()); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::max())); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(int_min + 1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_min + 1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price <" + std::to_string(int_min)); 
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -749,16 +771,16 @@ TEST_P(QueryVisitorTest, IntMaxGreaterThanInvalid) { int64_t int_max = std::numeric_limits<int64_t>::max(); std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(std::numeric_limits<int64_t>::min()); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::min())); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1); - editor->BufferKey(int_max); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_max)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2); - editor->BufferKey(int_max - 1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(int_max - 1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); std::string query = CreateQuery("price >" + std::to_string(int_max)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -804,34 +826,34 @@ TEST_P(QueryVisitorTest, NumericComparatorDoesntAffectLaterTerms) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); std::unique_ptr<NumericIndex<int64_t>::Editor> editor = numeric_index_->Edit("price", kDocumentId0, kSectionId0); - editor->BufferKey(-2); - editor->BufferKey(-1); - editor->BufferKey(1); - editor->BufferKey(2); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(-2)); + ICING_ASSERT_OK(editor->BufferKey(-1)); + ICING_ASSERT_OK(editor->BufferKey(1)); + ICING_ASSERT_OK(editor->BufferKey(2)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); Index::Editor term_editor = index_->Edit( kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - term_editor.BufferTerm("-2"); - term_editor.BufferTerm("-1"); 
- term_editor.BufferTerm("1"); - term_editor.BufferTerm("2"); - term_editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(term_editor.BufferTerm("-2")); + ICING_ASSERT_OK(term_editor.BufferTerm("-1")); + ICING_ASSERT_OK(term_editor.BufferTerm("1")); + ICING_ASSERT_OK(term_editor.BufferTerm("2")); + ICING_ASSERT_OK(term_editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = numeric_index_->Edit("price", kDocumentId1, kSectionId0); - editor->BufferKey(-1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(-1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = numeric_index_->Edit("price", kDocumentId2, kSectionId0); - editor->BufferKey(-1); - std::move(*editor).IndexAllBufferedKeys(); + ICING_ASSERT_OK(editor->BufferKey(-1)); + ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys()); term_editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - term_editor.BufferTerm("2"); - term_editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(term_editor.BufferTerm("2")); + ICING_ASSERT_OK(term_editor.IndexAllBufferedTerms()); // Translating MINUS chars that are interpreted as NOTs, this query would be // `price == -1 AND NOT 2` @@ -869,18 +891,18 @@ TEST_P(QueryVisitorTest, SingleTermTermFrequencyEnabled) { // "bar" respectively. 
Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -921,18 +943,18 @@ TEST_P(QueryVisitorTest, SingleTermTermFrequencyDisabled) { // "bar" respectively. Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -972,18 +994,18 @@ TEST_P(QueryVisitorTest, SingleTermPrefix) { // "bar" respectively. 
Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // An EXACT query for 'fo' won't match anything. std::string query = CreateQuery("fo"); @@ -1068,21 +1090,21 @@ TEST_P(QueryVisitorTest, SegmentationWithPrefix) { // ["foo", "ba"] and ["bar", "fo"] respectively. Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("ba"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("ba")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("ba"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("ba")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.BufferTerm("fo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.BufferTerm("fo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // An EXACT query for `ba?fo` will be lexed into a single TEXT 
token. // The visitor will tokenize it into `ba` and `fo` (`?` is dropped because it @@ -1135,18 +1157,18 @@ TEST_P(QueryVisitorTest, SingleVerbatimTerm) { // "foo:bar(baz)" and "bar:baz(foo)" respectively. Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo:bar(baz)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo:bar(baz)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo:bar(baz)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo:bar(baz)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar:baz(foo)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar:baz(foo)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("\"foo:bar(baz)\""); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -1181,18 +1203,18 @@ TEST_P(QueryVisitorTest, SingleVerbatimTermPrefix) { // "foo:bar(abc)" and "bar:baz(foo)" respectively. 
Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo:bar(baz)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo:bar(baz)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo:bar(abc)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo:bar(abc)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar:baz(foo)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar:baz(foo)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Query for `"foo:bar("*`. This should match docs 0 and 1. std::string query = CreateQuery("\"foo:bar(\"*"); @@ -1233,18 +1255,18 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingQuote) { // "foobar\" and "foobar"" respectively. 
Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobary)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobary)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobar\)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobar")"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar")")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // From the comment above, verbatim_term = `foobar"` and verbatim_query = // `foobar\"` @@ -1285,19 +1307,19 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingEscape) { // "foobar\" and "foobar"" respectively. Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobary)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobary)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); // From the comment above, verbatim_term = `foobar\`. - editor.BufferTerm(R"(foobar\)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobar")"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar")")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Issue a query for the verbatim token `foobar\`. 
std::string query = CreateQuery(R"(("foobar\\"))"); @@ -1340,18 +1362,18 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingNonSpecialChar) { Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); // From the comment above, verbatim_term = `foobary`. - editor.BufferTerm(R"(foobary)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobary)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobar\)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobar\y)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\y)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Issue a query for the verbatim token `foobary`. std::string query = CreateQuery(R"(("foobar\y"))"); @@ -1421,19 +1443,19 @@ TEST_P(QueryVisitorTest, VerbatimTermNewLine) { Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); // From the comment above, verbatim_term = `foobar` + '\n'. - editor.BufferTerm("foobar\n"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foobar\n")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foobar\)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); // verbatim_term = `foobar\n`. This is distinct from the term added above. 
- editor.BufferTerm(R"(foobar\n)"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\n)")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Issue a query for the verbatim token `foobar` + '\n'. std::string query = CreateQuery("\"foobar\n\""); @@ -1495,20 +1517,20 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingComplex) { // `foo\\\"bar\\nbaz\"` and `foo\\"bar\\nbaz"` respectively. Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); - editor.BufferTerm(R"(foo\"bar\nbaz")"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foo\"bar\nbaz")")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); // Add the verbatim_term from doc 0 but with all of the escapes left in - editor.BufferTerm(R"(foo\\\"bar\\nbaz\")"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foo\\\"bar\\nbaz\")")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT, /*namespace_id=*/0); // Add the verbatim_term from doc 0 but with the escapes for '\' chars left in - editor.BufferTerm(R"(foo\\"bar\\nbaz")"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm(R"(foo\\"bar\\nbaz")")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Issue a query for the verbatim token `foo\"bar\nbaz"`. 
std::string query = CreateQuery(R"(("foo\\\"bar\\nbaz\""))"); @@ -1553,22 +1575,22 @@ TEST_P(QueryVisitorTest, SingleMinusTerm) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("-foo"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -1607,22 +1629,22 @@ TEST_P(QueryVisitorTest, SingleNotTerm) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + 
ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("NOT foo"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -1657,26 +1679,26 @@ TEST_P(QueryVisitorTest, NestedNotTerms) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("bar"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Double negative could be rewritten as `(foo AND NOT bar) baz` std::string query = 
CreateQuery("NOT (-foo OR bar) baz"); @@ -1715,26 +1737,26 @@ TEST_P(QueryVisitorTest, DeeplyNestedNotTerms) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("bar"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Simplifying: // NOT (-(NOT (foo -bar) baz) -bat) NOT bass @@ -1773,19 +1795,19 @@ TEST_P(QueryVisitorTest, DeeplyNestedNotTerms) { TEST_P(QueryVisitorTest, ImplicitAndTerms) { Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - 
editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo bar"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -1816,19 +1838,19 @@ TEST_P(QueryVisitorTest, ImplicitAndTerms) { TEST_P(QueryVisitorTest, ExplicitAndTerms) { Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo AND bar"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -1859,19 +1881,19 @@ TEST_P(QueryVisitorTest, ExplicitAndTerms) { TEST_P(QueryVisitorTest, OrTerms) { Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = 
index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("fo"); - editor.BufferTerm("ba"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("fo")); + ICING_ASSERT_OK(editor.BufferTerm("ba")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo OR bar"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -1902,20 +1924,20 @@ TEST_P(QueryVisitorTest, OrTerms) { TEST_P(QueryVisitorTest, AndOrTermPrecedence) { Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Should be interpreted like `foo (bar OR baz)` std::string query = CreateQuery("foo bar OR baz"); @@ -2009,24 +2031,24 @@ TEST_P(QueryVisitorTest, AndOrNotPrecedence) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, 
/*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("baz"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("baz")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Should be interpreted like `foo ((NOT bar) OR baz)` std::string query = CreateQuery("foo NOT bar OR baz"); @@ -2097,22 +2119,22 @@ TEST_P(QueryVisitorTest, PropertyFilter) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", 
"uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo", /*property_restrict=*/"prop1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -2173,22 +2195,22 @@ TEST_F(QueryVisitorTest, MultiPropertyFilter) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop3_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = R"(search("foo", createList("prop1", "prop2")))"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -2272,22 +2294,22 @@ TEST_P(QueryVisitorTest, PropertyFilterNonNormalized) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + 
ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo", /*property_restrict=*/"PROP1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -2342,22 +2364,22 @@ TEST_P(QueryVisitorTest, PropertyFilterWithGrouping) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + 
ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("(foo OR bar)", /*property_restrict=*/"prop1"); @@ -2410,22 +2432,22 @@ TEST_P(QueryVisitorTest, ValidNestedPropertyFilter) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("(prop1:foo)", /*property_restrict=*/"prop1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -2497,22 +2519,22 @@ TEST_P(QueryVisitorTest, InvalidNestedPropertyFilter) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, 
prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("(prop2:foo)", /*property_restrict=*/"prop1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -2580,22 +2602,22 @@ TEST_P(QueryVisitorTest, NotWithPropertyFilter) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Resulting queries: // - kPlain: `-prop1:(foo OR bar)` @@ -2664,30 +2686,43 @@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { SectionId prop1_section_id = 0; 
SectionId prop2_section_id = 1; + // Create documents as follows: + // Doc0: + // prop1: "bar" + // prop2: "" + // Doc1: + // prop1: "foo" + // prop2: "" + // Doc2: + // prop1: "" + // prop2: "foo" ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Resulting queries: // - kPlain: `prop1:(-foo OR bar)` // - kSearch: `search("-foo OR bar", createList("prop1"))` + // + // The query is equivalent to `-prop1:foo OR prop1:bar`, thus doc0 and doc2 + // will be matched. 
std::string query = CreateQuery("(-foo OR bar)", /*property_restrict=*/"prop1"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -2709,11 +2744,14 @@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), UnorderedElementsAre("bar")); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), - ElementsAre(kDocumentId0)); + ElementsAre(kDocumentId2, kDocumentId0)); // Resulting queries: - // - kPlain: `prop1:(foo OR bar)` - // - kSearch: `search("foo OR bar", createList("prop1"))` + // - kPlain: `prop1:(-foo OR bar)` + // - kSearch: `search("-foo OR bar", createList("prop1"))` + // + // The query is equivalent to `-prop1:foo OR prop1:bar`, thus doc0 and doc2 + // will be matched. query = CreateQuery("(NOT foo OR bar)", /*property_restrict=*/"prop1"); ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( @@ -2732,7 +2770,7 @@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), UnorderedElementsAre("bar")); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), - ElementsAre(kDocumentId0)); + ElementsAre(kDocumentId2, kDocumentId0)); } TEST_P(QueryVisitorTest, SegmentationTest) { @@ -2753,7 +2791,7 @@ TEST_P(QueryVisitorTest, SegmentationTest) { .Build(), /*ignore_errors_and_delete_documents=*/false, /*allow_circular_schema_definitions=*/false)); - + // Section ids are assigned alphabetically. 
SectionId prop1_section_id = 0; SectionId prop2_section_id = 1; @@ -2765,36 +2803,36 @@ TEST_P(QueryVisitorTest, SegmentationTest) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("上班"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("上班")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(kDocumentId0, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); if (IsCfStringTokenization()) { - editor.BufferTerm("每"); - editor.BufferTerm("天"); + ICING_ASSERT_OK(editor.BufferTerm("每")); + ICING_ASSERT_OK(editor.BufferTerm("天")); } else { - editor.BufferTerm("每天"); + ICING_ASSERT_OK(editor.BufferTerm("每天")); } - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("上班"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("上班")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); if (IsCfStringTokenization()) { - editor.BufferTerm("每"); - editor.BufferTerm("天"); + ICING_ASSERT_OK(editor.BufferTerm("每")); + ICING_ASSERT_OK(editor.BufferTerm("天")); } else { - editor.BufferTerm("每天"); + ICING_ASSERT_OK(editor.BufferTerm("每天")); } - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, ParseQueryHelper(query)); @@ -2859,55 +2897,55 @@ TEST_P(QueryVisitorTest, PropertyRestrictsPopCorrectly) { ICING_ASSERT_OK_AND_ASSIGN(DocumentId 
docid0, document_store_->Put(doc)); Index::Editor editor = index_->Edit(docid0, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.BufferTerm("val1"); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 1: Contains 'val0', 'val1', 'val2' in 'prop1'. Should match. doc = DocumentBuilder(doc).SetUri("uri1").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid1, document_store_->Put(doc)); editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.BufferTerm("val1"); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 2: Contains 'val0', 'val1', 'val2' in 'prop2'. Shouldn't match. doc = DocumentBuilder(doc).SetUri("uri2").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid2, document_store_->Put(doc)); editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.BufferTerm("val1"); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 3: Contains 'val0' in 'prop0', 'val1' in 'prop1' etc. Should match. 
doc = DocumentBuilder(doc).SetUri("uri3").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid3, document_store_->Put(doc)); editor = index_->Edit(docid3, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid3, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val1"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid3, prop2_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 4: Contains 'val1' in 'prop0', 'val2' in 'prop1', 'val0' in 'prop2'. // Shouldn't match. doc = DocumentBuilder(doc).SetUri("uri4").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid4, document_store_->Put(doc)); editor = index_->Edit(docid4, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val1"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid4, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid4, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Now issue a query with 'val1' restricted to 'prop1'. This should match only // docs 1 and 3. 
@@ -2974,55 +3012,55 @@ TEST_P(QueryVisitorTest, UnsatisfiablePropertyRestrictsPopCorrectly) { ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc)); Index::Editor editor = index_->Edit(docid0, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.BufferTerm("val1"); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 1: Contains 'val0', 'val1', 'val2' in 'prop1'. Shouldn't match. doc = DocumentBuilder(doc).SetUri("uri1").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid1, document_store_->Put(doc)); editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.BufferTerm("val1"); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 2: Contains 'val0', 'val1', 'val2' in 'prop2'. Should match. doc = DocumentBuilder(doc).SetUri("uri2").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid2, document_store_->Put(doc)); editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.BufferTerm("val1"); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 3: Contains 'val0' in 'prop0', 'val1' in 'prop1' etc. Should match. 
doc = DocumentBuilder(doc).SetUri("uri3").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid3, document_store_->Put(doc)); editor = index_->Edit(docid3, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid3, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val1"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid3, prop2_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // - Doc 4: Contains 'val1' in 'prop0', 'val2' in 'prop1', 'val0' in 'prop2'. // Shouldn't match. doc = DocumentBuilder(doc).SetUri("uri4").Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid4, document_store_->Put(doc)); editor = index_->Edit(docid4, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val1"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val1")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid4, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val2"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val2")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); editor = index_->Edit(docid4, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("val0"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("val0")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Now issue a query with 'val1' restricted to 'prop1'. This should match only // docs 1 and 3. 
@@ -3194,23 +3232,23 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); editor = index_->Edit(kDocumentId2, prop1_section_id, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // *If* nested function calls were allowed, then this would simplify as: // `search("search(\"foo\") bar")` -> `search("foo bar")` -> `foo bar` @@ -3332,57 +3370,57 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc)); Index::Editor editor = index_->Edit(kDocumentId0, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid1, document_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - 
editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid2, document_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid3, document_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); editor = index_->Edit(docid3, prop3_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid4, document_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); editor = index_->Edit(docid4, prop4_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid5, document_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); editor = index_->Edit(docid5, prop5_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid6, document_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); editor = index_->Edit(docid6, prop6_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid7, document_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); editor = index_->Edit(docid7, prop7_id, 
TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // *If* nested function calls were allowed, then this would simplify as: // `search("search(\"foo\") bar")` -> `search("foo bar")` -> `foo bar` @@ -3512,57 +3550,57 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpanding) { ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc)); Index::Editor editor = index_->Edit(kDocumentId0, prop0_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid1, document_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid2, document_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid3, document_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); editor = index_->Edit(docid3, prop3_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid4, document_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); editor = index_->Edit(docid4, prop4_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - 
editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid5, document_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); editor = index_->Edit(docid5, prop5_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid6, document_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); editor = index_->Edit(docid6, prop6_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK_AND_ASSIGN( DocumentId docid7, document_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); editor = index_->Edit(docid7, prop7_id, TERM_MATCH_PREFIX, ns_id); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // *If* nested function calls were allowed, then this would simplify as: // `search("search(\"foo\") bar")` -> `search("foo bar")` -> `foo bar` @@ -3723,8 +3761,8 @@ TEST_P(QueryVisitorTest, PropertyDefinedFunctionReturnsMatchingDocuments) { DocumentBuilder().SetKey("ns", "uri0").SetSchema("typeWithUrl").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Document 1 has the term "foo" and its schema DOESN'T have the url property. 
ICING_ASSERT_OK(document_store_->Put(DocumentBuilder() @@ -3733,16 +3771,16 @@ TEST_P(QueryVisitorTest, PropertyDefinedFunctionReturnsMatchingDocuments) { .Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Document 2 has the term "bar" and its schema has the url property. ICING_ASSERT_OK(document_store_->Put( DocumentBuilder().SetKey("ns", "uri2").SetSchema("typeWithUrl").Build())); editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("bar"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo propertyDefined(\"url\")"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -3783,8 +3821,8 @@ TEST_P(QueryVisitorTest, DocumentBuilder().SetKey("ns", "uri0").SetSchema("typeWithUrl").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Document 1 has the term "foo" and its schema DOESN'T have the url property. ICING_ASSERT_OK(document_store_->Put(DocumentBuilder() @@ -3793,8 +3831,8 @@ TEST_P(QueryVisitorTest, .Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Attempt to query a non-existent property. 
std::string query = CreateQuery("propertyDefined(\"nonexistentproperty\")"); @@ -3835,8 +3873,8 @@ TEST_P(QueryVisitorTest, DocumentBuilder().SetKey("ns", "uri0").SetSchema("typeWithUrl").Build())); Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); // Document 1 has the term "foo" and its schema DOESN'T have the url property. ICING_ASSERT_OK(document_store_->Put(DocumentBuilder() @@ -3845,8 +3883,8 @@ TEST_P(QueryVisitorTest, .Build())); editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); - editor.BufferTerm("foo"); - editor.IndexAllBufferedTerms(); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); std::string query = CreateQuery("foo AND NOT propertyDefined(\"url\")"); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, @@ -3866,6 +3904,204 @@ TEST_P(QueryVisitorTest, UnorderedElementsAre(kDocumentId1)); } +TEST_F(QueryVisitorTest, + HasPropertyFunctionWithNoArgumentReturnsInvalidArgument) { + std::string query = "hasProperty()"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor); + EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(QueryVisitorTest, + HasPropertyFunctionWithMoreThanOneStringArgumentReturnsInvalidArgument) { + std::string query = "hasProperty(\"foo\", \"bar\")"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> 
root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor); + EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(QueryVisitorTest, + HasPropertyFunctionWithTextArgumentReturnsInvalidArgument) { + // The argument type is TEXT, not STRING here. + std::string query = "hasProperty(foo)"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor); + EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(QueryVisitorTest, + HasPropertyFunctionWithNonStringArgumentReturnsInvalidArgument) { + std::string query = "hasProperty(1 < 2)"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor); + EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_P(QueryVisitorTest, HasPropertyFunctionReturnsMatchingDocuments) { + 
ICING_ASSERT_OK(schema_store_->SetSchema( + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Simple") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("price") + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(), + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + // Document 0 has the term "foo" and has the "price" property. + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri0").SetSchema("Simple").Build())); + Index::Editor editor = index_->Edit(kDocumentId0, kSectionId0, + TERM_MATCH_PREFIX, /*namespace_id=*/0); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, "price").c_str())); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); + + // Document 1 has the term "foo" and doesn't have the "price" property. + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri1").SetSchema("Simple").Build())); + editor = index_->Edit(kDocumentId1, kSectionId0, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); + + // Document 2 has the term "bar" and has the "price" property. + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri2").SetSchema("Simple").Build())); + editor = index_->Edit(kDocumentId2, kSectionId0, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + ICING_ASSERT_OK(editor.BufferTerm("bar")); + ICING_ASSERT_OK(editor.BufferTerm( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, "price").c_str())); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); + + // Test that `foo hasProperty("price")` matches document 0 only. 
+ std::string query = CreateQuery("foo hasProperty(\"price\")"); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor1( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor1); + ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, + std::move(query_visitor1).ConsumeResults()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kHasPropertyFunctionFeature, + kListFilterQueryLanguageFeature)); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), + UnorderedElementsAre(kDocumentId0)); + + // Test that `bar OR NOT hasProperty("price")` matches document 1 and + // document 2. + query = CreateQuery("bar OR NOT hasProperty(\"price\")"); + ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); + QueryVisitor query_visitor2( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor2); + ICING_ASSERT_OK_AND_ASSIGN(query_results, + std::move(query_visitor2).ConsumeResults()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kHasPropertyFunctionFeature, + kListFilterQueryLanguageFeature)); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), + UnorderedElementsAre(kDocumentId1, kDocumentId2)); +} + +TEST_P(QueryVisitorTest, + HasPropertyFunctionReturnsNothingIfNoMatchingProperties) { + ICING_ASSERT_OK(schema_store_->SetSchema( + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Simple") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + 
.SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("price") + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(), + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + // Document 0 has the term "foo" and has the "price" property. + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri0").SetSchema("Simple").Build())); + Index::Editor editor = index_->Edit(kDocumentId0, kSectionId0, + TERM_MATCH_PREFIX, /*namespace_id=*/0); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.BufferTerm( + absl_ports::StrCat(kPropertyExistenceTokenPrefix, "price").c_str())); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); + + // Document 1 has the term "foo" and doesn't have the "price" property. + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri1").SetSchema("Simple").Build())); + editor = index_->Edit(kDocumentId1, kSectionId0, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + ICING_ASSERT_OK(editor.BufferTerm("foo")); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); + + // Attempt to query a non-existent property. 
+ std::string query = CreateQuery("hasProperty(\"nonexistentproperty\")"); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds()); + root_node->Accept(&query_visitor); + ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, + std::move(query_visitor).ConsumeResults()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kHasPropertyFunctionFeature, + kListFilterQueryLanguageFeature)); + + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty()); +} + INSTANTIATE_TEST_SUITE_P(QueryVisitorTest, QueryVisitorTest, testing::Values(QueryType::kPlain, QueryType::kSearch)); diff --git a/icing/query/query-features.h b/icing/query/query-features.h index 158e13e..d829cd7 100644 --- a/icing/query/query-features.h +++ b/icing/query/query-features.h @@ -48,9 +48,13 @@ constexpr Feature kVerbatimSearchFeature = constexpr Feature kListFilterQueryLanguageFeature = "LIST_FILTER_QUERY_LANGUAGE"; // Features#LIST_FILTER_QUERY_LANGUAGE +// This feature relates to the use of the "hasProperty(property_path)" function. 
+constexpr Feature kHasPropertyFunctionFeature = + "HAS_PROPERTY_FUNCTION"; // Features#HAS_PROPERTY_FUNCTION + inline std::unordered_set<Feature> GetQueryFeaturesSet() { return {kNumericSearchFeature, kVerbatimSearchFeature, - kListFilterQueryLanguageFeature}; + kListFilterQueryLanguageFeature, kHasPropertyFunctionFeature}; } } // namespace lib diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 5e0b696..bbfbf3c 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -176,6 +176,12 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseSearch( results.root_iterator = std::make_unique<DocHitInfoIteratorFilter>( std::move(results.root_iterator), &document_store_, &schema_store_, options, current_time_ms); + if (!search_spec.type_property_filters().empty()) { + results.root_iterator = + DocHitInfoIteratorSectionRestrict::ApplyRestrictions( + std::move(results.root_iterator), &document_store_, &schema_store_, + search_spec, current_time_ms); + } return results; } @@ -399,7 +405,7 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseRawQuery( // the section restrict std::set<std::string> section_restricts; section_restricts.insert(std::move(frames.top().section_restrict)); - result_iterator = std::make_unique<DocHitInfoIteratorSectionRestrict>( + result_iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions( std::move(result_iterator), &document_store_, &schema_store_, std::move(section_restricts), current_time_ms); diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 89f3b54..025e8e6 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -81,7 +81,9 @@ void AddTokenToIndex(Index* index, DocumentId document_id, SectionId section_id, std::unique_ptr<Index> CreateIndex(const IcingFilesystem& icing_filesystem, const Filesystem& filesystem, const std::string& index_dir) { - 
Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10); + Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); return Index::Create(options, &filesystem, &icing_filesystem).ValueOrDie(); } @@ -98,7 +100,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 3d3cf48..53e3035 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -17,6 +17,7 @@ #include <cstdint> #include <memory> #include <string> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" @@ -36,6 +37,7 @@ #include "icing/proto/search.pb.h" #include "icing/proto/term.pb.h" #include "icing/query/query-features.h" +#include "icing/query/query-results.h" #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" @@ -69,7 +71,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } @@ -112,7 +115,9 @@ class QueryProcessorTest document_store_ = std::move(create_result.document_store); 
Index::Options options(index_dir_, - /*index_merge_size=*/1024 * 1024); + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); // TODO(b/249829533): switch to use persistent numeric index. @@ -1096,7 +1101,7 @@ TEST_P(QueryProcessorTest, CombinedAndOrTerms) { IsOk()); EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"), IsOk()); - index_->Merge(); + ICING_ASSERT_OK(index_->Merge()); // Document 2 has content "animal kitten cat" EXPECT_THAT( @@ -2645,6 +2650,261 @@ TEST_P(QueryProcessorTest, PropertyFilterTermAndUnrestrictedTerm) { EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal")); } +TEST_P(QueryProcessorTest, TypePropertyFilter) { + // Create the schema and document store + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + // 
SectionIds are assigned in ascending order per schema type, + // alphabetically. + int email_bar_section_id = 0; + int email_baz_section_id = 1; + int email_foo_section_id = 2; + int message_bar_section_id = 0; + int message_baz_section_id = 1; + int message_foo_section_id = 2; + ASSERT_THAT(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOk()); + + // These documents don't actually match to the tokens in the index. We're + // inserting the documents to get the appropriate number of documents and + // schema types populated. + ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .Build())); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("message") + .Build())); + + // Poplate the index + TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY; + + // Email document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_baz_section_id, + term_match_type, "animal"), + IsOk()); + + // Message document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_baz_section_id, + term_match_type, "animal"), + IsOk()); + + SearchSpecProto search_spec; + search_spec.set_query("animal"); + search_spec.set_term_match_type(term_match_type); + 
search_spec.set_search_type(GetParam()); + + // email has property filters for foo and baz properties + TypePropertyMask *email_mask = search_spec.add_type_property_filters(); + email_mask->set_schema_type("email"); + email_mask->add_paths("foo"); + email_mask->add_paths("baz"); + + // message has property filters for bar and baz properties + TypePropertyMask *message_mask = search_spec.add_type_property_filters(); + message_mask->set_schema_type("message"); + message_mask->add_paths("bar"); + message_mask->add_paths("baz"); + + ICING_ASSERT_OK_AND_ASSIGN( + QueryResults results, + query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, + fake_clock_.GetSystemTimeMilliseconds())); + + // Ordered by descending DocumentId, so message comes first since it was + // inserted last + DocHitInfo expected_doc_hit_info1(message_document_id); + expected_doc_hit_info1.UpdateSection(message_bar_section_id); + expected_doc_hit_info1.UpdateSection(message_baz_section_id); + DocHitInfo expected_doc_hit_info2(email_document_id); + expected_doc_hit_info2.UpdateSection(email_foo_section_id); + expected_doc_hit_info2.UpdateSection(email_baz_section_id); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), + ElementsAre(expected_doc_hit_info1, expected_doc_hit_info2)); + EXPECT_THAT(results.query_term_iterators, SizeIs(1)); + + EXPECT_THAT(results.query_terms, SizeIs(1)); + EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal")); +} + +TEST_P(QueryProcessorTest, TypePropertyFilterWithSectionRestrict) { + // Create the schema and document store + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + 
.SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + // SectionIds are assigned in ascending order per schema type, + // alphabetically. + int email_bar_section_id = 0; + int email_baz_section_id = 1; + int email_foo_section_id = 2; + int message_bar_section_id = 0; + int message_baz_section_id = 1; + int message_foo_section_id = 2; + ASSERT_THAT(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOk()); + + // These documents don't actually match to the tokens in the index. We're + // inserting the documents to get the appropriate number of documents and + // schema types populated. 
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .Build())); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("message") + .Build())); + + // Poplate the index + TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY; + + // Email document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_baz_section_id, + term_match_type, "animal"), + IsOk()); + + // Message document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_baz_section_id, + term_match_type, "animal"), + IsOk()); + + SearchSpecProto search_spec; + // Create a section filter '<section name>:<query term>' + search_spec.set_query("foo:animal"); + search_spec.set_term_match_type(term_match_type); + search_spec.set_search_type(GetParam()); + + // email has property filters for foo and baz properties + TypePropertyMask *email_mask = search_spec.add_type_property_filters(); + email_mask->set_schema_type("email"); + email_mask->add_paths("foo"); + email_mask->add_paths("baz"); + + // message has property filters for bar and baz properties + TypePropertyMask *message_mask = search_spec.add_type_property_filters(); + message_mask->set_schema_type("message"); + message_mask->add_paths("bar"); + message_mask->add_paths("baz"); + + ICING_ASSERT_OK_AND_ASSIGN( + QueryResults results, 
+ query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, + fake_clock_.GetSystemTimeMilliseconds())); + + // Only hits in sections allowed by both the property filters and section + // restricts should be returned. Message document should not be returned since + // section foo specified in the section restrict is not allowed by the + // property filters. + DocHitInfo expected_doc_hit_info(email_document_id); + expected_doc_hit_info.UpdateSection(email_foo_section_id); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), + ElementsAre(expected_doc_hit_info)); + EXPECT_THAT(results.query_term_iterators, SizeIs(1)); + + EXPECT_THAT(results.query_terms, SizeIs(1)); + EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal")); +} + TEST_P(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) { // Create the schema and document store SchemaProto schema = SchemaBuilder() @@ -2919,6 +3179,147 @@ TEST_P(QueryProcessorTest, NumericFilterWithoutEnablingFeatureFails) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_P(QueryProcessorTest, GroupingInSectionRestriction) { + if (GetParam() != + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) { + GTEST_SKIP() << "Grouping in section restriction is only supported in " + "advanced query."; + } + + // Create the schema and document store + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("prop2") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ASSERT_THAT(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOk()); + + SectionId prop1_section_id 
= 0; + SectionId prop2_section_id = 1; + TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY; + + // Create documents as follows: + // Doc0: + // prop1: "foo" + // prop2: "bar" + // Doc1: + // prop1: "bar" + // prop2: "foo" + // Doc2: + // prop1: "foo bar" + // prop2: "" + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id0, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "0") + .SetSchema("email") + .Build())); + EXPECT_THAT( + AddTokenToIndex(document_id0, prop1_section_id, term_match_type, "foo"), + IsOk()); + EXPECT_THAT( + AddTokenToIndex(document_id0, prop2_section_id, term_match_type, "bar"), + IsOk()); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .Build())); + EXPECT_THAT( + AddTokenToIndex(document_id1, prop1_section_id, term_match_type, "bar"), + IsOk()); + EXPECT_THAT( + AddTokenToIndex(document_id1, prop2_section_id, term_match_type, "foo"), + IsOk()); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("email") + .Build())); + EXPECT_THAT( + AddTokenToIndex(document_id2, prop1_section_id, term_match_type, "foo"), + IsOk()); + EXPECT_THAT( + AddTokenToIndex(document_id2, prop1_section_id, term_match_type, "bar"), + IsOk()); + + // prop1:(foo bar) <=> prop1:foo AND prop1:bar, which matches doc2. 
+ SearchSpecProto search_spec; + search_spec.set_query("prop1:(foo bar)"); + search_spec.set_term_match_type(term_match_type); + search_spec.set_search_type(GetParam()); + search_spec.add_enabled_features( + std::string(kListFilterQueryLanguageFeature)); + ICING_ASSERT_OK_AND_ASSIGN( + QueryResults results, + query_processor_->ParseSearch(search_spec, + ScoringSpecProto::RankingStrategy::NONE, + fake_clock_.GetSystemTimeMilliseconds())); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), + ElementsAre(EqualsDocHitInfo( + document_id2, std::vector<SectionId>{prop1_section_id}))); + + // prop2:(foo bar) <=> prop2:foo AND prop2:bar, which matches nothing. + search_spec.set_query("prop2:(foo bar)"); + ICING_ASSERT_OK_AND_ASSIGN( + results, query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::NONE, + fake_clock_.GetSystemTimeMilliseconds())); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty()); + + // prop1:(foo -bar) <=> prop1:foo AND -prop1:bar, which matches doc0. + search_spec.set_query("prop1:(foo -bar)"); + ICING_ASSERT_OK_AND_ASSIGN( + results, query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::NONE, + fake_clock_.GetSystemTimeMilliseconds())); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), + ElementsAre(EqualsDocHitInfo( + document_id0, std::vector<SectionId>{prop1_section_id}))); + + // prop2:(-foo OR bar) <=> -prop2:foo OR prop2:bar, which matches doc0 and + // doc2. 
+ search_spec.set_query("prop2:(-foo OR bar)"); + ICING_ASSERT_OK_AND_ASSIGN( + results, query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::NONE, + fake_clock_.GetSystemTimeMilliseconds())); + EXPECT_THAT( + GetDocHitInfos(results.root_iterator.get()), + ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{}), + EqualsDocHitInfo(document_id0, + std::vector<SectionId>{prop2_section_id}))); + + // prop1:((foo AND bar) OR (foo AND -baz)) + // <=> ((prop1:foo AND prop1:bar) OR (prop1:foo AND -prop1:baz)), which + // matches doc0 and doc2. + search_spec.set_query("prop1:((foo AND bar) OR (foo AND -baz))"); + ICING_ASSERT_OK_AND_ASSIGN( + results, query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::NONE, + fake_clock_.GetSystemTimeMilliseconds())); + EXPECT_THAT( + GetDocHitInfos(results.root_iterator.get()), + ElementsAre(EqualsDocHitInfo(document_id2, + std::vector<SectionId>{prop1_section_id}), + EqualsDocHitInfo(document_id0, + std::vector<SectionId>{prop1_section_id}))); +} + INSTANTIATE_TEST_SUITE_P( QueryProcessorTest, QueryProcessorTest, testing::Values( diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc index b1336b3..9f9094d 100644 --- a/icing/query/suggestion-processor_test.cc +++ b/icing/query/suggestion-processor_test.cc @@ -85,17 +85,20 @@ class SuggestionProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + 
/*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_, - /*index_merge_size=*/1024 * 1024); + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); // TODO(b/249829533): switch to use persistent numeric index. diff --git a/icing/result/result-retriever-v2_group-result-limiter_test.cc b/icing/result/result-retriever-v2_group-result-limiter_test.cc index 5d8b589..2914a8d 100644 --- a/icing/result/result-retriever-v2_group-result-limiter_test.cc +++ b/icing/result/result-retriever-v2_group-result-limiter_test.cc @@ -89,13 +89,14 @@ class ResultRetrieverV2GroupResultLimiterTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/result/result-retriever-v2_projection_test.cc b/icing/result/result-retriever-v2_projection_test.cc index 6b868a5..1a75631 100644 --- a/icing/result/result-retriever-v2_projection_test.cc +++ b/icing/result/result-retriever-v2_projection_test.cc @@ -184,13 +184,14 
@@ class ResultRetrieverV2ProjectionTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/result/result-retriever-v2_snippet_test.cc b/icing/result/result-retriever-v2_snippet_test.cc index 27f16a0..440d31c 100644 --- a/icing/result/result-retriever-v2_snippet_test.cc +++ b/icing/result/result-retriever-v2_snippet_test.cc @@ -109,13 +109,14 @@ class ResultRetrieverV2SnippetTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/result/result-retriever-v2_test.cc 
b/icing/result/result-retriever-v2_test.cc index 889dc60..0bd40cc 100644 --- a/icing/result/result-retriever-v2_test.cc +++ b/icing/result/result-retriever-v2_test.cc @@ -220,7 +220,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc index 38d67e8..75d1d93 100644 --- a/icing/result/result-state-manager_test.cc +++ b/icing/result/result-state-manager_test.cc @@ -107,13 +107,14 @@ class ResultStateManagerTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult result, - DocumentStore::Create(&filesystem_, test_dir_, clock_.get(), - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, clock_.get(), schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(result.document_store); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/result/result-state-manager_thread-safety_test.cc b/icing/result/result-state-manager_thread-safety_test.cc index 53745e6..7e7e13c 100644 --- a/icing/result/result-state-manager_thread-safety_test.cc +++ b/icing/result/result-state-manager_thread-safety_test.cc 
@@ -100,13 +100,14 @@ class ResultStateManagerThreadSafetyTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult result, - DocumentStore::Create(&filesystem_, test_dir_, clock_.get(), - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, clock_.get(), schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(result.document_store); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/result/result-state-v2_test.cc b/icing/result/result-state-v2_test.cc index ab29d6e..0f88023 100644 --- a/icing/result/result-state-v2_test.cc +++ b/icing/result/result-state-v2_test.cc @@ -76,13 +76,14 @@ class ResultStateV2Test : public ::testing::Test { filesystem_.CreateDirectoryRecursively(doc_store_base_dir_.c_str()); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult result, - DocumentStore::Create(&filesystem_, doc_store_base_dir_, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_base_dir_, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(result.document_store); num_total_hits_ = 
0; diff --git a/icing/schema-builder.h b/icing/schema-builder.h index e8be483..c74505e 100644 --- a/icing/schema-builder.h +++ b/icing/schema-builder.h @@ -127,6 +127,22 @@ class PropertyConfigBuilder { property_.set_schema_type(std::string(schema_type)); property_.mutable_document_indexing_config()->set_index_nested_properties( index_nested_properties); + property_.mutable_document_indexing_config() + ->clear_indexable_nested_properties_list(); + return *this; + } + + PropertyConfigBuilder& SetDataTypeDocument( + std::string_view schema_type, + std::initializer_list<std::string> indexable_nested_properties_list) { + property_.set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property_.set_schema_type(std::string(schema_type)); + property_.mutable_document_indexing_config()->set_index_nested_properties( + false); + for (const std::string& property : indexable_nested_properties_list) { + property_.mutable_document_indexing_config() + ->add_indexable_nested_properties_list(property); + } return *this; } diff --git a/icing/schema/backup-schema-producer_test.cc b/icing/schema/backup-schema-producer_test.cc index b0e793c..dbd033f 100644 --- a/icing/schema/backup-schema-producer_test.cc +++ b/icing/schema/backup-schema-producer_test.cc @@ -36,6 +36,8 @@ namespace lib { namespace { using ::testing::Eq; +using ::testing::Pointee; +using ::testing::SizeIs; class BackupSchemaProducerTest : public ::testing::Test { protected: @@ -442,6 +444,96 @@ TEST_F(BackupSchemaProducerTest, MakeExtraDocumentIndexedPropertiesUnindexed) { EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup)); } +TEST_F( + BackupSchemaProducerTest, + MakeExtraDocumentIndexedPropertiesWithIndexableNestedPropertiesListUnindexed) { + PropertyConfigBuilder indexed_string_property_builder = + PropertyConfigBuilder() + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN); + PropertyConfigBuilder indexed_int_property_builder = + 
PropertyConfigBuilder() + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE); + SchemaTypeConfigProto typeB = + SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(indexed_string_property_builder.SetName("prop0")) + .AddProperty(indexed_int_property_builder.SetName("prop1")) + .AddProperty(indexed_string_property_builder.SetName("prop2")) + .AddProperty(indexed_int_property_builder.SetName("prop3")) + .AddProperty(indexed_string_property_builder.SetName("prop4")) + .AddProperty(indexed_int_property_builder.SetName("prop5")) + .AddProperty(indexed_string_property_builder.SetName("prop6")) + .AddProperty(indexed_int_property_builder.SetName("prop7")) + .AddProperty(indexed_string_property_builder.SetName("prop8")) + .AddProperty(indexed_int_property_builder.SetName("prop9")) + .Build(); + + // Create indexed document property by using indexable nested properties list. + PropertyConfigBuilder indexed_document_property_with_list_builder = + PropertyConfigBuilder() + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument( + "TypeB", /*indexable_nested_properties_list=*/{ + "prop0", "prop1", "prop2", "prop3", "prop4", "prop5", + "unknown1", "unknown2", "unknown3"}); + SchemaTypeConfigProto typeA = + SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty( + indexed_document_property_with_list_builder.SetName("propA")) + .AddProperty( + indexed_document_property_with_list_builder.SetName("propB")) + .Build(); + + SchemaProto schema = SchemaBuilder().AddType(typeA).AddType(typeB).Build(); + + SchemaUtil::TypeConfigMap type_config_map; + SchemaUtil::BuildTypeConfigMap(schema, &type_config_map); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper, + DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_, + /*maximum_size_bytes=*/10000)); + ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk()); + ASSERT_THAT(type_id_mapper->Put("TypeB", 1), IsOk()); + + 
ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaTypeManager> schema_type_manager, + SchemaTypeManager::Create(type_config_map, type_id_mapper.get())); + ASSERT_THAT(schema_type_manager->section_manager().GetMetadataList("TypeA"), + IsOkAndHolds(Pointee(SizeIs(18)))); + + ICING_ASSERT_OK_AND_ASSIGN( + BackupSchemaProducer backup_producer, + BackupSchemaProducer::Create(schema, + schema_type_manager->section_manager())); + EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true)); + SchemaProto backup = std::move(backup_producer).Produce(); + + PropertyConfigProto unindexed_document_property = + PropertyConfigBuilder() + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataType(TYPE_DOCUMENT) + .Build(); + unindexed_document_property.set_schema_type("TypeB"); + PropertyConfigBuilder unindexed_document_property_builder( + unindexed_document_property); + + // "propA" and "propB" both have 9 sections respectively, so we have to drop + // "propB" indexing config to make total # of sections <= 16. 
+ SchemaTypeConfigProto expected_typeA = + SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty( + indexed_document_property_with_list_builder.SetName("propA")) + .AddProperty(unindexed_document_property_builder.SetName("propB")) + .Build(); + SchemaProto expected_backup = + SchemaBuilder().AddType(expected_typeA).AddType(typeB).Build(); + EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup)); +} + TEST_F(BackupSchemaProducerTest, MakeRfcPropertiesUnindexedFirst) { PropertyConfigBuilder indexed_string_property_builder = PropertyConfigBuilder() @@ -539,31 +631,33 @@ TEST_F(BackupSchemaProducerTest, MakeExtraPropertiesUnindexedMultipleTypes) { .AddProperty(indexed_string_property_builder.SetName("prop2")) .AddProperty(indexed_int_property_builder.SetName("prop3")) .AddProperty(indexed_string_property_builder.SetName("prop4")) - .AddProperty(indexed_int_property_builder.SetName("prop5")) - .AddProperty(indexed_string_property_builder.SetName("prop6")) - .AddProperty(indexed_int_property_builder.SetName("prop7")) - .AddProperty(indexed_string_property_builder.SetName("prop8")) - .AddProperty(indexed_int_property_builder.SetName("prop9")) .Build(); PropertyConfigBuilder indexed_document_property_builder = PropertyConfigBuilder() .SetCardinality(CARDINALITY_OPTIONAL) .SetDataTypeDocument("TypeB", /*index_nested_properties=*/true); + PropertyConfigBuilder indexed_document_property_with_list_builder = + PropertyConfigBuilder() + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument( + "TypeB", /*indexable_nested_properties_list=*/{ + "prop0", "prop4", "unknown1", "unknown2", "unknown3"}); SchemaTypeConfigProto typeA = SchemaTypeConfigBuilder() .SetType("TypeA") .AddProperty(indexed_string_property_builder.SetName("propA")) - .AddProperty(indexed_int_property_builder.SetName("propB")) + .AddProperty( + indexed_document_property_with_list_builder.SetName("propB")) .AddProperty(indexed_string_property_builder.SetName("propC")) - 
.AddProperty(indexed_int_property_builder.SetName("propD")) + .AddProperty(indexed_document_property_builder.SetName("propD")) .AddProperty(indexed_string_property_builder.SetName("propE")) .AddProperty(indexed_int_property_builder.SetName("propF")) - .AddProperty(indexed_string_property_builder.SetName("propG")) - .AddProperty(indexed_int_property_builder.SetName("propH")) - .AddProperty(indexed_document_property_builder.SetName("propI")) - .AddProperty(indexed_string_property_builder.SetName("propJ")) - .AddProperty(indexed_int_property_builder.SetName("propK")) + .AddProperty(indexed_document_property_builder.SetName("propG")) + .AddProperty(indexed_string_property_builder.SetName("propH")) + .AddProperty(indexed_int_property_builder.SetName("propI")) + .AddProperty( + indexed_document_property_with_list_builder.SetName("propJ")) .Build(); SchemaProto schema = SchemaBuilder().AddType(typeA).AddType(typeB).Build(); @@ -580,6 +674,8 @@ TEST_F(BackupSchemaProducerTest, MakeExtraPropertiesUnindexedMultipleTypes) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaTypeManager> schema_type_manager, SchemaTypeManager::Create(type_config_map, type_id_mapper.get())); + ASSERT_THAT(schema_type_manager->section_manager().GetMetadataList("TypeA"), + IsOkAndHolds(Pointee(SizeIs(26)))); ICING_ASSERT_OK_AND_ASSIGN( BackupSchemaProducer backup_producer, @@ -605,20 +701,30 @@ TEST_F(BackupSchemaProducerTest, MakeExtraPropertiesUnindexedMultipleTypes) { PropertyConfigBuilder unindexed_document_property_builder( unindexed_document_property); + // On version 0 (Android T): + // - Only "propA", "propC", "propD.prop0", "propD.prop1", "propD.prop2", + // "propD.prop3", "propD.prop4", "propE", "propF" will be assigned sections. + // - Unlike version 2, "propB.prop0", "propB.prop4", "propB.unknown1", + // "propB.unknown2", "propB.unknown3" will be ignored because version 0 + // doesn't recognize indexable nested properties list. + // - So there will be only 9 sections on version 0. 
We still have potential to + // avoid dropping "propG", "propH", "propI" indexing configs on version 0 + // (in this case it will be 16 sections), but it is ok to make it simple as + // long as total # of sections <= 16. SchemaTypeConfigProto expected_typeA = SchemaTypeConfigBuilder() .SetType("TypeA") .AddProperty(indexed_string_property_builder.SetName("propA")) - .AddProperty(indexed_int_property_builder.SetName("propB")) + .AddProperty( + indexed_document_property_with_list_builder.SetName("propB")) .AddProperty(indexed_string_property_builder.SetName("propC")) - .AddProperty(indexed_int_property_builder.SetName("propD")) + .AddProperty(indexed_document_property_builder.SetName("propD")) .AddProperty(indexed_string_property_builder.SetName("propE")) .AddProperty(indexed_int_property_builder.SetName("propF")) - .AddProperty(indexed_string_property_builder.SetName("propG")) - .AddProperty(indexed_int_property_builder.SetName("propH")) - .AddProperty(unindexed_document_property_builder.SetName("propI")) - .AddProperty(unindexed_string_property_builder.SetName("propJ")) - .AddProperty(unindexed_int_property_builder.SetName("propK")) + .AddProperty(unindexed_document_property_builder.SetName("propG")) + .AddProperty(unindexed_string_property_builder.SetName("propH")) + .AddProperty(unindexed_int_property_builder.SetName("propI")) + .AddProperty(unindexed_document_property_builder.SetName("propJ")) .Build(); SchemaProto expected_backup = SchemaBuilder().AddType(expected_typeA).AddType(typeB).Build(); diff --git a/icing/schema/property-util.cc b/icing/schema/property-util.cc index 7370328..67ff748 100644 --- a/icing/schema/property-util.cc +++ b/icing/schema/property-util.cc @@ -16,11 +16,9 @@ #include <string> #include <string_view> -#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/absl_ports/str_join.h" #include 
"icing/proto/document.pb.h" @@ -85,6 +83,23 @@ std::vector<PropertyInfo> ParsePropertyPathExpr( return property_infos; } +bool IsParentPropertyPath(std::string_view property_path_expr1, + std::string_view property_path_expr2) { + if (property_path_expr2.length() < property_path_expr1.length()) { + return false; + } + if (property_path_expr1 != + property_path_expr2.substr(0, property_path_expr1.length())) { + return false; + } + if (property_path_expr2.length() > property_path_expr1.length() && + property_path_expr2[property_path_expr1.length()] != + kPropertyPathSeparator[0]) { + return false; + } + return true; +} + const PropertyProto* GetPropertyProto(const DocumentProto& document, std::string_view property_name) { for (const PropertyProto& property : document.properties()) { diff --git a/icing/schema/property-util.h b/icing/schema/property-util.h index efa599c..7557879 100644 --- a/icing/schema/property-util.h +++ b/icing/schema/property-util.h @@ -113,6 +113,26 @@ PropertyInfo ParsePropertyNameExpr(std::string_view property_name_expr); std::vector<PropertyInfo> ParsePropertyPathExpr( std::string_view property_path_expr); +// A property path property_path_expr1 is considered a parent of another +// property path property_path_expr2 if: +// 1. property_path_expr2 == property_path_expr1, OR +// 2. property_path_expr2 consists of the entire path of property_path_expr1 +// + "." + [some other property path]. +// +// Note that this can only be used for property name strings that do not +// contain the property index. +// +// Examples: +// - IsParentPropertyPath("foo", "foo") will return true. +// - IsParentPropertyPath("foo", "foo.bar") will return true. +// - IsParentPropertyPath("foo", "bar.foo") will return false. +// - IsParentPropertyPath("foo.bar", "foo.foo.bar") will return false. +// +// Returns: true if property_path_expr1 is a parent property path of +// property_path_expr2. 
+bool IsParentPropertyPath(std::string_view property_path_expr1, + std::string_view property_path_expr2); + // Gets the desired PropertyProto from the document by given property name. // Since the input parameter is property name, this function only deals with // the first level of properties in the document and cannot deal with nested diff --git a/icing/schema/property-util_test.cc b/icing/schema/property-util_test.cc index 1fabb32..eddcc84 100644 --- a/icing/schema/property-util_test.cc +++ b/icing/schema/property-util_test.cc @@ -43,6 +43,23 @@ static constexpr std::string_view kTypeNestedTest = "NestedTest"; static constexpr std::string_view kPropertyStr = "str"; static constexpr std::string_view kPropertyNestedDocument = "nestedDocument"; +TEST(PropertyUtilTest, IsParentPropertyPath) { + EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo")); + EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo.bar")); + EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo.bar.foo")); + EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo.foo.bar")); + EXPECT_TRUE(property_util::IsParentPropertyPath("foo.bar", "foo.bar.foo")); + + EXPECT_FALSE(property_util::IsParentPropertyPath("foo", "foofoo.bar")); + EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar", "foo.foo.bar")); + EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar", "foofoo.bar")); + EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar.foo", "foo")); + EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar.foo", "foo.bar")); + EXPECT_FALSE( + property_util::IsParentPropertyPath("foo.foo.bar", "foo.bar.foo")); + EXPECT_FALSE(property_util::IsParentPropertyPath("foo", "foo#bar.foo")); +} + TEST(PropertyUtilTest, ExtractPropertyValuesTypeString) { PropertyProto property; property.mutable_string_values()->Add("Hello, world"); diff --git a/icing/schema/schema-property-iterator.cc b/icing/schema/schema-property-iterator.cc index e1078c2..8fc245c 100644 --- 
a/icing/schema/schema-property-iterator.cc +++ b/icing/schema/schema-property-iterator.cc @@ -14,9 +14,17 @@ #include "icing/schema/schema-property-iterator.h" +#include <algorithm> +#include <string> +#include <unordered_set> +#include <utility> +#include <vector> + #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema/property-util.h" namespace icing { namespace lib { @@ -27,16 +35,63 @@ libtextclassifier3::Status SchemaPropertyIterator::Advance() { // When finishing iterating all properties of the current level, pop it // from the stack (levels_), return to the previous level and resume the // iteration. - parent_type_config_names_.erase(levels_.back().GetSchemaTypeName()); + parent_type_config_names_.erase( + parent_type_config_names_.find(levels_.back().GetSchemaTypeName())); levels_.pop_back(); continue; } const PropertyConfigProto& curr_property_config = levels_.back().GetCurrentPropertyConfig(); + std::string curr_property_path = levels_.back().GetCurrentPropertyPath(); + + // Iterate through the sorted_top_level_indexable_nested_properties_ in + // order until we find the first element that is >= curr_property_path. + while (current_top_level_indexable_nested_properties_idx_ < + sorted_top_level_indexable_nested_properties_.size() && + sorted_top_level_indexable_nested_properties_.at( + current_top_level_indexable_nested_properties_idx_) < + curr_property_path) { + // If an element in sorted_top_level_indexable_nested_properties_ < the + // current property path, it means that we've already iterated past the + // possible position for it without seeing it. + // It's not a valid property path in our schema definition. Add it to + // unknown_indexable_nested_properties_ and advance + // current_top_level_indexable_nested_properties_idx_. 
+ unknown_indexable_nested_property_paths_.push_back( + sorted_top_level_indexable_nested_properties_.at( + current_top_level_indexable_nested_properties_idx_)); + ++current_top_level_indexable_nested_properties_idx_; + } + if (curr_property_config.data_type() != PropertyConfigProto::DataType::DOCUMENT) { // We've advanced to a leaf property. + // Set whether this property is indexable according to its level's + // indexable config. If this property is declared in + // indexable_nested_properties_list of the top-level schema, it is also + // nested indexable. + std::string* current_indexable_nested_prop = + current_top_level_indexable_nested_properties_idx_ < + sorted_top_level_indexable_nested_properties_.size() + ? &sorted_top_level_indexable_nested_properties_.at( + current_top_level_indexable_nested_properties_idx_) + : nullptr; + if (current_indexable_nested_prop == nullptr || + *current_indexable_nested_prop > curr_property_path) { + // Current property is not in the indexable list. Set it as indexable if + // its schema level is indexable AND it is an indexable property. + bool is_property_indexable = + levels_.back().GetLevelNestedIndexable() && + SchemaUtil::IsIndexedProperty(curr_property_config); + levels_.back().SetCurrentPropertyIndexable(is_property_indexable); + } else if (*current_indexable_nested_prop == curr_property_path) { + // Current property is in the indexable list. Set its indexable config + // to true. This property will consume a sectionId regardless of whether + // or not it is actually indexable. 
+ levels_.back().SetCurrentPropertyIndexable(true); + ++current_top_level_indexable_nested_properties_idx_; + } return libtextclassifier3::Status::OK; } @@ -55,28 +110,87 @@ libtextclassifier3::Status SchemaPropertyIterator::Advance() { return absl_ports::NotFoundError(absl_ports::StrCat( "Type config not found: ", curr_property_config.schema_type())); } + const SchemaTypeConfigProto& nested_type_config = + nested_type_config_iter->second; - if (parent_type_config_names_.count( - nested_type_config_iter->second.schema_type()) > 0) { + if (levels_.back().GetLevelNestedIndexable()) { + // We should set sorted_top_level_indexable_nested_properties_ to the list + // defined by the current level. + // GetLevelNestedIndexable() is true either because: + // 1. We're looking at a document property of the top-level schema -- + // The first LevelInfo for the iterator is initialized with + // all_nested_properties_indexable_ = true. + // 2. All previous levels set index_nested_properties = true: + // This indicates that upper-level schema types want to follow nested + // properties definition of its document subtypes. If this is the first + // subtype level that defines a list, we should set it as + // top_level_indexable_nested_properties_ for the current top-level + // schema. + sorted_top_level_indexable_nested_properties_.clear(); + sorted_top_level_indexable_nested_properties_.reserve( + curr_property_config.document_indexing_config() + .indexable_nested_properties_list() + .size()); + for (const std::string& property : + curr_property_config.document_indexing_config() + .indexable_nested_properties_list()) { + // Concat the current property name to each property to get the full + // property path expression for each indexable nested property. 
+ sorted_top_level_indexable_nested_properties_.push_back( + property_util::ConcatenatePropertyPathExpr(curr_property_path, + property)); + } + current_top_level_indexable_nested_properties_idx_ = 0; + // Sort elements and dedupe + std::sort(sorted_top_level_indexable_nested_properties_.begin(), + sorted_top_level_indexable_nested_properties_.end()); + auto last = + std::unique(sorted_top_level_indexable_nested_properties_.begin(), + sorted_top_level_indexable_nested_properties_.end()); + sorted_top_level_indexable_nested_properties_.erase( + last, sorted_top_level_indexable_nested_properties_.end()); + } + + bool is_cycle = + parent_type_config_names_.find(nested_type_config.schema_type()) != + parent_type_config_names_.end(); + bool is_parent_property_path = + current_top_level_indexable_nested_properties_idx_ < + sorted_top_level_indexable_nested_properties_.size() && + property_util::IsParentPropertyPath( + curr_property_path, + sorted_top_level_indexable_nested_properties_.at( + current_top_level_indexable_nested_properties_idx_)); + if (is_cycle && !is_parent_property_path) { // Cycle detected. The schema definition is guaranteed to be valid here // since it must have already been validated during SchemaUtil::Validate, // which would have rejected any schema with bad cycles. // + // There are no properties in the indexable_nested_properties_list that + // are a part of this circular reference. // We do not need to iterate this type further so we simply move on to // other properties in the parent type. 
continue; } - std::string curr_property_path = levels_.back().GetCurrentPropertyPath(); - bool is_nested_indexable = levels_.back().GetCurrentNestedIndexable() && - curr_property_config.document_indexing_config() - .index_nested_properties(); - levels_.push_back(LevelInfo(nested_type_config_iter->second, + bool all_nested_properties_indexable = + levels_.back().GetLevelNestedIndexable() && + curr_property_config.document_indexing_config() + .index_nested_properties(); + levels_.push_back(LevelInfo(nested_type_config, std::move(curr_property_path), - is_nested_indexable)); - parent_type_config_names_.insert( - nested_type_config_iter->second.schema_type()); + all_nested_properties_indexable)); + parent_type_config_names_.insert(nested_type_config.schema_type()); } + + // Before returning, move all remaining uniterated properties from + // sorted_top_level_indexable_nested_properties_ into + // unknown_indexable_nested_properties_. + std::move(sorted_top_level_indexable_nested_properties_.begin() + + current_top_level_indexable_nested_properties_idx_, + sorted_top_level_indexable_nested_properties_.end(), + std::back_inserter(unknown_indexable_nested_property_paths_)); + return absl_ports::OutOfRangeError("End of iterator"); } diff --git a/icing/schema/schema-property-iterator.h b/icing/schema/schema-property-iterator.h index f60a56e..66b8f32 100644 --- a/icing/schema/schema-property-iterator.h +++ b/icing/schema/schema-property-iterator.h @@ -18,6 +18,9 @@ #include <algorithm> #include <numeric> #include <string> +#include <string_view> +#include <unordered_set> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -44,7 +47,7 @@ class SchemaPropertyIterator { : type_config_map_(type_config_map) { levels_.push_back(LevelInfo(base_schema_type_config, /*base_property_path=*/"", - /*is_nested_indexable=*/true)); + /*all_nested_properties_indexable=*/true)); 
parent_type_config_names_.insert(base_schema_type_config.schema_type()); } @@ -62,11 +65,31 @@ class SchemaPropertyIterator { return levels_.back().GetCurrentPropertyPath(); } - // Gets if the current property is nested indexable. + // Returns whether the current property is indexable. This would be true if + // either the current level is nested indexable, or if the current property is + // declared indexable in the indexable_nested_properties_list of the top-level + // schema type. // // REQUIRES: The preceding call for Advance() is OK. - bool GetCurrentNestedIndexable() const { - return levels_.back().GetCurrentNestedIndexable(); + bool GetCurrentPropertyIndexable() const { + return levels_.back().GetCurrentPropertyIndexable(); + } + + // Returns whether the current schema level is nested indexable. If this is + // true, all properties in the level are indexed. + // + // REQUIRES: The preceding call for Advance() is OK. + bool GetLevelNestedIndexable() const { + return levels_.back().GetLevelNestedIndexable(); + } + + // The set of indexable nested properties that are defined in the + // indexable_nested_properties_list but are not found in the schema + // definition. These properties still consume sectionIds, but will not be + // indexed. + const std::vector<std::string>& unknown_indexable_nested_property_paths() + const { + return unknown_indexable_nested_property_paths_; } // Advances to the next leaf property. 
@@ -87,12 +110,14 @@ class SchemaPropertyIterator { class LevelInfo { public: explicit LevelInfo(const SchemaTypeConfigProto& schema_type_config, - std::string base_property_path, bool is_nested_indexable) + std::string base_property_path, + bool all_nested_properties_indexable) : schema_type_config_(schema_type_config), base_property_path_(std::move(base_property_path)), sorted_property_indices_(schema_type_config.properties_size()), current_vec_idx_(-1), - is_nested_indexable_(is_nested_indexable) { + sorted_property_indexable_(schema_type_config.properties_size()), + all_nested_properties_indexable_(all_nested_properties_indexable) { // Index sort property by lexicographical order. std::iota(sorted_property_indices_.begin(), sorted_property_indices_.end(), @@ -119,7 +144,17 @@ class SchemaPropertyIterator { base_property_path_, GetCurrentPropertyConfig().property_name()); } - bool GetCurrentNestedIndexable() const { return is_nested_indexable_; } + bool GetLevelNestedIndexable() const { + return all_nested_properties_indexable_; + } + + bool GetCurrentPropertyIndexable() const { + return sorted_property_indexable_[current_vec_idx_]; + } + + void SetCurrentPropertyIndexable(bool indexable) { + sorted_property_indexable_[current_vec_idx_] = indexable; + } std::string_view GetSchemaTypeName() const { return schema_type_config_.schema_type(); @@ -137,12 +172,20 @@ class SchemaPropertyIterator { std::vector<int> sorted_property_indices_; int current_vec_idx_; - // Indicates if the current level is nested indexable. Document type - // property has index_nested_properties flag indicating whether properties - // under this level should be indexed or not. If any of parent document type - // property sets its flag false, then all child level properties should not - // be indexed. - bool is_nested_indexable_; + // Vector indicating whether each property in the current level is + // indexable. 
We can declare different indexable settings for properties in + // the same level using indexable_nested_properties_list. + // + // Element indices in this vector correspond to property indices in the + // sorted order. + std::vector<bool> sorted_property_indexable_; + + // Indicates if all properties in the current level is nested indexable. + // This would be true for a level if the document declares + // index_nested_properties=true. If any of parent document type + // property sets its flag false, then this would be false for all its child + // properties. + bool all_nested_properties_indexable_; }; const SchemaUtil::TypeConfigMap& type_config_map_; // Does not own @@ -154,7 +197,23 @@ class SchemaPropertyIterator { // Maintaining all traversed parent schema type config names of the current // stack (levels_). It is used to detect nested schema cycle dependency. - std::unordered_set<std::string_view> parent_type_config_names_; + std::unordered_multiset<std::string_view> parent_type_config_names_; + + // Sorted list of indexable nested properties for the top-level schema. + std::vector<std::string> sorted_top_level_indexable_nested_properties_; + + // Current iteration index in the sorted_top_level_indexable_nested_properties + // list. + int current_top_level_indexable_nested_properties_idx_ = 0; + + // Vector of indexable nested properties defined in the + // indexable_nested_properties_list, but not found in the schema definition. + // These properties still consume sectionIds, but will not be indexed. + // Properties are inserted into this vector in sorted order. + // + // TODO(b/289152024): Implement support for indexing these properties if they + // are in the child types of polymorphic nested properties. 
+ std::vector<std::string> unknown_indexable_nested_property_paths_; }; } // namespace lib diff --git a/icing/schema/schema-property-iterator_test.cc b/icing/schema/schema-property-iterator_test.cc index 080d574..2b0226d 100644 --- a/icing/schema/schema-property-iterator_test.cc +++ b/icing/schema/schema-property-iterator_test.cc @@ -14,6 +14,7 @@ #include "icing/schema/schema-property-iterator.h" +#include <initializer_list> #include <string> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -30,7 +31,9 @@ namespace lib { namespace { using portable_equals_proto::EqualsProto; +using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::IsEmpty; using ::testing::IsFalse; using ::testing::IsTrue; @@ -41,13 +44,14 @@ TEST(SchemaPropertyIteratorTest, SchemaTypeConfigProto schema_type_config = SchemaTypeConfigBuilder() .SetType(schema_type_name) - .AddProperty(PropertyConfigBuilder().SetName("Google").SetDataType( - TYPE_STRING)) + .AddProperty( + PropertyConfigBuilder().SetName("Google").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) .AddProperty(PropertyConfigBuilder().SetName("Youtube").SetDataType( TYPE_BYTES)) .AddProperty(PropertyConfigBuilder() .SetName("Alphabet") - .SetDataType(TYPE_INT64)) + .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)) .Build(); SchemaUtil::TypeConfigMap type_config_map = { {schema_type_name, schema_type_config}}; @@ -57,22 +61,24 @@ TEST(SchemaPropertyIteratorTest, EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Alphabet")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config.properties(2))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + 
EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Youtube")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty()); } TEST(SchemaPropertyIteratorTest, @@ -84,19 +90,20 @@ TEST(SchemaPropertyIteratorTest, SchemaTypeConfigProto schema_type_config1 = SchemaTypeConfigBuilder() .SetType(schema_type_name1) - .AddProperty(PropertyConfigBuilder().SetName("Google").SetDataType( - TYPE_STRING)) + .AddProperty( + PropertyConfigBuilder().SetName("Google").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) .AddProperty(PropertyConfigBuilder().SetName("Youtube").SetDataType( TYPE_BYTES)) .AddProperty(PropertyConfigBuilder() .SetName("Alphabet") - .SetDataType(TYPE_INT64)) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE)) .Build(); SchemaTypeConfigProto schema_type_config2 = SchemaTypeConfigBuilder() .SetType(schema_type_name2) - .AddProperty( - PropertyConfigBuilder().SetName("Foo").SetDataType(TYPE_STRING)) + .AddProperty(PropertyConfigBuilder().SetName("Foo").SetDataTypeString( + TERM_MATCH_UNKNOWN, TOKENIZER_NONE)) .AddProperty( PropertyConfigBuilder().SetName("Bar").SetDataTypeDocument( schema_type_name1, /*index_nested_properties=*/true)) @@ -105,7 +112,8 @@ TEST(SchemaPropertyIteratorTest, SchemaTypeConfigBuilder() .SetType(schema_type_name3) .AddProperty( - PropertyConfigBuilder().SetName("Hello").SetDataType(TYPE_STRING)) + PropertyConfigBuilder().SetName("Hello").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) .AddProperty( PropertyConfigBuilder().SetName("World").SetDataTypeDocument( schema_type_name1, 
/*index_nested_properties=*/true)) @@ -139,52 +147,54 @@ TEST(SchemaPropertyIteratorTest, EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Hello")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config3.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Bar.Alphabet")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(2))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Bar.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Bar.Youtube")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Foo")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config2.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World.Alphabet")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(2))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), 
IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World.Youtube")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty()); } TEST(SchemaPropertyIteratorTest, @@ -234,6 +244,7 @@ TEST(SchemaPropertyIteratorTest, SchemaPropertyIterator iterator(schema_type_config, type_config_map); EXPECT_THAT(iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty()); } TEST(SchemaPropertyIteratorTest, NestedIndexable) { @@ -338,13 +349,13 @@ TEST(SchemaPropertyIteratorTest, NestedIndexable) { EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz1.Bar.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz1.Foo")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config2.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); 
// For Baz2, the parent level sets index_nested_properties = false, so all // leaf properties in child levels should be nested unindexable even if @@ -353,13 +364,13 @@ TEST(SchemaPropertyIteratorTest, NestedIndexable) { EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz2.Bar.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz2.Foo")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config2.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); // For Baz3, the parent level sets index_nested_properties = true, but the // child level sets index_nested_properties = false. @@ -369,13 +380,13 @@ TEST(SchemaPropertyIteratorTest, NestedIndexable) { EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz3.Bar.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz3.Foo")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config2.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); // For Baz4, all levels set index_nested_properties = false, so all leaf // properties should be nested unindexable. 
@@ -383,37 +394,1498 @@ TEST(SchemaPropertyIteratorTest, NestedIndexable) { EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz4.Bar.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz4.Foo")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config2.properties(1))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); // Verify 1 and 0 level of nested document type properties. EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Hello1.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Hello2.Google")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config1.properties(0))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(iterator.Advance(), IsOk()); EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World")); EXPECT_THAT(iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config4.properties(6))); - EXPECT_THAT(iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, + 
IndexableNestedPropertiesList_singleNestedLevel) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop3") + .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)) + .AddProperty(PropertyConfigBuilder() + .SetName("schema1prop4") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE)) + .AddProperty(PropertyConfigBuilder() + .SetName("schema1prop5") + .SetDataType(TYPE_BOOLEAN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/{"schema1prop2", + "schema1prop3", + "schema1prop5"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty(PropertyConfigBuilder() + .SetName("schema2prop3") + .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}}; + + // Order of iteration for Schema2: + // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2", + // "schema2prop1.schema1prop3", "schema2prop1.schema1prop4", + // "schema2prop1.schema1prop5", "schema2prop2", "schema2prop3"} + // + // Indexable properties: + // {"schema2prop1.schema1prop2", "schema2prop1.schema1prop3", + // "schema2prop1.schema1prop5", "schema2prop2"}. 
+ // + // "schema2prop1.schema1prop4" is indexable by its indexing-config, but is not + // considered indexable for Schema2 because Schema2 sets its + // index_nested_properties config to false, and "schema1prop4" is not + // in the indexable_nested_properties_list for schema2prop1. + // + // "schema2prop1.schema1prop1", "schema2prop1.schema1prop3" and + // "schema2prop1.schema1prop5" are non-indexable by its indexing-config. + // However "schema2prop1.schema1prop3" and "schema2prop1.schema1prop5" are + // indexed as it appears in the indexable_list. + SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop4")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(3))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), 
IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop5")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(4))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Iterate through schema1 properties. Schema1 only has non-document type leaf + // properties, so its properties will be assigned indexable or not according + // to their indexing configs. 
+ SchemaPropertyIterator schema1_iterator(schema_type_config1, type_config_map); + + EXPECT_THAT(schema1_iterator.Advance(), IsOk()); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop1")); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema1_iterator.Advance(), IsOk()); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop2")); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema1_iterator.Advance(), IsOk()); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop3")); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema1_iterator.Advance(), IsOk()); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop4")); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(3))); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema1_iterator.Advance(), IsOk()); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop5")); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(4))); + EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema1_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema1_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, + IndexableNestedPropertiesList_indexBooleanTrueDoesNotAffectOtherLevels) { + std::string schema_type_name1 = 
"SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop3") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument(schema_type_name1, + /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop3") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop3") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/{"schema1prop1", + "schema1prop3"})) + .AddProperty(PropertyConfigBuilder() + .SetName("schema3prop1") + .SetDataTypeDocument( + schema_type_name2, + /*indexable_nested_properties_list=*/ + {"schema2prop2", "schema2prop1.schema1prop1", + "schema2prop1.schema1prop3"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}}; + + // Order of iteration for 
Schema3: + // {"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop2", + // "schema3prop1.schema2prop1.schema1prop3", + // "schema3prop1.schema2prop2", "schema3prop1.schema2prop3", "schema3prop2", + // "schema3prop3.schema1prop1", "schema3prop3.schema1prop2", + // "schema3prop3.schema1prop3"}. + // + // Indexable properties: + // {"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop3", + // "schema3prop1.schema2prop2", "schema3prop2", "schema3prop3.schema1prop1", + // "schema3prop3.schema1prop3"} + // + // Schema2 setting index_nested_properties=true does not affect nested + // properties indexing for Schema3. + SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + 
EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("schema3prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config3.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop3.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop3.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop3.schema1prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for Schema2: + // 
{"schema2prop1.schema1prop1", "schema2prop1.schema1prop2", + // "schema2prop1.schema1prop3", "schema2prop2", "schema2prop3"} + // + // Indexable properties: + // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2", + // "schema2prop1.schema1prop3", "schema2prop2", "schema2prop3"} + // + // All properties are indexed because index_nested_properties=true for + // Schema2.schema2prop1. Schema3's indexable_nested_properties setting does + // not affect this. + SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop3")); + 
EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, + IndexableNestedPropertiesList_indexBooleanFalseDoesNotAffectOtherLevels) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument(schema_type_name1, + /*index_nested_properties=*/false)) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop1") + .SetDataTypeDocument(schema_type_name2, + /*indexable_nested_properties_list=*/ + std::initializer_list<std::string>{ + "schema2prop1.schema1prop2"})) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}}; + + // Order of iteration for Schema3: + // {"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop2"}. 
+ // + // Indexable properties: {"schema3prop1.schema2prop1.schema1prop2"} + // + // Schema2 setting index_nested_properties=false, does not affect Schema3's + // indexable list. + SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for Schema2: + // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2"} + // + // Indexable properties: None + // + // The indexable list for Schema3 does not propagate to Schema2. 
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, + IndexableNestedPropertiesList_indexableSetDoesNotAffectOtherLevels) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop3") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + std::initializer_list<std::string>{"schema1prop2"})) 
+ .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop3") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop3") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/{"schema1prop1", + "schema1prop3"})) + .AddProperty(PropertyConfigBuilder() + .SetName("schema3prop1") + .SetDataTypeDocument( + schema_type_name2, + /*indexable_nested_properties_list=*/ + {"schema2prop2", "schema2prop1.schema1prop1", + "schema2prop1.schema1prop3"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}}; + + // Order of iteration for Schema3: + // {"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop2", + // "schema3prop1.schema2prop1.schema1prop3", + // "schema3prop1.schema2prop2", "schema3prop1.schema2prop3", "schema3prop2", + // "schema3prop3.schema1prop1", "schema3prop3.schema1prop2", + // "schema3prop3.schema1prop3"}. + // + // Indexable properties: + // {"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop3", + // "schema3prop1.schema2prop2", "schema3prop2", "schema3prop3.schema1prop1", + // "schema3prop3.schema1prop3"} + // + // Schema2 setting indexable_nested_properties_list={schema1prop2} does not + // affect nested properties indexing for Schema3. 
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("schema3prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + 
EqualsProto(schema_type_config3.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop3.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop3.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop3.schema1prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for Schema2: + // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2", + // "schema2prop1.schema1prop3", "schema2prop2", "schema2prop3"} + // + // Indexable properties: + // {"schema2prop1.schema1prop2", "schema2prop2", "schema2prop3"} + // + // Indexable_nested_properties set for Schema3.schema3prop1 does not propagate + // to Schema2. 
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST( + SchemaPropertyIteratorTest, + 
IndexableNestedPropertiesList_upperLevelIndexTrueIndexesListOfNestedLevel) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + std::string schema_type_name4 = "SchemaFour"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + std::initializer_list<std::string>{"schema1prop2"})) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop1") + .SetDataTypeDocument(schema_type_name2, + /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto schema_type_config4 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name4) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema4prop1") + .SetDataTypeDocument(schema_type_name3, + /*index_nested_properties=*/true)) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}, + {schema_type_name4, schema_type_config4}}; + + // Order of iteration for Schema4: + // {"schema4prop1.schema3prop1.schema2prop1.schema1prop1", + // "schema4prop1.schema3prop1.schema2prop1.schema1prop2"}. 
+ // + // Indexable properties: {schema4prop1.schema3prop1.schema2prop1.schema1prop2} + // + // Both Schema4 and Schema3 sets index_nested_properties=true, so they both + // want to follow the indexing behavior of its subtype. + // Schema2 is the first subtype to define an indexing config, so we index its + // list for both Schema3 and Schema4 even though it sets + // index_nested_properties=false. + SchemaPropertyIterator schema4_iterator(schema_type_config4, type_config_map); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema4_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema4_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for Schema3: + // {"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop2"}. 
+ // + // Indexable properties: {schema3prop1.schema2prop1.schema1prop2} + SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for Schema2: + // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2"} + // + // Indexable properties: + // {"schema2prop1.schema1prop2"} + // + // Schema3 setting index_nested_properties=true does not propagate to Schema2. 
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, + IndexableNestedPropertiesList_unknownPropPaths) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + std::string schema_type_name4 = "SchemaFour"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument(schema_type_name1, + /*indexable_nested_properties_list=*/ + {"schema1prop2", "schema1prop2.foo", + "foo.bar", "zzz", "aaa.zzz"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop2") + 
.SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + {"schema1prop1", "schema1prop2", "unknown.path"})) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop1") + .SetDataTypeDocument( + schema_type_name2, + /*indexable_nested_properties_list=*/ + {"schema3prop1", "schema2prop1", "schema1prop2", + "schema2prop1.schema1prop2", "schema2prop1.zzz", "zzz"})) + .Build(); + SchemaTypeConfigProto schema_type_config4 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name4) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema4prop1") + .SetDataTypeDocument(schema_type_name3, + /*index_nested_properties=*/true)) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}, + {schema_type_name4, schema_type_config4}}; + + // Order of iteration for Schema4: + // "schema4prop1.schema3prop1.schema2prop1.schema1prop1", + // "schema4prop1.schema3prop1.schema2prop1.schema1prop2" (indexable), + // "schema4prop1.schema3prop1.schema2prop2.schema1prop1", + // "schema4prop1.schema3prop1.schema2prop2.schema1prop2" + // + // Unknown property paths from schema3 will also be included for schema4, + // since schema4 sets index_nested_properties=true. + // This includes everything in schema3prop1's list except + // "schema2prop1.schema1prop2". 
+ SchemaPropertyIterator schema4_iterator(schema_type_config4, type_config_map); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop1")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop2")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema4_iterator.unknown_indexable_nested_property_paths(), + testing::ElementsAre("schema4prop1.schema3prop1.schema1prop2", + "schema4prop1.schema3prop1.schema2prop1", + "schema4prop1.schema3prop1.schema2prop1.zzz", + "schema4prop1.schema3prop1.schema3prop1", + "schema4prop1.schema3prop1.zzz")); + + // Order of iteration for Schema3: + // 
"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop2" (indexable), + // "schema3prop1.schema2prop2.schema1prop1", + // "schema3prop1.schema2prop2.schema1prop2" + // + // Unknown properties (in order): + // "schema3prop1.schema1prop2", "schema3prop1.schema2prop1" (not a leaf prop), + // "schema3prop1.schema2prop1.zzz", "schema3prop1.schema3prop1", + // "schema3prop1.zzz" + SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop2.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop2.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + 
EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + testing::ElementsAre( + "schema3prop1.schema1prop2", "schema3prop1.schema2prop1", + "schema3prop1.schema2prop1.zzz", "schema3prop1.schema3prop1", + "schema3prop1.zzz")); + + // Order of iteration for Schema2: + // "schema2prop1.schema1prop1", + // "schema2prop1.schema1prop2" (indexable), + // "schema2prop2.schema1prop1" (indexable), + // "schema2prop2.schema1prop2" (indexable) + // + // Unknown properties (in order): + // "schema2prop1.aaa.zzz", "schema2prop1.foo.bar", + // "schema2prop1.schema1prop2.foo", "schema2prop1.zzz", + // "schema2prop2.unknown.path" + SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop2.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop2.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + 
EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT( + schema2_iterator.unknown_indexable_nested_property_paths(), + testing::ElementsAre("schema2prop1.aaa.zzz", "schema2prop1.foo.bar", + "schema2prop1.schema1prop2.foo", "schema2prop1.zzz", + "schema2prop2.unknown.path")); +} + +TEST(SchemaPropertyIteratorTest, + IndexableNestedPropertiesListDuplicateElements) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + std::string schema_type_name4 = "SchemaFour"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema1prop2") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema2prop1") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + {"schema1prop2", "schema1prop2", "schema1prop2.foo", + "schema1prop2.foo", "foo.bar", "foo.bar", "foo.bar", + "zzz", "zzz", "aaa.zzz", "schema1prop2"})) + .AddProperty(PropertyConfigBuilder() + .SetName("schema2prop2") + .SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + {"schema1prop1", "schema1prop2", "unknown.path", + "unknown.path", "unknown.path", "unknown.path", + "schema1prop1"})) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema3prop1") + .SetDataTypeDocument( + schema_type_name2, + 
/*indexable_nested_properties_list=*/ + {"schema3prop1", "schema3prop1", "schema2prop1", + "schema2prop1", "schema1prop2", "schema1prop2", + "schema2prop1.schema1prop2", "schema2prop1.schema1prop2", + "schema2prop1.zzz", "zzz", "zzz"})) + .Build(); + SchemaTypeConfigProto schema_type_config4 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name4) + .AddProperty( + PropertyConfigBuilder() + .SetName("schema4prop1") + .SetDataTypeDocument(schema_type_name3, + /*index_nested_properties=*/true)) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}, + {schema_type_name4, schema_type_config4}}; + + // The results of this test case is the same as the previous test case. This + // is to test that the indexable-list is deduped correctly. + + // Order of iteration for Schema4: + // "schema4prop1.schema3prop1.schema2prop1.schema1prop1", + // "schema4prop1.schema3prop1.schema2prop1.schema1prop2" (indexable), + // "schema4prop1.schema3prop1.schema2prop2.schema1prop1", + // "schema4prop1.schema3prop1.schema2prop2.schema1prop2" + // + // Unknown property paths from schema3 will also be included for schema4, + // since schema4 sets index_nested_properties=true. + // This includes everything in schema3prop1's list except + // "schema2prop1.schema1prop2". 
+ SchemaPropertyIterator schema4_iterator(schema_type_config4, type_config_map); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop1")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), IsOk()); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(), + Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop2")); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema4_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema4_iterator.unknown_indexable_nested_property_paths(), + testing::ElementsAre("schema4prop1.schema3prop1.schema1prop2", + "schema4prop1.schema3prop1.schema2prop1", + "schema4prop1.schema3prop1.schema2prop1.zzz", + "schema4prop1.schema3prop1.schema3prop1", + "schema4prop1.schema3prop1.zzz")); + + // Order of iteration for Schema3: + // 
"schema3prop1.schema2prop1.schema1prop1", + // "schema3prop1.schema2prop1.schema1prop2" (indexable), + // "schema3prop1.schema2prop2.schema1prop1", + // "schema3prop1.schema2prop2.schema1prop2" + // + // Unknown properties (in order): + // "schema2prop1.aaa.zzz", "schema2prop1.foo.bar", + // "schema2prop1.schema1prop2.foo", "schema2prop1.zzz", + // "schema2prop2.unknown.path" + SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop1.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop2.schema1prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("schema3prop1.schema2prop2.schema1prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + 
EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + testing::ElementsAre( + "schema3prop1.schema1prop2", "schema3prop1.schema2prop1", + "schema3prop1.schema2prop1.zzz", "schema3prop1.schema3prop1", + "schema3prop1.zzz")); + + // Order of iteration for Schema2: + // "schema2prop1.schema1prop1", + // "schema2prop1.schema1prop2" (indexable), + // "schema2prop2.schema1prop1" (indexable), + // "schema2prop2.schema1prop2" (indexable) + // + // Unknown properties (in order): + // "schema2prop1.aaa.zzz", "schema2prop1.foo.bar", + // "schema2prop1.schema1prop2.foo", "schema2prop1.zzz", + // "schema2prop2.unknown.path" + SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop1.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop2.schema1prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), + Eq("schema2prop2.schema1prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + 
EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT( + schema2_iterator.unknown_indexable_nested_property_paths(), + testing::ElementsAre("schema2prop1.aaa.zzz", "schema2prop1.foo.bar", + "schema2prop1.schema1prop2.foo", "schema2prop1.zzz", + "schema2prop2.unknown.path")); } +TEST(SchemaPropertyIteratorTest, + IndexableNestedProperties_duplicatePropertyNamesInDifferentProperties) { + std::string schema_type_name1 = "SchemaOne"; + std::string schema_type_name2 = "SchemaTwo"; + std::string schema_type_name3 = "SchemaThree"; + + SchemaTypeConfigProto schema_type_config1 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name1) + .AddProperty( + PropertyConfigBuilder().SetName("prop1").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder().SetName("prop2").SetDataTypeString( + TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder().SetName("prop3").SetDataTypeString( + TERM_MATCH_PREFIX, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config2 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name2) + .AddProperty( + PropertyConfigBuilder().SetName("prop1").SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + std::initializer_list<std::string>{"prop2"})) + .AddProperty( + PropertyConfigBuilder().SetName("prop2").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder().SetName("prop3").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config3 = + SchemaTypeConfigBuilder() + .SetType(schema_type_name3) + .AddProperty( + PropertyConfigBuilder().SetName("prop3").SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + {"prop1", "prop3"})) + .AddProperty( + 
PropertyConfigBuilder().SetName("prop1").SetDataTypeDocument( + schema_type_name2, + /*indexable_nested_properties_list=*/ + {"prop2", "prop1.prop1", "prop1.prop3"})) + .AddProperty( + PropertyConfigBuilder().SetName("prop2").SetDataTypeString( + TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder().SetName("prop4").SetDataTypeDocument( + schema_type_name1, + /*indexable_nested_properties_list=*/ + {"prop2", "prop3"})) + .Build(); + SchemaUtil::TypeConfigMap type_config_map = { + {schema_type_name1, schema_type_config1}, + {schema_type_name2, schema_type_config2}, + {schema_type_name3, schema_type_config3}}; + + // Order of iteration for Schema3: + // {"prop1.prop1.prop1", "prop1.prop1.prop2", "prop1.prop1.prop3", + // "prop1.prop2", "prop1.prop3", "prop2", + // "prop3.prop1", "prop3.prop2", "prop3.prop3", + // "prop4.prop1", "prop4.prop2", "prop4.prop3"}. + // + // Indexable properties: + // {"prop1.prop1.prop1", "prop1.prop1.prop3", "prop1.prop2", "prop2", + // "prop3.prop1", "prop3.prop3", "prop4.prop2", "prop4.prop3"} + // + // Properties do not affect other properties with the same name from different + // properties. 
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("prop1.prop1.prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("prop1.prop1.prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), + Eq("prop1.prop1.prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop1.prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop1.prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config3.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + 
EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop3.prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop3.prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop3.prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop4.prop1")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop4.prop2")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), IsOk()); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop4.prop3")); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema3_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + 
EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for Schema2: + // {"prop1.prop1", "prop1.prop2", + // "prop1.prop3", "prop2", "prop3"} + // + // Indexable properties: + // {"prop1.prop2", "prop1.prop3", "prop2", "prop3"} + // + // Indexable_nested_properties set for Schema3.prop1 does not propagate + // to Schema2. + SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop1.prop1")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(0))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop1.prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop1.prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config1.properties(2))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop2")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(1))); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), IsOk()); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop3")); + EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config2.properties(2))); + 
EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema2_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} TEST(SchemaPropertyIteratorTest, SingleLevelCycle) { std::string schema_a = "A"; std::string schema_b = "B"; @@ -457,17 +1929,20 @@ TEST(SchemaPropertyIteratorTest, SingleLevelCycle) { Eq("schemaAprop1.schemaBprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + // Order of iteration for schema B: // {"schemaBprop2"}, indexable. 
SchemaPropertyIterator schema_b_iterator(schema_type_config_b, @@ -477,10 +1952,13 @@ TEST(SchemaPropertyIteratorTest, SingleLevelCycle) { EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_b_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); } TEST(SchemaPropertyIteratorTest, MultipleLevelCycle) { @@ -542,24 +2020,27 @@ TEST(SchemaPropertyIteratorTest, MultipleLevelCycle) { Eq("schemaAprop1.schemaBprop1.schemaCprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop1.schemaBprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + 
EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + // Order of iteration for schema B: // {"schemaBprop1.schemaCprop1.schemaAprop2", "schemaBprop1.schemaCprop2", // "schemaBprop2"} @@ -573,24 +2054,27 @@ TEST(SchemaPropertyIteratorTest, MultipleLevelCycle) { Eq("schemaBprop1.schemaCprop1.schemaAprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop1.schemaCprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_b_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + // Order of iteration for schema C: // {"schemaCprop1.schemaAprop1.schemaBprop2", "schemaCprop1.schemaAprop2", // "schemaCprop2"} @@ -604,23 +2088,222 @@ TEST(SchemaPropertyIteratorTest, MultipleLevelCycle) { Eq("schemaCprop1.schemaAprop1.schemaBprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), 
IsFalse()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop1.schemaAprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_c_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, SingleLevelCycleWithIndexableList) { + std::string schema_a = "A"; + std::string schema_b = "B"; + + // Create schema with A -> B -> B -> B... 
+ SchemaTypeConfigProto schema_type_config_a = + SchemaTypeConfigBuilder() + .SetType(schema_a) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaAprop1") + .SetDataTypeDocument( + schema_b, /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_b = + SchemaTypeConfigBuilder() + .SetType(schema_b) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaBprop2") + .SetDataTypeDocument( + schema_b, /*indexable_nested_properties_list=*/ + {"schemaBprop1", "schemaBprop2.schemaBprop1", + "schemaBprop2.schemaBprop3", + "schemaBprop2.schemaBprop2.schemaBprop3"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop3") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + SchemaUtil::TypeConfigMap type_config_map = { + {schema_a, schema_type_config_a}, {schema_b, schema_type_config_b}}; + + // Order of iteration and whether each property is indexable for schema A: + // {"schemaAprop1.schemaBprop1" (true), + // "schemaAprop1.schemaBprop2.schemaBprop1" (true), + // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop1" (true), + // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1" (false), + // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3" (true), + // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop3" (true), + // "schemaAprop1.schemaBprop2.schemaBprop3" (false), + // "schemaAprop1.schemaBprop3" (true), + // "schemaAprop2" (true)} + SchemaPropertyIterator schema_a_iterator(schema_type_config_a, + type_config_map); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1")); + 
EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2.schemaBprop1")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop1")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), 
IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2.schemaBprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for schema B: + // {"schemaBprop1" (true), + // "schemaBprop2.schemaBprop1" (true), + // "schemaBprop2.schemaBprop2.schemaBprop1" (true), + // "schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1" (false), + // "schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3" (true), + // "schemaBprop2.schemaBprop2.schemaBprop3" (true), + // "schemaBprop2.schemaBprop3" (false), + // "schemaBprop3" (true)} + SchemaPropertyIterator schema_b_iterator(schema_type_config_b, + type_config_map); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop1")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + 
EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop2.schemaBprop1")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop2.schemaBprop2.schemaBprop1")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop2.schemaBprop2.schemaBprop3")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop2.schemaBprop3")); + 
EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop3")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); } TEST(SchemaPropertyIteratorTest, MultipleCycles) { @@ -629,7 +2312,11 @@ TEST(SchemaPropertyIteratorTest, MultipleCycles) { std::string schema_c = "C"; std::string schema_d = "D"; - // Create schema with D <-> A -> B -> C -> A -> B -> C -> A... + // Create the following schema: + // D <--> A <--- C + // \ ^ + // v / + // B // Schema type A has two cycles: A-B-C-A and A-D-A SchemaTypeConfigProto schema_type_config_a = SchemaTypeConfigBuilder() @@ -701,31 +2388,34 @@ TEST(SchemaPropertyIteratorTest, MultipleCycles) { Eq("schemaAprop1.schemaBprop1.schemaCprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop1.schemaBprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); 
EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop3.schemaDprop2")); EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_d.properties(1))); - EXPECT_THAT(schema_a_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_a_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + // Order of iteration for schema B: // {"schemaBprop1.schemaCprop1.schemaAprop2", // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2", @@ -740,31 +2430,34 @@ TEST(SchemaPropertyIteratorTest, MultipleCycles) { Eq("schemaBprop1.schemaCprop1.schemaAprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_d.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop1.schemaCprop2")); 
EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2")); EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_b_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_b_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + // Order of iteration for schema C: // {"schemaCprop1.schemaAprop1.schemaBprop2", "schemaCprop1.schemaAprop2", // "schemaCprop1.schemaAprop3.schemaDprop2", "schemaCprop2"} @@ -778,31 +2471,34 @@ TEST(SchemaPropertyIteratorTest, MultipleCycles) { Eq("schemaCprop1.schemaAprop1.schemaBprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop1.schemaAprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop1.schemaAprop3.schemaDprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), 
EqualsProto(schema_type_config_d.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2")); EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_c_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_c_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + // Order of iteration for schema D: // {"schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2", // "schemaDprop1.schemaAprop1.schemaBprop2", "schemaDprop1.schemaAprop2", @@ -817,30 +2513,1390 @@ TEST(SchemaPropertyIteratorTest, MultipleCycles) { Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_c.properties(1))); - EXPECT_THAT(schema_d_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop1.schemaAprop1.schemaBprop2")); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_b.properties(1))); - EXPECT_THAT(schema_d_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop1.schemaAprop2")); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_a.properties(1))); - 
EXPECT_THAT(schema_d_iterator.GetCurrentNestedIndexable(), IsFalse()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2")); EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), EqualsProto(schema_type_config_d.properties(1))); - EXPECT_THAT(schema_d_iterator.GetCurrentNestedIndexable(), IsTrue()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); EXPECT_THAT(schema_d_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, MultipleCyclesWithIndexableList) { + std::string schema_a = "A"; + std::string schema_b = "B"; + std::string schema_c = "C"; + std::string schema_d = "D"; + + // Create the following schema: + // D <--> A <--- C + // \ ^ + // v / + // B + // Schema type A has two cycles: A-B-C-A and A-D-A + SchemaTypeConfigProto schema_type_config_a = + SchemaTypeConfigBuilder() + .SetType(schema_a) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop1") + .SetDataTypeDocument( + schema_b, /*indexable_nested_properties_list=*/ + {"schemaBprop2", "schemaBprop1.schemaCprop1.schemaAprop2", + "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2", + "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2", + "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1." 
+ "schemaAprop2"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop3") + .SetDataTypeDocument( + schema_d, /*indexable_nested_properties_list=*/ + {"schemaDprop2", "schemaDprop1.schemaAprop2", + "schemaDprop1.schemaAprop1.schemaBprop2", + "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2", + "schemaDprop1.schemaAprop3.schemaDprop2"})) + .Build(); + SchemaTypeConfigProto schema_type_config_b = + SchemaTypeConfigBuilder() + .SetType(schema_b) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaBprop1") + .SetDataTypeDocument( + schema_c, /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_c = + SchemaTypeConfigBuilder() + .SetType(schema_c) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaCprop1") + .SetDataTypeDocument( + schema_a, /*index_nested_properties=*/false)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaCprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_d = + SchemaTypeConfigBuilder() + .SetType(schema_d) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaDprop1") + .SetDataTypeDocument( + schema_a, /*index_nested_properties=*/false)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaDprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + SchemaUtil::TypeConfigMap type_config_map = { + {schema_a, schema_type_config_a}, + {schema_b, schema_type_config_b}, + {schema_c, schema_type_config_c}, + {schema_d, schema_type_config_d}}; + + // Order of iteration and whether each property is indexable for schema A: + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true), + // 
"schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" (true), + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // (true), "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" + // (true), "schemaAprop1.schemaBprop1.schemaCprop2" (false), + // "schemaAprop1.schemaBprop2" (true), + // "schemaAprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true), + // "schemaAprop3.schemaDprop2" (true) + SchemaPropertyIterator schema_a_iterator(schema_type_config_a, + type_config_map); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3." 
+ "schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + 
EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration and whether each property is indexable for schema B: + // "schemaBprop1.schemaCprop1.schemaAprop2" (false), + // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (false), + // "schemaBprop1.schemaCprop2" (true), + // "schemaBprop2" (true) + SchemaPropertyIterator schema_b_iterator(schema_type_config_b, + type_config_map); + + EXPECT_THAT(schema_b_iterator.Advance(), 
IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for schema C: + // "schemaCprop1.schemaAprop1.schemaBprop2" (false), + // "schemaCprop1.schemaAprop2" (false), + // "schemaCprop1.schemaAprop3.schemaDprop2" (false), + // "schemaCprop2" (true) + SchemaPropertyIterator schema_c_iterator(schema_type_config_c, + type_config_map); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop2")); + 
EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for schema D: + // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false), + // "schemaDprop1.schemaAprop1.schemaBprop2" (false), + // "schemaDprop1.schemaAprop2" (false), + // "schemaDprop2" (true) + SchemaPropertyIterator schema_d_iterator(schema_type_config_d, + type_config_map); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + 
EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, MultipleCyclesWithIndexableList_allIndexTrue) { + std::string schema_a = "A"; + std::string schema_b = "B"; + std::string schema_c = "C"; + std::string schema_d = "D"; + + // Create the following schema: + // D <--> A <--- C + // \ ^ + // v / + // B + // Schema type A has two cycles: A-B-C-A and A-D-A + SchemaTypeConfigProto schema_type_config_a = + SchemaTypeConfigBuilder() + .SetType(schema_a) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop1") + .SetDataTypeDocument( + schema_b, /*indexable_nested_properties_list=*/ + {"schemaBprop2", "schemaBprop1.schemaCprop1.schemaAprop2", + "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2", + 
"schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2", + "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1." + "schemaAprop2"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop3") + .SetDataTypeDocument( + schema_d, /*indexable_nested_properties_list=*/ + {"schemaDprop2", "schemaDprop1.schemaAprop2", + "schemaDprop1.schemaAprop1.schemaBprop2", + "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2", + "schemaDprop1.schemaAprop3.schemaDprop2"})) + .Build(); + SchemaTypeConfigProto schema_type_config_b = + SchemaTypeConfigBuilder() + .SetType(schema_b) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaBprop1") + .SetDataTypeDocument( + schema_c, /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_c = + SchemaTypeConfigBuilder() + .SetType(schema_c) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaCprop1") + .SetDataTypeDocument( + schema_a, /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaCprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_d = + SchemaTypeConfigBuilder() + .SetType(schema_d) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaDprop1") + .SetDataTypeDocument( + schema_a, /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaDprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + SchemaUtil::TypeConfigMap type_config_map = { + {schema_a, schema_type_config_a}, + {schema_b, schema_type_config_b}, + {schema_c, schema_type_config_c}, + {schema_d, schema_type_config_d}}; + + // Order of iteration and whether each property is indexable for schema A: + // 
"schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true), + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" (true), + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // (true), "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" + // (true), "schemaAprop1.schemaBprop1.schemaCprop2" (false), + // "schemaAprop1.schemaBprop2" (true), + // "schemaAprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true), + // "schemaAprop3.schemaDprop2" (true) + SchemaPropertyIterator schema_a_iterator(schema_type_config_a, + type_config_map); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3." 
+ "schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + 
EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration and whether each property is indexable for schema B: + // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" + // (true), + // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" + // (true), + // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // 
(true), + // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" + // (true), "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2" + // (false), "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true), + // "schemaBprop1.schemaCprop1.schemaAprop2" (true), + // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" + // (true), + // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" + // (true), "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // (true), + // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" + // (true), "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (true) + // "schemaBprop1.schemaCprop2" (true) + // "schemaBprop2" (true) + + SchemaPropertyIterator schema_b_iterator(schema_type_config_b, + type_config_map); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1." + "schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1." + "schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1." 
+ "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1." + "schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1." 
+ "schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1." + "schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1." 
+ "schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration and whether each property is indexable for schema C: + // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" + // (true), "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" + // (true), + // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // (true), + // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" + // (true), + // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false), + // 
"schemaCprop1.schemaAprop1.schemaBprop2" (true), + // "schemaCprop1.schemaAprop2" (true), + // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" + // (true), + // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true), + // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" (true), + // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true), + // "schemaCprop1.schemaAprop3.schemaDprop2" (true) + // "schemaCprop2" (true) + SchemaPropertyIterator schema_c_iterator(schema_type_config_c, + type_config_map); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1." + "schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1." + "schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1." 
+ "schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1." 
+ "schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + 
+ EXPECT_THAT(schema_c_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration and whether each property is indexable for schema D: + // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" + // (true), "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" + // (true), + // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // (true), + // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" + // (true), "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false), + // "schemaDprop1.schemaAprop1.schemaBprop2" (true), + // "schemaDprop1.schemaAprop2" (true), + // "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" + // (true), "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" + // (true), "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop2" (true), + // "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true), + // "schemaDprop1.schemaAprop3.schemaDprop2" (true), + // "schemaDprop2" (true) + SchemaPropertyIterator schema_d_iterator(schema_type_config_d, + type_config_map); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1." 
+ "schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1." + "schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1." 
+ "schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1." 
+ "schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + 
+ EXPECT_THAT(schema_d_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, + MultipleCyclesWithIndexableList_unknownPropPaths) { + std::string schema_a = "A"; + std::string schema_b = "B"; + std::string schema_c = "C"; + std::string schema_d = "D"; + + // Create the following schema: + // D <--> A <--- C + // \ ^ + // v / + // B + // Schema type A has two cycles: A-B-C-A and A-D-A + SchemaTypeConfigProto schema_type_config_a = + SchemaTypeConfigBuilder() + .SetType(schema_a) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop1") + .SetDataTypeDocument( + schema_b, /*indexable_nested_properties_list=*/ + {"schemaBprop2", "schemaBprop1.schemaCprop1.schemaAprop2", + "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2", + "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2", + "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1." 
+ "schemaAprop2", + "schemaBprop1.schemaCprop1", + "schemaBprop1.schemaCprop1.schemaAprop3", "schemaAprop2", + "schemaBprop2.schemaCprop2", "schemaBprop1.foo.bar", + "foo", "foo", "bar"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop3") + .SetDataTypeDocument( + schema_d, /*indexable_nested_properties_list=*/ + {"schemaDprop2", "schemaDprop1.schemaAprop2", + "schemaDprop1.schemaAprop1.schemaBprop2", + "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2", + "schemaDprop1.schemaAprop3.schemaDprop2", "schemaBprop2", + "bar", "schemaDprop2.foo", "schemaDprop1", + "schemaAprop3.schemaDprop2"})) + .Build(); + SchemaTypeConfigProto schema_type_config_b = + SchemaTypeConfigBuilder() + .SetType(schema_b) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaBprop1") + .SetDataTypeDocument( + schema_c, /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_c = + SchemaTypeConfigBuilder() + .SetType(schema_c) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaCprop1") + .SetDataTypeDocument( + schema_a, /*index_nested_properties=*/false)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaCprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_d = + SchemaTypeConfigBuilder() + .SetType(schema_d) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaDprop1") + .SetDataTypeDocument( + schema_a, /*index_nested_properties=*/false)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaDprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + SchemaUtil::TypeConfigMap type_config_map = { + {schema_a, schema_type_config_a}, + {schema_b, schema_type_config_b}, + 
{schema_c, schema_type_config_c}, + {schema_d, schema_type_config_d}}; + + // Order of iteration and whether each property is indexable for schema A: + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true), + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" (true), + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" + // (true), "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" + // (true), "schemaAprop1.schemaBprop1.schemaCprop2" (false), + // "schemaAprop1.schemaBprop2" (true), + // "schemaAprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop2" (true), + // "schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true), + // "schemaAprop3.schemaDprop2" (true) + // + // The following properties listed in the indexable_list are not defined + // in the schema and should not be seen during iteration. These should appear + // in the unknown_indexable_nested_properties_ set. 
+ // "schemaAprop1.bar", + // "schemaAprop1.foo", + // "schemaAprop1.schemaAprop2", + // "schemaAprop1.schemaBprop1.foo.bar", + // "schemaAprop1.schemaBprop1.schemaCprop1", + // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3", + // "schemaAprop1.schemaBprop2.schemaCprop2", + // "schemaAprop3.bar", + // "schemaAprop3.schemaAprop3.schemaDprop2", + // "schemaAprop3.schemaBprop2", + // "schemaAprop3.schemaDprop1", + // "schemaAprop3.schemaDprop2.foo" + SchemaPropertyIterator schema_a_iterator(schema_type_config_a, + type_config_map); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3." 
+ "schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT( + schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + 
EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT( + schema_a_iterator.unknown_indexable_nested_property_paths(), + ElementsAre( + "schemaAprop1.bar", "schemaAprop1.foo", "schemaAprop1.schemaAprop2", + "schemaAprop1.schemaBprop1.foo.bar", + "schemaAprop1.schemaBprop1.schemaCprop1", + "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3", + "schemaAprop1.schemaBprop2.schemaCprop2", "schemaAprop3.bar", + "schemaAprop3.schemaAprop3.schemaDprop2", "schemaAprop3.schemaBprop2", + "schemaAprop3.schemaDprop1", 
"schemaAprop3.schemaDprop2.foo")); + + // Order of iteration and whether each property is indexable for schema B: + // "schemaBprop1.schemaCprop1.schemaAprop2" (false), + // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (false), + // "schemaBprop1.schemaCprop2" (true), + // "schemaBprop2" (true) + SchemaPropertyIterator schema_b_iterator(schema_type_config_b, + type_config_map); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), + Eq("schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2")); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_b_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for schema C: + // 
"schemaCprop1.schemaAprop1.schemaBprop2" (false), + // "schemaCprop1.schemaAprop2" (false), + // "schemaCprop1.schemaAprop3.schemaDprop2" (false), + // "schemaCprop2" (true) + SchemaPropertyIterator schema_c_iterator(schema_type_config_c, + type_config_map); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), + Eq("schemaCprop1.schemaAprop3.schemaDprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_c_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2")); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_c_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); + + // Order of iteration for schema D: + // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false), + // "schemaDprop1.schemaAprop1.schemaBprop2" (false), + // "schemaDprop1.schemaAprop2" 
(false), + // "schemaDprop2" (true) + SchemaPropertyIterator schema_d_iterator(schema_type_config_d, + type_config_map); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_c.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), + Eq("schemaDprop1.schemaAprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_d_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2")); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_d.properties(1))); + EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_d_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); +} + +TEST(SchemaPropertyIteratorTest, TopLevelCycleWithMultipleIndexableLists) { + std::string schema_a = "A"; + std::string schema_b = "B"; + std::string schema_c = "C"; + std::string schema_d = "D"; + + // Create the following schema: + // A <-> A -> B + // A has a top-level property that is a self-reference. 
+ SchemaTypeConfigProto schema_type_config_a = + SchemaTypeConfigBuilder() + .SetType(schema_a) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaAprop1") + .SetDataTypeDocument( + schema_b, /*indexable_nested_properties_list=*/ + {"schemaBprop1", "schemaBprop2"})) + .AddProperty(PropertyConfigBuilder() + .SetName("schemaAprop2") + .SetDataTypeDocument( + schema_a, /*indexable_nested_properties_list=*/ + {"schemaAprop1.schemaBprop2", + "schemaAprop1.schemaBprop3"})) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaAprop3") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + SchemaTypeConfigProto schema_type_config_b = + SchemaTypeConfigBuilder() + .SetType(schema_b) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop2") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("schemaBprop3") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + SchemaUtil::TypeConfigMap type_config_map = { + {schema_a, schema_type_config_a}, {schema_b, schema_type_config_b}}; + + // Order of iteration for Schema A: + // "schemaAprop1.schemaBprop1" (true) + // "schemaAprop1.schemaBprop2" (true) + // "schemaAprop1.schemaBprop3" (false) + // "schemaAprop2.schemaAprop1.schemaBprop1" (false) + // "schemaAprop2.schemaAprop1.schemaBprop2" (true) + // "schemaAprop2.schemaAprop1.schemaBprop3" (true) + // "schemaAprop2.schemaAprop3" (false) + // "schemaAprop3" (true) + SchemaPropertyIterator schema_a_iterator(schema_type_config_a, + type_config_map); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop1")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + 
EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop1.schemaBprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop2.schemaAprop1.schemaBprop1")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(0))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop2.schemaAprop1.schemaBprop2")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(1))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop2.schemaAprop1.schemaBprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_b.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), + Eq("schemaAprop2.schemaAprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + 
EqualsProto(schema_type_config_a.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse()); + + EXPECT_THAT(schema_a_iterator.Advance(), IsOk()); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop3")); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(), + EqualsProto(schema_type_config_a.properties(2))); + EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue()); + + EXPECT_THAT(schema_a_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + + EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(), + IsEmpty()); } } // namespace diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index bcc7c2c..e17e388 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -448,7 +448,7 @@ libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() { "Combined checksum of SchemaStore was inconsistent"); } - BuildInMemoryCache(); + ICING_RETURN_IF_ERROR(BuildInMemoryCache()); return libtextclassifier3::Status::OK; } @@ -463,7 +463,7 @@ libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles( ICING_RETURN_IF_ERROR(schema_type_mapper_->Put( type_config.schema_type(), schema_type_mapper_->num_keys())); } - BuildInMemoryCache(); + ICING_RETURN_IF_ERROR(BuildInMemoryCache()); if (create_overlay_if_necessary) { ICING_ASSIGN_OR_RETURN( @@ -485,12 +485,16 @@ libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles( std::make_unique<SchemaProto>(std::move(base_schema)); ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr))); + // LINT.IfChange(min_overlay_version_compatibility) + // Although the current version is 3, the schema is compatible with + // version 1, so min_overlay_version_compatibility should be 1. 
+ int32_t min_overlay_version_compatibility = version_util::kVersionOne; + // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion) header_->SetOverlayInfo( - /*overlay_created=*/true, - /*min_overlay_version_compatibility=*/version_util::kVersionOne); + /*overlay_created=*/true, min_overlay_version_compatibility); // Rebuild in memory data - references to the old schema will be invalid // now. - BuildInMemoryCache(); + ICING_RETURN_IF_ERROR(BuildInMemoryCache()); } } @@ -776,6 +780,17 @@ libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId( return schema_type_mapper_->Get(schema_type); } +libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType( + SchemaTypeId schema_type_id) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); + if (const auto it = reverse_schema_type_mapper_.find(schema_type_id); + it == reverse_schema_type_mapper_.end()) { + return absl_ports::InvalidArgumentError("Invalid schema type id"); + } else { + return &it->second; + } +} + libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*> SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const { ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 6075f5b..88968b1 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -276,6 +276,15 @@ class SchemaStore { libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> GetSchemaTypeConfig(std::string_view schema_type) const; + // Returns the schema type of the passed in SchemaTypeId + // + // Returns: + // schema type on success + // FAILED_PRECONDITION if schema hasn't been set yet + // INVALID_ARGUMENT if schema type id is invalid + libtextclassifier3::StatusOr<const std::string*> GetSchemaType( + SchemaTypeId schema_type_id) const; + // Returns the SchemaTypeId of the passed in schema type // // Returns: diff --git a/icing/schema/schema-store_test.cc 
b/icing/schema/schema-store_test.cc index 3298b75..8cc7008 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -1084,6 +1084,137 @@ TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) { EXPECT_THAT(*actual_schema, EqualsProto(new_schema)); } +TEST_F(SchemaStoreTest, SetSchemaWithAddedIndexableNestedTypeOk) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + + // 1. Create a ContactPoint type with a optional property, and a type that + // references the ContactPoint type. + SchemaTypeConfigBuilder contact_point = + SchemaTypeConfigBuilder() + .SetType("ContactPoint") + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)); + SchemaTypeConfigBuilder person = + SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("contactPoints") + .SetDataTypeDocument("ContactPoint", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)); + SchemaProto old_schema = + SchemaBuilder().AddType(contact_point).AddType(person).Build(); + ICING_EXPECT_OK(schema_store->SetSchema( + old_schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + // 2. 
Add another nested document property to "Person" that has type + // "ContactPoint" + SchemaTypeConfigBuilder new_person = + SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty( + PropertyConfigBuilder() + .SetName("contactPoints") + .SetDataTypeDocument("ContactPoint", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("anotherContactPoint") + .SetDataTypeDocument("ContactPoint", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)); + SchemaProto new_schema = + SchemaBuilder().AddType(contact_point).AddType(new_person).Build(); + + // 3. Set to new schema. "Person" should be index-incompatible since we need + // to index an additional property: 'anotherContactPoint.label'. + // - "Person" is also considered join-incompatible since the added nested + // document property could also contain a joinable property. + SchemaStore::SetSchemaResult expected_result; + expected_result.success = true; + expected_result.schema_types_index_incompatible_by_name.insert("Person"); + expected_result.schema_types_join_incompatible_by_name.insert("Person"); + + EXPECT_THAT(schema_store->SetSchema( + new_schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOkAndHolds(EqualsSetSchemaResult(expected_result))); + ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, + schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(new_schema)); +} + +TEST_F(SchemaStoreTest, SetSchemaWithAddedJoinableNestedTypeOk) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + + // 1. Create a ContactPoint type with a optional property, and a type that + // references the ContactPoint type. 
+ SchemaTypeConfigBuilder contact_point = + SchemaTypeConfigBuilder() + .SetType("ContactPoint") + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REQUIRED)); + SchemaTypeConfigBuilder person = + SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("contactPoints") + .SetDataTypeDocument("ContactPoint", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)); + SchemaProto old_schema = + SchemaBuilder().AddType(contact_point).AddType(person).Build(); + ICING_EXPECT_OK(schema_store->SetSchema( + old_schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + // 2. Add another nested document property to "Person" that has type + // "ContactPoint", but make it non-indexable + SchemaTypeConfigBuilder new_person = + SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty( + PropertyConfigBuilder() + .SetName("contactPoints") + .SetDataTypeDocument("ContactPoint", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("anotherContactPoint") + .SetDataTypeDocument("ContactPoint", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)); + SchemaProto new_schema = + SchemaBuilder().AddType(contact_point).AddType(new_person).Build(); + + // 3. Set to new schema. "Person" should be join-incompatible but + // index-compatible. 
+ SchemaStore::SetSchemaResult expected_result; + expected_result.success = true; + expected_result.schema_types_join_incompatible_by_name.insert("Person"); + + EXPECT_THAT(schema_store->SetSchema( + new_schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOkAndHolds(EqualsSetSchemaResult(expected_result))); + ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, + schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(new_schema)); +} + TEST_F(SchemaStoreTest, GetSchemaTypeId) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2722,7 +2853,8 @@ TEST_F(SchemaStoreTest, MigrateSchemaVersionZeroUpgradeNoChange) { } } -TEST_F(SchemaStoreTest, MigrateSchemaRollbackDiscardsOverlaySchema) { +TEST_F(SchemaStoreTest, + MigrateSchemaRollbackDiscardsIncompatibleOverlaySchema) { // Because we are upgrading from version zero, the schema must be compatible // with version zero. SchemaTypeConfigProto type_a = @@ -2749,12 +2881,12 @@ TEST_F(SchemaStoreTest, MigrateSchemaRollbackDiscardsOverlaySchema) { IsOkAndHolds(Pointee(EqualsProto(schema)))); } - // Rollback to a version before kVersion. The schema header will declare that - // the overlay is compatible with any version starting with kVersion. So - // kVersion - 1 is incompatible and will throw out the schema. + // Rollback to a version before kVersionOne. The schema header will declare + // that the overlay is compatible with any version starting with kVersionOne. + // So kVersionOne - 1 is incompatible and will throw out the schema. 
ICING_EXPECT_OK(SchemaStore::MigrateSchema( &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack, - version_util::kVersion - 1)); + version_util::kVersionOne - 1)); { // Create a new of the schema store and check that we fell back to the @@ -2777,7 +2909,7 @@ TEST_F(SchemaStoreTest, MigrateSchemaRollbackDiscardsOverlaySchema) { } } -TEST_F(SchemaStoreTest, MigrateSchemaCompatibleRollbackKeepsOverlaySchema) { +TEST_F(SchemaStoreTest, MigrateSchemaRollbackKeepsCompatibleOverlaySchema) { // Because we are upgrading from version zero, the schema must be compatible // with version zero. SchemaTypeConfigProto type_a = @@ -2846,12 +2978,12 @@ TEST_F(SchemaStoreTest, MigrateSchemaRollforwardRetainsBaseSchema) { IsOkAndHolds(Pointee(EqualsProto(schema)))); } - // Rollback to a version before kVersion. The schema header will declare that - // the overlay is compatible with any version starting with kVersion. So - // kVersion - 1 is incompatible and will throw out the schema. + // Rollback to a version before kVersionOne. The schema header will declare + // that the overlay is compatible with any version starting with kVersionOne. + // So kVersionOne - 1 is incompatible and will throw out the schema. 
ICING_EXPECT_OK(SchemaStore::MigrateSchema( &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack, - version_util::kVersion - 1)); + version_util::kVersionOne - 1)); SchemaTypeConfigProto other_type_a = SchemaTypeConfigBuilder() diff --git a/icing/schema/schema-type-manager.cc b/icing/schema/schema-type-manager.cc index f3a86d4..4a6b7f2 100644 --- a/icing/schema/schema-type-manager.cc +++ b/icing/schema/schema-type-manager.cc @@ -20,6 +20,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/schema/joinable-property-manager.h" +#include "icing/schema/property-util.h" #include "icing/schema/schema-property-iterator.h" #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" @@ -55,7 +56,7 @@ SchemaTypeManager::Create(const SchemaUtil::TypeConfigMap& type_config_map, } // Process section (indexable property) - if (iterator.GetCurrentNestedIndexable()) { + if (iterator.GetCurrentPropertyIndexable()) { ICING_RETURN_IF_ERROR( section_manager_builder.ProcessSchemaTypePropertyConfig( schema_type_id, iterator.GetCurrentPropertyConfig(), @@ -68,6 +69,34 @@ SchemaTypeManager::Create(const SchemaUtil::TypeConfigMap& type_config_map, schema_type_id, iterator.GetCurrentPropertyConfig(), iterator.GetCurrentPropertyPath())); } + + // Process unknown property paths in the indexable_nested_properties_list. + // These property paths should consume sectionIds but are currently + // not indexed. + // + // SectionId assignment order: + // - We assign section ids to known (existing) properties first in alphabet + // order. + // - After handling all known properties, we assign section ids to all + // unknown (non-existent) properties that are specified in the + // indexable_nested_properties_list. 
+ // - As a result, assignment of the entire section set is not done + // alphabetically, but assignment is still deterministic and alphabetical + // order is preserved inside the known properties and unknown properties + // sets individually. + for (const auto& property_path : + iterator.unknown_indexable_nested_property_paths()) { + PropertyConfigProto unknown_property_config; + unknown_property_config.set_property_name(std::string( + property_util::SplitPropertyPathExpr(property_path).back())); + unknown_property_config.set_data_type( + PropertyConfigProto::DataType::UNKNOWN); + + ICING_RETURN_IF_ERROR( + section_manager_builder.ProcessSchemaTypePropertyConfig( + schema_type_id, unknown_property_config, + std::string(property_path))); + } } return std::unique_ptr<SchemaTypeManager>(new SchemaTypeManager( diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 371ed00..72287a8 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -115,6 +115,34 @@ bool IsIntegerNumericMatchTypeCompatible( return old_indexed.numeric_match_type() == new_indexed.numeric_match_type(); } +bool IsDocumentIndexingCompatible(const DocumentIndexingConfig& old_indexed, + const DocumentIndexingConfig& new_indexed) { + // TODO(b/265304217): This could mark the new schema as incompatible and + // generate some unnecessary index rebuilds if the two schemas have an + // equivalent set of indexed properties, but changed the way that it is + // declared. 
+ if (old_indexed.index_nested_properties() != + new_indexed.index_nested_properties()) { + return false; + } + + if (old_indexed.indexable_nested_properties_list().size() != + new_indexed.indexable_nested_properties_list().size()) { + return false; + } + + std::unordered_set<std::string_view> old_indexable_nested_properies_set( + old_indexed.indexable_nested_properties_list().begin(), + old_indexed.indexable_nested_properties_list().end()); + for (const auto& property : new_indexed.indexable_nested_properties_list()) { + if (old_indexable_nested_properies_set.find(property) == + old_indexable_nested_properies_set.end()) { + return false; + } + } + return true; +} + void AddIncompatibleChangeToDelta( std::unordered_set<std::string>& incompatible_delta, const SchemaTypeConfigProto& old_type_config, @@ -161,6 +189,18 @@ bool CardinalityLessThanEq(PropertyConfigProto::Cardinality::Code C1, return false; } +// Check if set1 is a subset of set2. +template <typename T> +bool IsSubset(const std::unordered_set<T>& set1, + const std::unordered_set<T>& set2) { + for (const auto& item : set1) { + if (set2.find(item) == set2.end()) { + return false; + } + } + return true; +} + } // namespace libtextclassifier3::Status CalculateTransitiveNestedTypeRelations( @@ -252,8 +292,6 @@ libtextclassifier3::Status CalculateTransitiveNestedTypeRelations( // 4. "adjacent" has been fully expanded. Add all of its transitive // outgoing relations to this type's transitive outgoing relations. auto adjacent_expanded_itr = expanded_nested_types_map->find(adjacent_type); - expanded_relations.reserve(expanded_relations.size() + - adjacent_expanded_itr->second.size()); for (const auto& [transitive_reachable, _] : adjacent_expanded_itr->second) { // Insert a transitive reachable node `transitive_reachable` for `type` if @@ -317,8 +355,6 @@ libtextclassifier3::Status CalculateAcyclicTransitiveRelations( // 3. "adjacent" has been fully expanded. 
Add all of its transitive outgoing // relations to this type's transitive outgoing relations. auto adjacent_expanded_itr = expanded_relation_map->find(adjacent); - expanded_relations.reserve(expanded_relations.size() + - adjacent_expanded_itr->second.size()); for (const auto& [transitive_reachable, _] : adjacent_expanded_itr->second) { // Insert a transitive reachable node `transitive_reachable` for `type`. @@ -498,7 +534,6 @@ BuildTransitiveDependentGraph(const SchemaProto& schema, // Insert the parent_type into the dependent map if it is not present // already. merged_dependent_map.insert({parent_type, {}}); - merged_dependent_map[parent_type].reserve(inheritance_relation.size()); for (const auto& [child_type, _] : inheritance_relation) { // Insert the child_type into parent_type's dependent map if it's not // present already, in which case the value will be an empty vector. @@ -571,6 +606,10 @@ libtextclassifier3::StatusOr<SchemaUtil::DependentMap> SchemaUtil::Validate( "data_types in schema property '", schema_type, ".", property_name, "'")); } + + ICING_RETURN_IF_ERROR(ValidateDocumentIndexingConfig( + property_config.document_indexing_config(), schema_type, + property_name)); } ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(), @@ -751,6 +790,20 @@ libtextclassifier3::Status SchemaUtil::ValidateJoinableConfig( return libtextclassifier3::Status::OK; } +libtextclassifier3::Status SchemaUtil::ValidateDocumentIndexingConfig( + const DocumentIndexingConfig& config, std::string_view schema_type, + std::string_view property_name) { + if (!config.indexable_nested_properties_list().empty() && + config.index_nested_properties()) { + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "DocumentIndexingConfig.index_nested_properties is required to be " + "false when providing a non-empty indexable_nested_properties_list " + "for property '", + schema_type, ".", property_name, "'")); + } + return libtextclassifier3::Status::OK; +} + /* 
static */ bool SchemaUtil::IsIndexedProperty( const PropertyConfigProto& property_config) { switch (property_config.data_type()) { @@ -762,11 +815,19 @@ libtextclassifier3::Status SchemaUtil::ValidateJoinableConfig( case PropertyConfigProto::DataType::INT64: return property_config.integer_indexing_config().numeric_match_type() != IntegerIndexingConfig::NumericMatchType::UNKNOWN; + case PropertyConfigProto::DataType::DOCUMENT: + // A document property is considered indexed if it has + // index_nested_properties=true, or a non-empty + // indexable_nested_properties_list. + return property_config.document_indexing_config() + .index_nested_properties() || + !property_config.document_indexing_config() + .indexable_nested_properties_list() + .empty(); case PropertyConfigProto::DataType::UNKNOWN: case PropertyConfigProto::DataType::DOUBLE: case PropertyConfigProto::DataType::BOOLEAN: case PropertyConfigProto::DataType::BYTES: - case PropertyConfigProto::DataType::DOCUMENT: return false; } } @@ -880,24 +941,32 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs( // TODO(cassiewang): consider caching property_config_map for some properties, // e.g. using LRU cache. Or changing schema.proto to use go/protomap. for (const PropertyConfigProto& property_config : type_config.properties()) { - parsed_property_configs.property_config_map.emplace( - property_config.property_name(), &property_config); + std::string_view property_name = property_config.property_name(); + parsed_property_configs.property_config_map.emplace(property_name, + &property_config); if (property_config.cardinality() == PropertyConfigProto::Cardinality::REQUIRED) { - ++parsed_property_configs.num_required_properties; + parsed_property_configs.required_properties.insert(property_name); } // A non-default term_match_type indicates that this property is meant to be // indexed. 
if (IsIndexedProperty(property_config)) { - ++parsed_property_configs.num_indexed_properties; + parsed_property_configs.indexed_properties.insert(property_name); } // A non-default value_type indicates that this property is meant to be // joinable. if (property_config.joinable_config().value_type() != JoinableConfig::ValueType::NONE) { - ++parsed_property_configs.num_joinable_properties; + parsed_property_configs.joinable_properties.insert(property_name); + } + + // Also keep track of how many nested document properties there are. Adding + // new nested document properties will result in join-index rebuild. + if (property_config.data_type() == + PropertyConfigProto::DataType::DOCUMENT) { + parsed_property_configs.nested_document_properties.insert(property_name); } } @@ -934,9 +1003,10 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // We only need to check the old, existing properties to see if they're // compatible since we'll have old data that may be invalidated or need to // be reindexed. - int32_t old_required_properties = 0; - int32_t old_indexed_properties = 0; - int32_t old_joinable_properties = 0; + std::unordered_set<std::string_view> old_required_properties; + std::unordered_set<std::string_view> old_indexed_properties; + std::unordered_set<std::string_view> old_joinable_properties; + std::unordered_set<std::string_view> old_nested_document_properties; // If there is a different number of properties, then there must have been a // change. 
@@ -947,23 +1017,32 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( bool is_index_incompatible = false; bool is_join_incompatible = false; for (const auto& old_property_config : old_type_config.properties()) { + std::string_view property_name = old_property_config.property_name(); if (old_property_config.cardinality() == PropertyConfigProto::Cardinality::REQUIRED) { - ++old_required_properties; + old_required_properties.insert(property_name); } // A non-default term_match_type indicates that this property is meant to // be indexed. bool is_indexed_property = IsIndexedProperty(old_property_config); if (is_indexed_property) { - ++old_indexed_properties; + old_indexed_properties.insert(property_name); } bool is_joinable_property = old_property_config.joinable_config().value_type() != JoinableConfig::ValueType::NONE; if (is_joinable_property) { - ++old_joinable_properties; + old_joinable_properties.insert(property_name); + } + + // A nested-document property is a property of DataType::DOCUMENT. 
+ bool is_nested_document_property = + old_property_config.data_type() == + PropertyConfigProto::DataType::DOCUMENT; + if (is_nested_document_property) { + old_nested_document_properties.insert(property_name); } auto new_property_name_and_config = @@ -979,7 +1058,8 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "' was not defined in new schema"); is_incompatible = true; is_index_incompatible |= is_indexed_property; - is_join_incompatible |= is_joinable_property; + is_join_incompatible |= + is_joinable_property || is_nested_document_property; continue; } @@ -1005,10 +1085,9 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( !IsIntegerNumericMatchTypeCompatible( old_property_config.integer_indexing_config(), new_property_config->integer_indexing_config()) || - old_property_config.document_indexing_config() - .index_nested_properties() != - new_property_config->document_indexing_config() - .index_nested_properties()) { + !IsDocumentIndexingCompatible( + old_property_config.document_indexing_config(), + new_property_config->document_indexing_config())) { is_index_incompatible = true; } @@ -1023,8 +1102,8 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // guaranteed from our previous checks that all the old properties are also // present in the new property config, so we can do a simple int comparison // here to detect new required properties. - if (new_parsed_property_configs.num_required_properties > - old_required_properties) { + if (!IsSubset(new_parsed_property_configs.required_properties, + old_required_properties)) { ICING_VLOG(1) << absl_ports::StrCat( "New schema '", old_type_config.schema_type(), "' has REQUIRED properties that are not " @@ -1032,11 +1111,12 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( is_incompatible = true; } - // If we've gained any new indexed properties, then the section ids may - // change. 
Since the section ids are stored in the index, we'll need to + // If we've gained any new indexed properties (this includes gaining new + // indexed nested document properties), then the section ids may change. + // Since the section ids are stored in the index, we'll need to // reindex everything. - if (new_parsed_property_configs.num_indexed_properties > - old_indexed_properties) { + if (!IsSubset(new_parsed_property_configs.indexed_properties, + old_indexed_properties)) { ICING_VLOG(1) << "Set of indexed properties in schema type '" << old_type_config.schema_type() << "' has changed, required reindexing."; @@ -1045,9 +1125,15 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // If we've gained any new joinable properties, then the joinable property // ids may change. Since the joinable property ids are stored in the cache, - // we'll need to reconstruct joinable cache. - if (new_parsed_property_configs.num_joinable_properties > - old_joinable_properties) { + // we'll need to reconstruct join index. + // If we've gained any new nested document properties, we also rebuild the + // join index. This is because we index all nested joinable properties, so + // adding a nested document property will most probably result in having + // more joinable properties. 
+ if (!IsSubset(new_parsed_property_configs.joinable_properties, + old_joinable_properties) || + !IsSubset(new_parsed_property_configs.nested_document_properties, + old_nested_document_properties)) { ICING_VLOG(1) << "Set of joinable properties in schema type '" << old_type_config.schema_type() << "' has changed, required reconstructing joinable cache."; diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index e707758..4f09915 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -113,14 +113,17 @@ class SchemaUtil { std::unordered_map<std::string_view, const PropertyConfigProto*> property_config_map; - // Total number of properties that have an indexing config - int32_t num_indexed_properties = 0; + // Properties that have an indexing config + std::unordered_set<std::string_view> indexed_properties; - // Total number of properties that were REQUIRED - int32_t num_required_properties = 0; + // Properties that were REQUIRED + std::unordered_set<std::string_view> required_properties; - // Total number of properties that have joinable config - int32_t num_joinable_properties = 0; + // Properties that have joinable config + std::unordered_set<std::string_view> joinable_properties; + + // Properties that have DataType::DOCUMENT + std::unordered_set<std::string_view> nested_document_properties; }; // This function validates: @@ -157,6 +160,9 @@ class SchemaUtil { // (property whose joinable config is not NONE), OR // ii. Any type node in the cycle has a nested-type (direct or // indirect) with a joinable property. + // 15. For DOCUMENT data types, if + // DocumentIndexingConfig.indexable_nested_properties_list is non-empty, + // DocumentIndexingConfig.index_nested_properties must be false. 
// // Returns: // On success, a dependent map from each types to their dependent types @@ -315,6 +321,17 @@ class SchemaUtil { PropertyConfigProto::Cardinality::Code cardinality, std::string_view schema_type, std::string_view property_name); + // Checks that the 'document_indexing_config' satisfies the following rule: + // 1. If indexable_nested_properties is non-empty, index_nested_properties + // must be set to false. + // + // Returns: + // INVALID_ARGUMENT if any of the rules are not followed + // OK on success + static libtextclassifier3::Status ValidateDocumentIndexingConfig( + const DocumentIndexingConfig& config, std::string_view schema_type, + std::string_view property_name); + // Returns if 'parent_type' is a direct or indirect parent of 'child_type'. static bool IsParent(const SchemaUtil::InheritanceMap& inheritance_map, std::string_view parent_type, diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 40e30b0..82683ba 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -14,6 +14,8 @@ #include "icing/schema/schema-util.h" +#include <initializer_list> +#include <string> #include <string_view> #include <unordered_set> @@ -2562,6 +2564,114 @@ TEST_P(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { EXPECT_THAT(actual.schema_types_deleted, testing::IsEmpty()); } +TEST_P(SchemaUtilTest, SameNumberOfRequiredFieldsCanBeIncompatible) { + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property1") + .SetDataType(TYPE_STRING) + // Changing required to optional should be fine + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + 
PropertyConfigBuilder() + .SetName("Property2") + .SetDataType(TYPE_STRING) + // Adding a new required property is incompatible + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, /*new_schema_dependent_map=*/{}); + EXPECT_THAT(delta.schema_types_incompatible, + testing::ElementsAre(kEmailType)); + EXPECT_THAT(delta.schema_types_index_incompatible, testing::IsEmpty()); + EXPECT_THAT(delta.schema_types_deleted, testing::IsEmpty()); +} + +TEST_P(SchemaUtilTest, SameNumberOfIndexedPropertiesCanMakeIndexIncompatible) { + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("Property2") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, /*new_schema_dependent_map=*/{}); + EXPECT_THAT(delta.schema_types_incompatible, testing::IsEmpty()); + EXPECT_THAT(delta.schema_types_index_incompatible, + testing::ElementsAre(kEmailType)); + EXPECT_THAT(delta.schema_types_deleted, testing::IsEmpty()); +} + +TEST_P(SchemaUtilTest, SameNumberOfJoinablePropertiesCanMakeJoinIncompatible) { + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + 
.SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("Property2") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, /*new_schema_dependent_map=*/{}); + EXPECT_THAT(delta.schema_types_incompatible, testing::IsEmpty()); + EXPECT_THAT(delta.schema_types_index_incompatible, testing::IsEmpty()); + EXPECT_THAT(delta.schema_types_deleted, testing::IsEmpty()); + EXPECT_THAT(delta.schema_types_join_incompatible, + testing::ElementsAre(kEmailType)); +} + TEST_P(SchemaUtilTest, ChangingIndexedStringPropertiesMakesIndexIncompatible) { // Configure old schema SchemaProto schema_with_indexed_property = @@ -2790,6 +2900,437 @@ TEST_P(SchemaUtilTest, IsEmpty()); } +TEST_P(SchemaUtilTest, + AddingNewIndexedDocumentPropertyMakesIndexAndJoinIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schema + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + 
.SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("NewEmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.insert(kPersonType); + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + +TEST_P( + SchemaUtilTest, + AddingNewIndexedDocumentPropertyWithIndexableListMakesIndexAndJoinIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schema + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema. 
The added nested document property is indexed, so + // this is both index and join incompatible + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("NewEmailProperty") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + std::initializer_list<std::string>{"subject"}) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.insert(kPersonType); + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, + AddingNewNonIndexedDocumentPropertyMakesJoinIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schema + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema. 
The added nested document property is not indexed, so + // this is index compatible, but join incompatible + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("NewEmailProperty") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, DeletingIndexedDocumentPropertyIsIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schemam with two nested document properties of the same type + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("AnotherEmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + 
.SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema and drop one of the nested document properties + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_incompatible.insert(kPersonType); + schema_delta.schema_types_index_incompatible.insert(kPersonType); + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, DeletingNonIndexedDocumentPropertyIsIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schemam with two nested document properties of the same type + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + 
.AddProperty(PropertyConfigBuilder() + .SetName("AnotherEmailProperty") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema and drop the non-indexed nested document property + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_incompatible.insert(kPersonType); + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, ChangingIndexedDocumentPropertyIsIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schemam with two nested document properties of the same type + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, 
/*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("AnotherEmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema and change one of the nested document properties + // to a different name (this is the same as deleting a property and adding + // another) + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("DifferentEmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_incompatible.insert(kPersonType); + schema_delta.schema_types_index_incompatible.insert(kPersonType); + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, ChangingNonIndexedDocumentPropertyIsIncompatible) { + SchemaTypeConfigProto nested_schema = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + + // Configure old schemam with two 
nested document properties of the same type + SchemaProto old_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("AnotherEmailProperty") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Configure new schema and change the non-indexed nested document property to + // a different name (this is the same as deleting a property and adding + // another) + SchemaProto new_schema = + SchemaBuilder() + .AddType(nested_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("EmailProperty") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("DifferentEmailProperty") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_incompatible.insert(kPersonType); + schema_delta.schema_types_join_incompatible.insert(kPersonType); + + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta result_schema_delta = + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + dependents_map); + EXPECT_THAT(result_schema_delta, Eq(schema_delta)); +} + 
TEST_P(SchemaUtilTest, ChangingJoinablePropertiesMakesJoinIncompatible) { // Configure old schema SchemaProto schema_with_joinable_property = @@ -3081,6 +3622,239 @@ TEST_P(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) { EXPECT_THAT(actual, Eq(schema_delta)); } +TEST_P(SchemaUtilTest, AddOrDropIndexableNestedProperties_IndexIncompatible) { + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("recipient") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema_1 = + SchemaBuilder() + .AddType(email_type_config) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + {"recipient", "subject", "body"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto schema_2 = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties=*/ + {"recipient", "subject"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Dropping some indexable_nested_properties should make kPersonType + // index_incompatible. kEmailType should be unaffected. 
+ SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.emplace(kPersonType); + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta actual = + SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map); + EXPECT_THAT(actual, Eq(schema_delta)); + + // Adding some indexable_nested_properties should also make kPersonType + // index_incompatible. kEmailType should be unaffected. + actual = + SchemaUtil::ComputeCompatibilityDelta(schema_2, schema_1, dependents_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, ChangingIndexableNestedProperties_IndexIncompatible) { + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("recipient") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema_1 = + SchemaBuilder() + .AddType(email_type_config) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + {"recipient", "subject"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto schema_2 = + SchemaBuilder() + .AddType(email_type_config) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + {"recipient", "body"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Changing 
'subject' to 'body' for indexable_nested_properties_list should + // make kPersonType index_incompatible. kEmailType should be unaffected. + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.emplace(kPersonType); + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta actual = + SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, IndexableNestedPropertiesFullSet_IndexIncompatible) { + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("recipient") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema_1 = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto schema_2 = + SchemaBuilder() + .AddType(email_type_config) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + {"recipient", "body", "subject"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // This scenario also invalidates kPersonType and triggers an index rebuild at + // the moment, even though the set of indexable_nested_properties from + // 
schema_1 to schema_2 should be the same. + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.emplace(kPersonType); + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta actual = + SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} + +TEST_P(SchemaUtilTest, + ChangingIndexableNestedPropertiesOrder_IndexIsCompatible) { + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("recipient") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema_1 = + SchemaBuilder() + .AddType(email_type_config) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + {"recipient", "subject", "body"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto schema_2 = + SchemaBuilder() + .AddType(email_type_config) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*indexable_nested_properties_list=*/ + {"subject", "body", "recipient"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Changing order of elements in indexable_nested_properties_list should have + // no effect on schema compatibility. 
+ SchemaUtil::SchemaDelta schema_delta; + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; + SchemaUtil::SchemaDelta actual = + SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map); + EXPECT_THAT(actual, Eq(schema_delta)); + EXPECT_THAT(actual.schema_types_index_incompatible, IsEmpty()); +} + TEST_P(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { SchemaProto schema = SchemaBuilder() @@ -3673,6 +4447,137 @@ TEST_P(SchemaUtilTest, ValidateNestedJoinablePropertyDiamondRelationship) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_P(SchemaUtilTest, + ValidDocumentIndexingConfigFields_emptyIndexableListBooleanTrue) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("InnerSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("prop2") + .SetDataTypeString(TERM_MATCH_UNKNOWN, + TOKENIZER_NONE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("OuterSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("InnerProperty") + .SetDataTypeDocument( + "InnerSchema", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1); + outerSchemaType->mutable_properties(0) + ->mutable_document_indexing_config() + ->clear_indexable_nested_properties_list(); + + EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk()); +} + +TEST_P(SchemaUtilTest, + ValidDocumentIndexingConfigFields_emptyIndexableListBooleanFalse) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("InnerSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + 
.SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("prop2") + .SetDataTypeString(TERM_MATCH_UNKNOWN, + TOKENIZER_NONE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("OuterSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("InnerProperty") + .SetDataTypeDocument( + "InnerSchema", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1); + outerSchemaType->mutable_properties(0) + ->mutable_document_indexing_config() + ->clear_indexable_nested_properties_list(); + + EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk()); +} + +TEST_P(SchemaUtilTest, + ValidDocumentIndexingConfigFields_nonEmptyIndexableList) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("InnerSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("OuterSchema") + .AddProperty( + PropertyConfigBuilder() + .SetName("InnerProperty") + .SetDataTypeDocument( + "InnerSchema", + /*indexable_nested_properties_list=*/ + std::initializer_list<std::string>{"prop1"}) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1); + outerSchemaType->mutable_properties(0) + ->mutable_document_indexing_config() + ->set_index_nested_properties(false); + EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk()); +} + +TEST_P(SchemaUtilTest, InvalidDocumentIndexingConfigFields) { + // If indexable_nested_properties is non-empty, index_nested_properties is + // required to be false. 
+ SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("InnerSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("OuterSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("InnerProperty") + .SetDataTypeDocument( + "InnerSchema", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Setting a non-empty indexable_nested_properties_list while + // index_nested_properties=true is invalid. + SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1); + outerSchemaType->mutable_properties(0) + ->mutable_document_indexing_config() + ->add_indexable_nested_properties_list("prop"); + + EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + TEST_P(SchemaUtilTest, MultipleReferencesToSameNestedSchemaOk) { SchemaProto schema = SchemaBuilder() diff --git a/icing/schema/section-manager-builder_test.cc b/icing/schema/section-manager-builder_test.cc index 60dd507..1d452d5 100644 --- a/icing/schema/section-manager-builder_test.cc +++ b/icing/schema/section-manager-builder_test.cc @@ -270,9 +270,11 @@ TEST_P(NonIndexableSectionManagerBuilderTest, Build) { ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig( /*schema_type_id=*/0, property_config, std::string(kPropertyPath))); + // NonIndexable sections will still consume a sectionId. 
std::unique_ptr<SectionManager> section_manager = std::move(builder).Build(); EXPECT_THAT(section_manager->GetMetadataList(std::string(kSchemaType)), - IsOkAndHolds(Pointee(IsEmpty()))); + IsOkAndHolds(Pointee(ElementsAre(EqualsSectionMetadata( + /*expected_id=*/0, kPropertyPath, property_config))))); } // The following types are considered non-indexable: diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc index 38042d0..3d540d6 100644 --- a/icing/schema/section-manager.cc +++ b/icing/schema/section-manager.cc @@ -15,15 +15,9 @@ #include "icing/schema/section-manager.h" #include <algorithm> -#include <cinttypes> -#include <cstddef> #include <cstdint> -#include <iterator> -#include <memory> #include <string> #include <string_view> -#include <unordered_map> -#include <unordered_set> #include <utility> #include <vector> @@ -35,7 +29,6 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/property-util.h" -#include "icing/schema/schema-util.h" #include "icing/schema/section.h" #include "icing/store/document-filter-data.h" #include "icing/store/key-mapper.h" @@ -99,12 +92,14 @@ SectionManager::Builder::ProcessSchemaTypePropertyConfig( return absl_ports::InvalidArgumentError("Invalid schema type id"); } - if (SchemaUtil::IsIndexedProperty(property_config)) { - ICING_RETURN_IF_ERROR( - AppendNewSectionMetadata(§ion_metadata_cache_[schema_type_id], - std::move(property_path), property_config)); - } - + // We don't need to check if the property is indexable. This method will + // only be called properties that should consume sectionIds, even if the + // property's indexing configuration itself is not indexable. + // This would be the case for unknown and non-indexable property paths that + // are defined in the indexable_nested_properties_list. 
+ ICING_RETURN_IF_ERROR( + AppendNewSectionMetadata(§ion_metadata_cache_[schema_type_id], + std::move(property_path), property_config)); return libtextclassifier3::Status::OK; } @@ -141,6 +136,13 @@ libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections( for (const SectionMetadata& section_metadata : *metadata_list) { switch (section_metadata.data_type) { case PropertyConfigProto::DataType::STRING: { + if (section_metadata.term_match_type == TermMatchType::UNKNOWN || + section_metadata.tokenizer == + StringIndexingConfig::TokenizerType::NONE) { + // Skip if term-match type is UNKNOWN, or if the tokenizer-type is + // NONE. + break; + } AppendSection( section_metadata, property_util::ExtractPropertyValuesFromDocument<std::string_view>( @@ -149,6 +151,11 @@ libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections( break; } case PropertyConfigProto::DataType::INT64: { + if (section_metadata.numeric_match_type == + IntegerIndexingConfig::NumericMatchType::UNKNOWN) { + // Skip if numeric-match type is UNKNOWN. 
+ break; + } AppendSection(section_metadata, property_util::ExtractPropertyValuesFromDocument<int64_t>( document, section_metadata.path), diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc index db2be6b..eee78e9 100644 --- a/icing/schema/section-manager_test.cc +++ b/icing/schema/section-manager_test.cc @@ -14,7 +14,6 @@ #include "icing/schema/section-manager.h" -#include <limits> #include <memory> #include <string> #include <string_view> @@ -25,7 +24,6 @@ #include "icing/file/filesystem.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" -#include "icing/proto/term.pb.h" #include "icing/schema-builder.h" #include "icing/schema/schema-type-manager.h" #include "icing/schema/schema-util.h" @@ -63,6 +61,28 @@ static constexpr std::string_view kTypeConversation = "Conversation"; static constexpr std::string_view kPropertyEmails = "emails"; static constexpr std::string_view kPropertyName = "name"; +// type and property names of Group +static constexpr std::string_view kTypeGroup = "Group"; +// indexable +static constexpr std::string_view kPropertyConversation = "conversation"; +static constexpr std::string_view kPropertyGroupName = "groupName"; +// nested indexable +static constexpr std::string_view kPropertyNestedConversationName = "name"; +static constexpr std::string_view kPropertyNestedConversationEmailRecipientIds = + "emails.recipientIds"; +static constexpr std::string_view kPropertyNestedConversationEmailRecipient = + "emails.recipients"; +static constexpr std::string_view kPropertyNestedConversationEmailSubject = + "emails.subject"; +// nested non-indexable +static constexpr std::string_view kPropertyNestedConversationEmailAttachment = + "emails.attachment"; +// non-existent property path +static constexpr std::string_view kPropertyNestedNonExistent = + "emails.nonExistentNestedProperty"; +static constexpr std::string_view kPropertyNestedNonExistent2 = + "emails.nonExistentNestedProperty2"; + constexpr 
int64_t kDefaultTimestamp = 1663274901; PropertyConfigProto CreateRecipientIdsPropertyConfig() { @@ -105,6 +125,22 @@ PropertyConfigProto CreateNamePropertyConfig() { .Build(); } +PropertyConfigProto CreateAttachmentPropertyConfig() { + return PropertyConfigBuilder() + .SetName(kPropertyAttachment) + .SetDataType(TYPE_BYTES) + .SetCardinality(CARDINALITY_OPTIONAL) + .Build(); +} + +PropertyConfigProto CreateGroupNamePropertyConfig() { + return PropertyConfigBuilder() + .SetName(kPropertyGroupName) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL) + .Build(); +} + SchemaTypeConfigProto CreateEmailTypeConfig() { return SchemaTypeConfigBuilder() .SetType(kTypeEmail) @@ -139,6 +175,28 @@ SchemaTypeConfigProto CreateConversationTypeConfig() { .Build(); } +SchemaTypeConfigProto CreateGroupTypeConfig() { + return SchemaTypeConfigBuilder() + .SetType(kTypeGroup) + .AddProperty(CreateGroupNamePropertyConfig()) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyConversation) + .SetDataTypeDocument( + kTypeConversation, + /*indexable_nested_properties_list=*/ + {std::string(kPropertyNestedConversationName), + std::string(kPropertyNestedConversationEmailRecipientIds), + std::string(kPropertyNestedConversationEmailSubject), + std::string(kPropertyNestedConversationEmailRecipient), + std::string(kPropertyNestedConversationEmailAttachment), + std::string(kPropertyNestedNonExistent2), + std::string(kPropertyNestedNonExistent), + std::string(kPropertyNestedNonExistent)}) + .SetCardinality(CARDINALITY_REPEATED)) + .Build(); +} + class SectionManagerTest : public ::testing::Test { protected: void SetUp() override { @@ -146,9 +204,11 @@ class SectionManagerTest : public ::testing::Test { auto email_type = CreateEmailTypeConfig(); auto conversation_type = CreateConversationTypeConfig(); + auto group_type = CreateGroupTypeConfig(); type_config_map_.emplace(email_type.schema_type(), email_type); 
type_config_map_.emplace(conversation_type.schema_type(), conversation_type); + type_config_map_.emplace(group_type.schema_type(), group_type); // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each // one 128KiB so the total DynamicTrieKeyMapper should get 384KiB @@ -158,6 +218,7 @@ class SectionManagerTest : public ::testing::Test { filesystem_, test_dir_, key_mapper_size)); ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeEmail, 0)); ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeConversation, 1)); + ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeGroup, 2)); email_document_ = DocumentBuilder() @@ -183,6 +244,15 @@ class SectionManagerTest : public ::testing::Test { DocumentProto(email_document_), DocumentProto(email_document_)) .Build(); + + group_document_ = + DocumentBuilder() + .SetKey("icing", "group/1") + .SetSchema(std::string(kTypeGroup)) + .AddDocumentProperty(std::string(kPropertyConversation), + DocumentProto(conversation_document_)) + .AddStringProperty(std::string(kPropertyGroupName), "group_name_1") + .Build(); } void TearDown() override { @@ -197,6 +267,7 @@ class SectionManagerTest : public ::testing::Test { DocumentProto email_document_; DocumentProto conversation_document_; + DocumentProto group_document_; }; TEST_F(SectionManagerTest, ExtractSections) { @@ -295,6 +366,91 @@ TEST_F(SectionManagerTest, ExtractSectionsNested) { ElementsAre(kDefaultTimestamp, kDefaultTimestamp)); } +TEST_F(SectionManagerTest, ExtractSectionsIndexableNestedPropertiesList) { + // Use SchemaTypeManager factory method to instantiate SectionManager. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaTypeManager> schema_type_manager, + SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get())); + + // Extracts all sections from 'Group' document + ICING_ASSERT_OK_AND_ASSIGN( + SectionGroup section_group, + schema_type_manager->section_manager().ExtractSections(group_document_)); + + // SectionId assignments: + // 0 -> conversation.emails.attachment (bytes, non-indexable) + // 1 -> conversation.emails.recipientIds (int64) + // 2 -> conversation.emails.recipients (string) + // 3 -> conversation.emails.subject (string) + // 4 -> conversation.name + // (string, but no entry for this in conversation_document_) + // 5 -> groupName (string) + // 6 -> conversation.emails.nonExistentNestedProperty + // (unknown, non-indexable) + // 7 -> conversation.emails.nonExistentNestedProperty2 + // (unknown, non-indexable) + // + // SectionId assignment order: + // - We assign section ids to known (existing) properties first in alphabet + // order. + // - After handling all known properties, we assign section ids to all unknown + // (non-existent) properties that are specified in the + // indexable_nested_properties_list. + // - As a result, assignment of the entire section set is not done + // alphabetically, but assignment is still deterministic and alphabetical + // order is preserved inside the known properties and unknown properties + // sets individually. + // + // 'conversation.emails.attachment', + // 'conversation.emails.nonExistentNestedProperty' and + // 'conversation.emails.nonExistentNestedProperty2' are assigned sectionIds + // even though they are non-indexable because they appear in 'Group' schema + // type's indexable_nested_props_list. + // However 'conversation.emails.attachment' does not exist in section_group + // (even though the property exists and has a sectionId assignment) as + // SectionManager::ExtractSections only extracts indexable string and integer + // section data from a document. 
+ + // String sections + EXPECT_THAT(section_group.string_sections, SizeIs(3)); + + EXPECT_THAT(section_group.string_sections[0].metadata, + EqualsSectionMetadata( + /*expected_id=*/2, + /*expected_property_path=*/"conversation.emails.recipients", + CreateRecipientsPropertyConfig())); + EXPECT_THAT(section_group.string_sections[0].content, + ElementsAre("recipient1", "recipient2", "recipient3", + "recipient1", "recipient2", "recipient3")); + + EXPECT_THAT(section_group.string_sections[1].metadata, + EqualsSectionMetadata( + /*expected_id=*/3, + /*expected_property_path=*/"conversation.emails.subject", + CreateSubjectPropertyConfig())); + EXPECT_THAT(section_group.string_sections[1].content, + ElementsAre("the subject", "the subject")); + + EXPECT_THAT(section_group.string_sections[2].metadata, + EqualsSectionMetadata( + /*expected_id=*/5, + /*expected_property_path=*/"groupName", + CreateGroupNamePropertyConfig())); + EXPECT_THAT(section_group.string_sections[2].content, + ElementsAre("group_name_1")); + + // Integer sections + EXPECT_THAT(section_group.integer_sections, SizeIs(1)); + + EXPECT_THAT(section_group.integer_sections[0].metadata, + EqualsSectionMetadata( + /*expected_id=*/1, + /*expected_property_path=*/"conversation.emails.recipientIds", + CreateRecipientIdsPropertyConfig())); + EXPECT_THAT(section_group.integer_sections[0].content, + ElementsAre(1, 2, 3, 1, 2, 3)); +} + TEST_F(SectionManagerTest, GetSectionMetadata) { // Use SchemaTypeManager factory method to instantiate SectionManager. 
ICING_ASSERT_OK_AND_ASSIGN( @@ -352,6 +508,86 @@ TEST_F(SectionManagerTest, GetSectionMetadata) { IsOkAndHolds(Pointee(EqualsSectionMetadata( /*expected_id=*/4, /*expected_property_path=*/"name", CreateNamePropertyConfig())))); + + // Group (section id -> section property path): + // 0 -> conversation.emails.attachment (non-indexable) + // 1 -> conversation.emails.recipientIds + // 2 -> conversation.emails.recipients + // 3 -> conversation.emails.subject + // 4 -> conversation.name + // 5 -> groupName + // 6 -> conversation.emails.nonExistentNestedProperty (non-indexable) + // 7 -> conversation.emails.nonExistentNestedProperty2 (non-indexable) + // + // SectionId assignment order: + // - We assign section ids to known (existing) properties first in alphabet + // order. + // - After handling all known properties, we assign section ids to all unknown + // (non-existent) properties that are specified in the + // indexable_nested_properties_list. + // - As a result, assignment of the entire section set is not done + // alphabetically, but assignment is still deterministic and alphabetical + // order is preserved inside the known properties and unknown properties + // sets individually. 
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/0), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/0, + /*expected_property_path=*/"conversation.emails.attachment", + CreateAttachmentPropertyConfig())))); + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/1), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/1, + /*expected_property_path=*/"conversation.emails.recipientIds", + CreateRecipientIdsPropertyConfig())))); + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/2), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/2, + /*expected_property_path=*/"conversation.emails.recipients", + CreateRecipientsPropertyConfig())))); + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/3), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/3, + /*expected_property_path=*/"conversation.emails.subject", + CreateSubjectPropertyConfig())))); + EXPECT_THAT( + schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/4), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/4, /*expected_property_path=*/"conversation.name", + CreateNamePropertyConfig())))); + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/5), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/5, /*expected_property_path=*/"groupName", + CreateGroupNamePropertyConfig())))); + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/6), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/6, + /*expected_property_path=*/ + "conversation.emails.nonExistentNestedProperty", + PropertyConfigBuilder() + 
.SetName("nonExistentNestedProperty") + .SetDataType(TYPE_UNKNOWN) + .Build())))); + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/7), + IsOkAndHolds(Pointee(EqualsSectionMetadata( + /*expected_id=*/7, + /*expected_property_path=*/ + "conversation.emails.nonExistentNestedProperty2", + PropertyConfigBuilder() + .SetName("nonExistentNestedProperty2") + .SetDataType(TYPE_UNKNOWN) + .Build())))); + // Check that no more properties are indexed + EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( + /*schema_type_id=*/2, /*section_id=*/8), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST_F(SectionManagerTest, GetSectionMetadataInvalidSchemaTypeId) { @@ -359,13 +595,13 @@ TEST_F(SectionManagerTest, GetSectionMetadataInvalidSchemaTypeId) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaTypeManager> schema_type_manager, SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get())); - ASSERT_THAT(type_config_map_, SizeIs(2)); + ASSERT_THAT(type_config_map_, SizeIs(3)); EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( /*schema_type_id=*/-1, /*section_id=*/0), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata( - /*schema_type_id=*/2, /*section_id=*/0), + /*schema_type_id=*/3, /*section_id=*/0), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } diff --git a/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc b/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc index bdafa28..3612359 100644 --- a/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc +++ b/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc @@ -37,13 +37,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { SchemaStore::Create(&filesystem, schema_store_dir, &fake_clock) .ValueOrDie(); std::unique_ptr<DocumentStore> 
document_store = - DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr) + DocumentStore::Create( + &filesystem, doc_store_dir, &fake_clock, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr) .ValueOrDie() .document_store; diff --git a/icing/scoring/advanced_scoring/advanced-scorer_test.cc b/icing/scoring/advanced_scoring/advanced-scorer_test.cc index 0ecc21d..cc1d413 100644 --- a/icing/scoring/advanced_scoring/advanced-scorer_test.cc +++ b/icing/scoring/advanced_scoring/advanced-scorer_test.cc @@ -64,13 +64,14 @@ class AdvancedScorerTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); // Creates a simple email schema diff --git a/icing/scoring/score-and-rank_benchmark.cc b/icing/scoring/score-and-rank_benchmark.cc index abb019f..7cb5a95 100644 --- 
a/icing/scoring/score-and-rank_benchmark.cc +++ b/icing/scoring/score-and-rank_benchmark.cc @@ -95,7 +95,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index 4a97a87..5194c7f 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -64,13 +64,14 @@ class ScorerTest : public ::testing::TestWithParam<ScorerTestingMode> { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock1_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock1_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); // Creates a simple email schema diff --git a/icing/scoring/scoring-processor.cc b/icing/scoring/scoring-processor.cc index 8284426..b827bd8 100644 --- a/icing/scoring/scoring-processor.cc +++ b/icing/scoring/scoring-processor.cc @@ -14,7 +14,9 @@ #include "icing/scoring/scoring-processor.h" +#include <limits> #include <memory> +#include <string> #include <unordered_map> #include <utility> 
#include <vector> @@ -68,7 +70,8 @@ ScoringProcessor::Create(const ScoringSpecProto& scoring_spec, std::vector<ScoredDocumentHit> ScoringProcessor::Score( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator, int num_to_score, std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>* - query_term_iterators) { + query_term_iterators, + QueryStatsProto::SearchStats* search_stats) { std::vector<ScoredDocumentHit> scored_document_hits; scorer_->PrepareToScore(query_term_iterators); @@ -85,6 +88,18 @@ std::vector<ScoredDocumentHit> ScoringProcessor::Score( doc_hit_info.document_id(), doc_hit_info.hit_section_ids_mask(), score); } + if (search_stats != nullptr) { + search_stats->set_num_documents_scored(scored_document_hits.size()); + DocHitInfoIterator::CallStats iterator_call_stats = + doc_hit_info_iterator->GetCallStats(); + search_stats->set_num_fetched_hits_lite_index( + iterator_call_stats.num_leaf_advance_calls_lite_index); + search_stats->set_num_fetched_hits_main_index( + iterator_call_stats.num_leaf_advance_calls_main_index); + search_stats->set_num_fetched_hits_integer_index( + iterator_call_stats.num_leaf_advance_calls_integer_index); + } + return scored_document_hits; } diff --git a/icing/scoring/scoring-processor.h b/icing/scoring/scoring-processor.h index e9efda7..8634a22 100644 --- a/icing/scoring/scoring-processor.h +++ b/icing/scoring/scoring-processor.h @@ -15,14 +15,19 @@ #ifndef ICING_SCORING_SCORING_PROCESSOR_H_ #define ICING_SCORING_SCORING_PROCESSOR_H_ +#include <cstdint> #include <memory> +#include <string> +#include <unordered_map> #include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/join/join-children-fetcher.h" +#include "icing/proto/logging.pb.h" #include "icing/proto/scoring.pb.h" +#include "icing/schema/schema-store.h" #include "icing/scoring/scored-document-hit.h" #include "icing/scoring/scorer.h" #include 
"icing/store/document-store.h" @@ -57,7 +62,8 @@ class ScoringProcessor { std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator, int num_to_score, std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>* - query_term_iterators = nullptr); + query_term_iterators = nullptr, + QueryStatsProto::SearchStats* search_stats = nullptr); private: explicit ScoringProcessor(std::unique_ptr<Scorer> scorer) diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc index 644e013..deddff8 100644 --- a/icing/scoring/scoring-processor_test.cc +++ b/icing/scoring/scoring-processor_test.cc @@ -62,13 +62,14 @@ class ScoringProcessorTest ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); // Creates a simple email schema diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index e99bacf..094eea1 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -53,7 +53,9 @@ #include "icing/store/document-id.h" #include "icing/store/document-log-creator.h" #include "icing/store/dynamic-trie-key-mapper.h" +#include "icing/store/namespace-fingerprint-identifier.h" #include "icing/store/namespace-id.h" +#include "icing/store/persistent-hash-map-key-mapper.h" #include 
"icing/store/usage-store.h" #include "icing/tokenization/language-segmenter.h" #include "icing/util/clock.h" @@ -73,6 +75,7 @@ namespace { // Used in DocumentId mapper to mark a document as deleted constexpr int64_t kDocDeletedFlag = -1; constexpr char kDocumentIdMapperFilename[] = "document_id_mapper"; +constexpr char kUriHashMapperWorkingPath[] = "uri_mapper"; constexpr char kDocumentStoreHeaderFilename[] = "document_store_header"; constexpr char kScoreCacheFilename[] = "score_cache"; constexpr char kCorpusScoreCache[] = "corpus_score_cache"; @@ -81,9 +84,17 @@ constexpr char kNamespaceMapperFilename[] = "namespace_mapper"; constexpr char kUsageStoreDirectoryName[] = "usage_store"; constexpr char kCorpusIdMapperFilename[] = "corpus_mapper"; -// Determined through manual testing to allow for 1 million uris. 1 million -// because we allow up to 1 million DocumentIds. -constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB +// Determined through manual testing to allow for 4 million uris. 4 million +// because we allow up to 4 million DocumentIds. +constexpr int32_t kUriDynamicTrieKeyMapperMaxSize = + 144 * 1024 * 1024; // 144 MiB + +constexpr int32_t kUriHashKeyMapperMaxNumEntries = + kMaxDocumentId + 1; // 1 << 22, 4M +// - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1 +// byte) +// - Value: DocumentId (4 bytes) +constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId); // 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a // max of 128 KiB for storage. 
@@ -100,6 +111,10 @@ std::string MakeHeaderFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename); } +std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath); +} + std::string MakeDocumentIdMapperFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename); } @@ -128,25 +143,6 @@ std::string MakeCorpusMapperFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename); } -// This function will encode a namespace id into a fixed 3 bytes string. -std::string EncodeNamespaceId(NamespaceId namespace_id) { - // encoding should be 1 to 3 bytes based on the value of namespace_id. - std::string encoding = encode_util::EncodeIntToCString(namespace_id); - // Make encoding to fixed 3 bytes. - while (encoding.size() < 3) { - // DynamicTrie cannot handle keys with 0 as bytes, so we append it using 1, - // just like what we do in encode_util::EncodeIntToCString. - // - // The reason that this works is because DecodeIntToString decodes a byte - // value of 0x01 as 0x00. When EncodeIntToCString returns a namespaceid - // encoding that is less than 3 bytes, it means that the id contains - // unencoded leading 0x00. So here we're explicitly encoding those bytes as - // 0x01. 
- encoding.push_back(1); - } - return encoding; -} - int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, int64_t ttl_ms) { if (ttl_ms == 0) { @@ -207,6 +203,41 @@ std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces( return namespace_ids_to_namespaces; } +libtextclassifier3::StatusOr<std::unique_ptr< + KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>> +CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir, + bool pre_mapping_fbv, bool use_persistent_hash_map) { + std::string uri_hash_mapper_working_path = + MakeUriHashMapperWorkingPath(base_dir); + // Due to historic issue, we use document store's base_dir directly as + // DynamicTrieKeyMapper's working directory for uri mapper. + // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the + // actual files will be put under "<base_dir>/key_mapper_dir/". + bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists( + absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str()); + bool persistent_hash_map_dir_exists = + filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str()); + if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) || + (!use_persistent_hash_map && persistent_hash_map_dir_exists)) { + // Return a failure here so that the caller can properly delete and rebuild + // this component. 
+ return absl_ports::FailedPreconditionError("Key mapper type mismatch"); + } + + if (use_persistent_hash_map) { + return PersistentHashMapKeyMapper< + DocumentId, fingerprint_util::FingerprintStringFormatter>:: + Create(filesystem, std::move(uri_hash_mapper_working_path), + pre_mapping_fbv, + /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries, + /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize); + } else { + return DynamicTrieKeyMapper<DocumentId, + fingerprint_util::FingerprintStringFormatter>:: + Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize); + } +} + } // namespace std::string DocumentStore::MakeFingerprint( @@ -220,9 +251,8 @@ std::string DocumentStore::MakeFingerprint( absl_ports::StrCat(namespace_, uri_or_schema)); return fingerprint_util::GetFingerprintString(fprint); } else { - return absl_ports::StrCat(EncodeNamespaceId(namespace_id), - encode_util::EncodeIntToCString( - tc3farmhash::Fingerprint64(uri_or_schema))); + return NamespaceFingerprintIdentifier(namespace_id, uri_or_schema) + .EncodeToCString(); } } @@ -231,6 +261,7 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, const Clock* clock, const SchemaStore* schema_store, bool namespace_id_fingerprint, + bool pre_mapping_fbv, bool use_persistent_hash_map, int32_t compression_level) : filesystem_(filesystem), base_dir_(base_dir), @@ -238,6 +269,8 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, schema_store_(schema_store), document_validator_(schema_store), namespace_id_fingerprint_(namespace_id_fingerprint), + pre_mapping_fbv_(pre_mapping_fbv), + use_persistent_hash_map_(use_persistent_hash_map), compression_level_(compression_level) {} libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( @@ -266,6 +299,7 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, bool force_recovery_and_revalidate_documents, bool 
namespace_id_fingerprint, + bool pre_mapping_fbv, bool use_persistent_hash_map, int32_t compression_level, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); @@ -273,15 +307,17 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore( filesystem, base_dir, clock, schema_store, namespace_id_fingerprint, - compression_level)); + pre_mapping_fbv, use_persistent_hash_map, compression_level)); ICING_ASSIGN_OR_RETURN( - DataLoss data_loss, + InitializeResult initialize_result, document_store->Initialize(force_recovery_and_revalidate_documents, initialize_stats)); CreateResult create_result; create_result.document_store = std::move(document_store); - create_result.data_loss = data_loss; + create_result.data_loss = initialize_result.data_loss; + create_result.derived_files_regenerated = + initialize_result.derived_files_regenerated; return create_result; } @@ -293,9 +329,12 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( return absl_ports::InternalError("Couldn't delete header file"); } - // Document key mapper + // Document key mapper. Doesn't hurt to delete both dynamic trie and + // persistent hash map without checking. 
ICING_RETURN_IF_ERROR( DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir)); + ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete( + *filesystem, MakeUriHashMapperWorkingPath(base_dir))); // Document id mapper ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete( @@ -324,9 +363,9 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( - bool force_recovery_and_revalidate_documents, - InitializeStatsProto* initialize_stats) { +libtextclassifier3::StatusOr<DocumentStore::InitializeResult> +DocumentStore::Initialize(bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats) { auto create_result_or = DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_); @@ -344,6 +383,7 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( InitializeStatsProto::RecoveryCause recovery_cause = GetRecoveryCause(create_result, force_recovery_and_revalidate_documents); + bool derived_files_regenerated = false; if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) { ICING_LOG(INFO) << "Starting Document Store Recovery with cause=" << recovery_cause << ", and create result { new_file=" @@ -360,16 +400,18 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles(force_recovery_and_revalidate_documents); - if (initialize_stats != nullptr && - recovery_cause != InitializeStatsProto::NONE) { + if (recovery_cause != InitializeStatsProto::NONE) { // Only consider it a recovery if the client forced a recovery or there // was data loss. Otherwise, this could just be the first time we're // initializing and generating derived files. 
- initialize_stats->set_document_store_recovery_latency_ms( - document_recovery_timer->GetElapsedMilliseconds()); - initialize_stats->set_document_store_recovery_cause(recovery_cause); - initialize_stats->set_document_store_data_status( - GetDataStatus(create_result.log_create_result.data_loss)); + derived_files_regenerated = true; + if (initialize_stats != nullptr) { + initialize_stats->set_document_store_recovery_latency_ms( + document_recovery_timer->GetElapsedMilliseconds()); + initialize_stats->set_document_store_recovery_cause(recovery_cause); + initialize_stats->set_document_store_data_status( + GetDataStatus(create_result.log_create_result.data_loss)); + } } if (!status.ok()) { ICING_LOG(ERROR) @@ -382,6 +424,7 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); + derived_files_regenerated = true; libtextclassifier3::Status status = RegenerateDerivedFiles( /*force_recovery_and_revalidate_documents=*/false); if (initialize_stats != nullptr) { @@ -403,7 +446,10 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( initialize_stats->set_num_documents(document_id_mapper_->num_elements()); } - return create_result.log_create_result.data_loss; + InitializeResult initialize_result = { + .data_loss = create_result.log_create_result.data_loss, + .derived_files_regenerated = derived_files_regenerated}; + return initialize_result; } libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() { @@ -429,11 +475,8 @@ libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() { // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. 
- auto document_key_mapper_or = DynamicTrieKeyMapper< - DocumentId, - fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_, - base_dir_, - kUriMapperMaxSize); + auto document_key_mapper_or = CreateUriMapper( + *filesystem_, base_dir_, pre_mapping_fbv_, use_persistent_hash_map_); if (!document_key_mapper_or.ok()) { ICING_LOG(ERROR) << document_key_mapper_or.status().error_message() << "Failed to initialize KeyMapper"; @@ -646,6 +689,10 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( } libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { + // Only one type of KeyMapper (either DynamicTrieKeyMapper or + // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok + // to call Delete() for both since Delete() returns OK if any of them doesn't + // exist. // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_key_mapper_.reset(); // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR @@ -654,17 +701,21 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() - << "Failed to delete old key mapper"; + << "Failed to delete old dynamic trie key mapper"; + return status; + } + status = PersistentHashMapKeyMapper<DocumentId>::Delete( + *filesystem_, MakeUriHashMapperWorkingPath(base_dir_)); + if (!status.ok()) { + ICING_LOG(ERROR) << status.error_message() + << "Failed to delete old persistent hash map key mapper"; return status; } // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. 
- auto document_key_mapper_or = DynamicTrieKeyMapper< - DocumentId, - fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_, - base_dir_, - kUriMapperMaxSize); + auto document_key_mapper_or = CreateUriMapper( + *filesystem_, base_dir_, pre_mapping_fbv_, use_persistent_hash_map_); if (!document_key_mapper_or.ok()) { ICING_LOG(ERROR) << document_key_mapper_or.status().error_message() << "Failed to re-init key mapper"; @@ -1116,6 +1167,25 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId( "Failed to find DocumentId by key: ", name_space, ", ", uri)); } +libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId( + const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier) + const { + if (!namespace_id_fingerprint_) { + return absl_ports::FailedPreconditionError( + "Cannot lookup document id by namespace id + fingerprint without " + "enabling it on uri_mapper"); + } + + auto document_id_or = document_key_mapper_->Get( + namespace_fingerprint_identifier.EncodeToCString()); + if (document_id_or.ok()) { + return document_id_or.ValueOrDie(); + } + return absl_ports::Annotate( + std::move(document_id_or).status(), + "Failed to find DocumentId by namespace id + fingerprint"); +} + std::vector<std::string> DocumentStore::GetAllNamespaces() const { std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace = GetNamespaceIdsToNamespaces(namespace_mapper_.get()); @@ -1768,11 +1838,10 @@ libtextclassifier3::Status DocumentStore::Optimize() { return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<std::vector<DocumentId>> +libtextclassifier3::StatusOr<DocumentStore::OptimizeResult> DocumentStore::OptimizeInto(const std::string& new_directory, const LanguageSegmenter* lang_segmenter, - bool namespace_id_fingerprint, - OptimizeStatsProto* stats) { + OptimizeStatsProto* stats) const { // Validates directory if (new_directory == base_dir_) { return absl_ports::InvalidArgumentError( @@ -1783,26 
+1852,29 @@ DocumentStore::OptimizeInto(const std::string& new_directory, auto doc_store_create_result, DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_, /*force_recovery_and_revalidate_documents=*/false, - namespace_id_fingerprint, compression_level_, + namespace_id_fingerprint_, pre_mapping_fbv_, + use_persistent_hash_map_, compression_level_, /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> new_doc_store = std::move(doc_store_create_result.document_store); // Writes all valid docs into new document store (new directory) - int size = document_id_mapper_->num_elements(); - int num_deleted = 0; - int num_expired = 0; + int document_cnt = document_id_mapper_->num_elements(); + int num_deleted_documents = 0; + int num_expired_documents = 0; UsageStore::UsageScores default_usage; - std::vector<DocumentId> document_id_old_to_new(size, kInvalidDocumentId); + + OptimizeResult result; + result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId); int64_t current_time_ms = clock_.GetSystemTimeMilliseconds(); - for (DocumentId document_id = 0; document_id < size; document_id++) { + for (DocumentId document_id = 0; document_id < document_cnt; document_id++) { auto document_or = Get(document_id, /*clear_internal_fields=*/false); if (absl_ports::IsNotFound(document_or.status())) { if (IsDeleted(document_id)) { - ++num_deleted; + ++num_deleted_documents; } else if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) { - ++num_expired; + ++num_expired_documents; } continue; } else if (!document_or.ok()) { @@ -1842,7 +1914,8 @@ DocumentStore::OptimizeInto(const std::string& new_directory, return new_document_id_or.status(); } - document_id_old_to_new[document_id] = new_document_id_or.ValueOrDie(); + result.document_id_old_to_new[document_id] = + new_document_id_or.ValueOrDie(); // Copy over usage scores. 
ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores, @@ -1856,13 +1929,61 @@ DocumentStore::OptimizeInto(const std::string& new_directory, new_doc_store->SetUsageScores(new_document_id, usage_scores)); } } + + // Construct namespace_id_old_to_new + int namespace_cnt = namespace_mapper_->num_keys(); + std::unordered_map<NamespaceId, std::string> old_namespaces = + GetNamespaceIdsToNamespaces(namespace_mapper_.get()); + if (namespace_cnt != old_namespaces.size()) { + // This really shouldn't happen. If it really happens, then: + // - It won't block DocumentStore optimization, so don't return error here. + // - Instead, write a warning log here and hint the caller to rebuild index. + ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt + << " vs " << old_namespaces.size(); + result.should_rebuild_index = true; + } else { + result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId); + for (const auto& [old_namespace_id, ns] : old_namespaces) { + if (old_namespace_id >= result.namespace_id_old_to_new.size()) { + // This really shouldn't happen. If it really happens, then: + // - It won't block DocumentStore optimization, so don't return error + // here. + // - Instead, write a warning log here and hint the caller to rebuild + // index. + ICING_LOG(WARNING) << "Found unexpected namespace id " + << old_namespace_id << ". Should be in range 0 to " + << result.namespace_id_old_to_new.size() + << " (exclusive)."; + result.namespace_id_old_to_new.clear(); + result.should_rebuild_index = true; + break; + } + + auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns); + if (!new_namespace_id_or.ok()) { + if (absl_ports::IsNotFound(new_namespace_id_or.status())) { + continue; + } + // Real error, return it. + return std::move(new_namespace_id_or).status(); + } + + NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie(); + // Safe to use bracket to assign given that we've checked the range above. 
+ result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id; + } + } + if (stats != nullptr) { - stats->set_num_original_documents(size); - stats->set_num_deleted_documents(num_deleted); - stats->set_num_expired_documents(num_expired); + stats->set_num_original_documents(document_cnt); + stats->set_num_deleted_documents(num_deleted_documents); + stats->set_num_expired_documents(num_expired_documents); + stats->set_num_original_namespaces(namespace_cnt); + stats->set_num_deleted_namespaces( + namespace_cnt - new_doc_store->namespace_mapper_->num_keys()); } ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL)); - return document_id_old_to_new; + return result; } libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo> diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 3941f6d..c228e8b 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -43,6 +43,7 @@ #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" #include "icing/store/key-mapper.h" +#include "icing/store/namespace-fingerprint-identifier.h" #include "icing/store/namespace-id.h" #include "icing/store/usage-store.h" #include "icing/tokenization/language-segmenter.h" @@ -106,6 +107,11 @@ class DocumentStore { // unpersisted. This may be used to signal that any derived data off of the // document store may need to be regenerated. DataLoss data_loss; + + // A boolean flag indicating if derived files of the document store have + // been regenerated or not. This is usually a signal for callers to detect + // if any id assignment has changed (e.g. NamespaceId). 
+ bool derived_files_regenerated; }; // Not copyable @@ -142,8 +148,8 @@ class DocumentStore { const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, bool force_recovery_and_revalidate_documents, - bool namespace_id_fingerprint, - int32_t compression_level, + bool namespace_id_fingerprint, bool pre_mapping_fbv, + bool use_persistent_hash_map, int32_t compression_level, InitializeStatsProto* initialize_stats); // Discards all derived data in the document store. @@ -270,6 +276,21 @@ class DocumentStore { libtextclassifier3::StatusOr<DocumentId> GetDocumentId( std::string_view name_space, std::string_view uri) const; + // Helper method to find a DocumentId that is associated with the given + // NamespaceFingerprintIdentifier. + // + // NOTE: The DocumentId may refer to a invalid document (deleted + // or expired). Callers can call DoesDocumentExist(document_id) to ensure it + // refers to a valid Document. + // + // Returns: + // A DocumentId on success + // NOT_FOUND if the key doesn't exist + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<DocumentId> GetDocumentId( + const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier) + const; + // Returns the CorpusId associated with the given namespace and schema. // // Returns: @@ -439,10 +460,23 @@ class DocumentStore { // INTERNAL_ERROR on IO error libtextclassifier3::Status Optimize(); + struct OptimizeResult { + // A vector that maps old document id to new document id. + std::vector<DocumentId> document_id_old_to_new; + + // A vector that maps old namespace id to new namespace id. Will be empty if + // should_rebuild_index is set to true. + std::vector<NamespaceId> namespace_id_old_to_new; + + // A boolean flag that hints the caller (usually IcingSearchEngine) if it + // should rebuild index instead of adopting the id changes via the 2 vectors + // above. It will be set to true if finding any id inconsistency. 
+ bool should_rebuild_index = false; + }; // Copy data from current base directory into a new directory. Any outdated or - // deleted data won't be copied. During the process, document ids will be - // reassigned so any files / classes that are based on old document ids may be - // outdated. + // deleted data won't be copied. During the process, document/namespace ids + // will be reassigned so any files / classes that are based on old + // document/namespace ids may be outdated. // // stats will be set if non-null. // @@ -451,12 +485,14 @@ class DocumentStore { // method based on device usage. // // Returns: - // A vector that maps from old document id to new document id on success + // OptimizeResult which contains a vector mapping from old document id to + // new document id and another vector mapping from old namespace id to new + // namespace id, on success // INVALID_ARGUMENT if new_directory is same as current base directory // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeInto( + libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto( const std::string& new_directory, const LanguageSegmenter* lang_segmenter, - bool namespace_id_fingerprint, OptimizeStatsProto* stats = nullptr); + OptimizeStatsProto* stats = nullptr) const; // Calculates status for a potential Optimize call. Includes how many docs // there are vs how many would be optimized away. And also includes an @@ -488,9 +524,12 @@ class DocumentStore { private: // Use DocumentStore::Create() to instantiate. 
- DocumentStore(const Filesystem* filesystem, std::string_view base_dir, - const Clock* clock, const SchemaStore* schema_store, - bool namespace_id_fingerprint, int32_t compression_level); + explicit DocumentStore(const Filesystem* filesystem, + std::string_view base_dir, const Clock* clock, + const SchemaStore* schema_store, + bool namespace_id_fingerprint, bool pre_mapping_fbv, + bool use_persistent_hash_map, + int32_t compression_level); const Filesystem* const filesystem_; const std::string base_dir_; @@ -507,6 +546,15 @@ class DocumentStore { // document_key_mapper_ and corpus_mapper_. bool namespace_id_fingerprint_; + // Flag indicating whether memory map max possible file size for underlying + // FileBackedVector before growing the actual file size. + bool pre_mapping_fbv_; + + // Flag indicating whether use persistent hash map as the key mapper (if + // false, then fall back to dynamic trie key mapper). Note: we only use + // persistent hash map for uri mapper if it is true. + bool use_persistent_hash_map_; + const int32_t compression_level_; // A log used to store all documents, it serves as a ground truth of doc @@ -568,7 +616,15 @@ class DocumentStore { // worry about this field. bool initialized_ = false; - libtextclassifier3::StatusOr<DataLoss> Initialize( + struct InitializeResult { + DataLoss data_loss; + + // A boolean flag indicating if derived files of the document store have + // been regenerated or not. This is usually a signal for callers to detect + // if any id assignment has changed (e.g. NamespaceId). 
+ bool derived_files_regenerated; + }; + libtextclassifier3::StatusOr<InitializeResult> Initialize( bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats); diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc index 75995e9..46d76d8 100644 --- a/icing/store/document-store_benchmark.cc +++ b/icing/store/document-store_benchmark.cc @@ -132,7 +132,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } @@ -162,8 +163,9 @@ void BM_DoesDocumentExistBenchmark(benchmark::State& state) { // stuff. ICING_ASSERT_OK(document_store->Put( CreateDocument("namespace", /*uri=*/std::to_string(i)))); - document_store->Delete("namespace", /*uri=*/std::to_string(i), - clock.GetSystemTimeMilliseconds()); + ICING_ASSERT_OK(document_store->Delete("namespace", + /*uri=*/std::to_string(i), + clock.GetSystemTimeMilliseconds())); } std::default_random_engine random; diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 9a1f4a6..2d4cd99 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -47,6 +47,7 @@ #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" #include "icing/store/document-log-creator.h" +#include "icing/store/namespace-fingerprint-identifier.h" #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" @@ -71,6 +72,7 @@ using ::testing::Ge; using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsEmpty; +using ::testing::IsFalse; using 
::testing::IsTrue; using ::testing::Not; using ::testing::Return; @@ -120,7 +122,21 @@ void WriteDocumentLogHeader( sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header)); } -class DocumentStoreTest : public ::testing::Test { +struct DocumentStoreTestParam { + bool namespace_id_fingerprint; + bool pre_mapping_fbv; + bool use_persistent_hash_map; + + explicit DocumentStoreTestParam(bool namespace_id_fingerprint_in, + bool pre_mapping_fbv_in, + bool use_persistent_hash_map_in) + : namespace_id_fingerprint(namespace_id_fingerprint_in), + pre_mapping_fbv(pre_mapping_fbv_in), + use_persistent_hash_map(use_persistent_hash_map_in) {} +}; + +class DocumentStoreTest + : public ::testing::TestWithParam<DocumentStoreTestParam> { protected: DocumentStoreTest() : test_dir_(GetTestTempDir() + "/icing"), @@ -213,7 +229,7 @@ class DocumentStoreTest : public ::testing::Test { absl_ports::StrCat(document_store_dir_, "/document_store_header"); DocumentStore::Header header; header.magic = DocumentStore::Header::GetCurrentMagic( - /*namespace_id_fingerprint=*/false); + GetParam().namespace_id_fingerprint); header.checksum = 10; // Arbitrary garbage checksum filesystem_.DeleteFile(header_file.c_str()); filesystem_.Write(header_file.c_str(), &header, sizeof(header)); @@ -225,7 +241,8 @@ class DocumentStoreTest : public ::testing::Test { return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } @@ -254,7 +271,7 @@ class DocumentStoreTest : public ::testing::Test { const int64_t document2_expiration_timestamp_ = 3; // creation + ttl }; -TEST_F(DocumentStoreTest, CreationWithNullPointerShouldFail) { +TEST_P(DocumentStoreTest, CreationWithNullPointerShouldFail) { 
EXPECT_THAT(CreateDocumentStore(/*filesystem=*/nullptr, document_store_dir_, &fake_clock_, schema_store_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); @@ -268,7 +285,7 @@ TEST_F(DocumentStoreTest, CreationWithNullPointerShouldFail) { StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } -TEST_F(DocumentStoreTest, CreationWithBadFilesystemShouldFail) { +TEST_P(DocumentStoreTest, CreationWithBadFilesystemShouldFail) { MockFilesystem mock_filesystem; ON_CALL(mock_filesystem, OpenForWrite(_)).WillByDefault(Return(false)); @@ -277,7 +294,7 @@ TEST_F(DocumentStoreTest, CreationWithBadFilesystemShouldFail) { StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } -TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) { +TEST_P(DocumentStoreTest, PutAndGetInSameNamespaceOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -297,7 +314,7 @@ TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) { IsOkAndHolds(EqualsProto(test_document2_))); } -TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) { +TEST_P(DocumentStoreTest, PutAndGetAcrossNamespacesOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -330,7 +347,7 @@ TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) { // Validates that putting an document with the same key will overwrite previous // document and old doc ids are not getting reused. 
-TEST_F(DocumentStoreTest, PutSameKey) { +TEST_P(DocumentStoreTest, PutSameKey) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -359,7 +376,7 @@ TEST_F(DocumentStoreTest, PutSameKey) { EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1))); } -TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) { +TEST_P(DocumentStoreTest, IsDocumentExistingWithoutStatus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -395,7 +412,7 @@ TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) { fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { +TEST_P(DocumentStoreTest, GetDeletedDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -416,7 +433,7 @@ TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) { +TEST_P(DocumentStoreTest, GetExpiredDocumentNotFound) { DocumentProto document = DocumentBuilder() .SetKey("namespace", "uri") .SetSchema("email") @@ -451,7 +468,7 @@ TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetInvalidDocumentId) { +TEST_P(DocumentStoreTest, GetInvalidDocumentId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -478,7 +495,7 @@ TEST_F(DocumentStoreTest, GetInvalidDocumentId) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { +TEST_P(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( 
DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -504,7 +521,7 @@ TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) { +TEST_P(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -533,7 +550,7 @@ TEST_F(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { +TEST_P(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -555,7 +572,7 @@ TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { +TEST_P(DocumentStoreTest, DeleteByNamespaceOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -599,7 +616,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { +TEST_P(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -624,7 +641,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, 
DeleteByNamespaceNoExistingDocumentsNotFound) { +TEST_P(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -645,7 +662,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { +TEST_P(DocumentStoreTest, DeleteByNamespaceRecoversOk) { DocumentProto document1 = test_document1_; document1.set_namespace_("namespace.1"); document1.set_uri("uri1"); @@ -715,7 +732,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeOk) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -802,7 +819,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { IsOkAndHolds(EqualsProto(person_document))); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -828,7 +845,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -846,7 +863,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } 
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -926,7 +943,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, PutDeleteThenPut) { +TEST_P(DocumentStoreTest, PutDeleteThenPut) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -940,7 +957,7 @@ TEST_F(DocumentStoreTest, PutDeleteThenPut) { ICING_EXPECT_OK(doc_store->Put(test_document1_)); } -TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { +TEST_P(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -1034,7 +1051,7 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, OptimizeInto) { +TEST_P(DocumentStoreTest, OptimizeIntoSingleNamespace) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1078,8 +1095,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { // Optimizing into the same directory is not allowed EXPECT_THAT( - doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get()), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("directory is the same"))); @@ -1088,26 +1104,33 @@ TEST_F(DocumentStoreTest, OptimizeInto) { optimized_dir + "/" + DocumentLogCreator::GetDocumentLogFilename(); // Validates that the optimized document log has the same size if nothing is - // deleted + // deleted. Also namespace ids remain the same. 
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), - IsOkAndHolds(ElementsAre(0, 1, 2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result1, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result1.document_id_old_to_new, ElementsAre(0, 1, 2)); + EXPECT_THAT(optimize_result1.namespace_id_old_to_new, ElementsAre(0)); + EXPECT_THAT(optimize_result1.should_rebuild_index, IsFalse()); int64_t optimized_size1 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_EQ(original_size, optimized_size1); // Validates that the optimized document log has a smaller size if something - // is deleted + // is deleted. Namespace ids remain the same. ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1", fake_clock_.GetSystemTimeMilliseconds())); // DocumentId 0 is removed. 
- EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), - IsOkAndHolds(ElementsAre(kInvalidDocumentId, 0, 1))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result2, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result2.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, 0, 1)); + EXPECT_THAT(optimize_result2.namespace_id_old_to_new, ElementsAre(0)); + EXPECT_THAT(optimize_result2.should_rebuild_index, IsFalse()); int64_t optimized_size2 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_THAT(original_size, Gt(optimized_size2)); @@ -1117,14 +1140,17 @@ TEST_F(DocumentStoreTest, OptimizeInto) { fake_clock_.SetSystemTimeMilliseconds(300); // Validates that the optimized document log has a smaller size if something - // expired + // expired. Namespace ids remain the same. ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); // DocumentId 0 is removed, and DocumentId 2 is expired. 
- EXPECT_THAT( - doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), - IsOkAndHolds(ElementsAre(kInvalidDocumentId, 0, kInvalidDocumentId))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result3, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result3.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, 0, kInvalidDocumentId)); + EXPECT_THAT(optimize_result3.namespace_id_old_to_new, ElementsAre(0)); + EXPECT_THAT(optimize_result3.should_rebuild_index, IsFalse()); int64_t optimized_size3 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_THAT(optimized_size2, Gt(optimized_size3)); @@ -1134,17 +1160,229 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); ICING_ASSERT_OK(doc_store->Delete("namespace", "uri2", fake_clock_.GetSystemTimeMilliseconds())); - // DocumentId 0 and 1 is removed, and DocumentId 2 is expired. - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), - IsOkAndHolds(ElementsAre(kInvalidDocumentId, kInvalidDocumentId, - kInvalidDocumentId))); + // DocumentId 0 and 1 is removed, and DocumentId 2 is expired. Since no + // document with the namespace is added into new document store, the namespace + // id will be invalid. 
+ ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result4, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT( + optimize_result4.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, kInvalidDocumentId, kInvalidDocumentId)); + EXPECT_THAT(optimize_result4.namespace_id_old_to_new, + ElementsAre(kInvalidNamespaceId)); + EXPECT_THAT(optimize_result4.should_rebuild_index, IsFalse()); + int64_t optimized_size4 = + filesystem_.GetFileSize(optimized_document_log.c_str()); + EXPECT_THAT(optimized_size3, Gt(optimized_size4)); +} + +TEST_P(DocumentStoreTest, OptimizeIntoMultipleNamespaces) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto document0 = DocumentBuilder() + .SetKey("namespace1", "uri0") + .SetSchema("email") + .SetCreationTimestampMs(100) + .SetTtlMs(1000) + .Build(); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(100) + .SetTtlMs(1000) + .Build(); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("email") + .SetCreationTimestampMs(100) + .SetTtlMs(1000) + .Build(); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace1", "uri3") + .SetSchema("email") + .SetCreationTimestampMs(100) + .SetTtlMs(1000) + .Build(); + + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace3", "uri4") + .SetSchema("email") + .SetCreationTimestampMs(100) + .SetTtlMs(1000) + .Build(); + + // Nothing should have expired yet. 
+ fake_clock_.SetSystemTimeMilliseconds(100); + + ICING_ASSERT_OK(doc_store->Put(document0)); + ICING_ASSERT_OK(doc_store->Put(document1)); + ICING_ASSERT_OK(doc_store->Put(document2)); + ICING_ASSERT_OK(doc_store->Put(document3)); + ICING_ASSERT_OK(doc_store->Put(document4)); + + std::string original_document_log = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); + + int64_t original_size = + filesystem_.GetFileSize(original_document_log.c_str()); + + std::string optimized_dir = document_store_dir_ + "_optimize"; + std::string optimized_document_log = + optimized_dir + "/" + DocumentLogCreator::GetDocumentLogFilename(); + + // Validates that the optimized document log has the same size if nothing is + // deleted. Also namespace ids remain the same. + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result1, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result1.document_id_old_to_new, + ElementsAre(0, 1, 2, 3, 4)); + EXPECT_THAT(optimize_result1.namespace_id_old_to_new, ElementsAre(0, 1, 2)); + EXPECT_THAT(optimize_result1.should_rebuild_index, IsFalse()); + int64_t optimized_size1 = + filesystem_.GetFileSize(optimized_document_log.c_str()); + EXPECT_EQ(original_size, optimized_size1); + + // Validates that the optimized document log has a smaller size if something + // is deleted. + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + // Delete DocumentId 0 with namespace1. 
+ // - Before: ["namespace1#uri0", "namespace1#uri1", "namespace2#uri2", + // "namespace1#uri3", "namespace3#uri4"] + // - After: [nil, "namespace1#uri1", "namespace2#uri2", "namespace1#uri3", + // "namespace3#uri4"] + // In this case, new_doc_store will assign namespace ids in ["namespace1", + // "namespace2", "namespace3"] order. Since new_doc_store has the same order + // of namespace id assignment, namespace ids remain the same. + ICING_ASSERT_OK(doc_store->Delete("namespace1", "uri0", + fake_clock_.GetSystemTimeMilliseconds())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result2, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result2.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, 0, 1, 2, 3)); + EXPECT_THAT(optimize_result2.namespace_id_old_to_new, ElementsAre(0, 1, 2)); + EXPECT_THAT(optimize_result2.should_rebuild_index, IsFalse()); + int64_t optimized_size2 = + filesystem_.GetFileSize(optimized_document_log.c_str()); + EXPECT_THAT(original_size, Gt(optimized_size2)); + + // Validates that the optimized document log has a smaller size if something + // is deleted. + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + // Delete DocumentId 1 with namespace1. + // - Before: [nil, "namespace1#uri1", "namespace2#uri2", "namespace1#uri3", + // "namespace3#uri4"] + // - After: [nil, nil, "namespace2#uri2", "namespace1#uri3", + // "namespace3#uri4"] + // In this case, new_doc_store will assign namespace ids in ["namespace2", + // "namespace1", "namespace3"] order, so namespace_id_old_to_new should + // reflect the change. 
+ ICING_ASSERT_OK(doc_store->Delete("namespace1", "uri1", + fake_clock_.GetSystemTimeMilliseconds())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result3, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result3.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, kInvalidDocumentId, 0, 1, 2)); + EXPECT_THAT(optimize_result3.namespace_id_old_to_new, ElementsAre(1, 0, 2)); + EXPECT_THAT(optimize_result3.should_rebuild_index, IsFalse()); + int64_t optimized_size3 = + filesystem_.GetFileSize(optimized_document_log.c_str()); + EXPECT_THAT(optimized_size2, Gt(optimized_size3)); + + // Validates that the optimized document log has a smaller size if something + // is deleted. + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + // Delete DocumentId 3 with namespace1. + // - Before: [nil, nil, "namespace2#uri2", "namespace1#uri3", + // "namespace3#uri4"] + // - After: [nil, nil, "namespace2#uri2", nil, "namespace3#uri4"] + // In this case, new_doc_store will assign namespace ids in ["namespace2", + // "namespace3"] order and "namespace1" will be never assigned, so + // namespace_id_old_to_new should reflect the change. 
+ ICING_ASSERT_OK(doc_store->Delete("namespace1", "uri3", + fake_clock_.GetSystemTimeMilliseconds())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result4, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result4.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, kInvalidDocumentId, 0, + kInvalidDocumentId, 1)); + EXPECT_THAT(optimize_result4.namespace_id_old_to_new, + ElementsAre(kInvalidNamespaceId, 0, 1)); + EXPECT_THAT(optimize_result4.should_rebuild_index, IsFalse()); int64_t optimized_size4 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_THAT(optimized_size3, Gt(optimized_size4)); + + // Validates that the optimized document log has a smaller size if something + // is deleted. + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + // Delete DocumentId 4 with namespace3. + // - Before: [nil, nil, "namespace2#uri2", nil, "namespace3#uri4"] + // - After: [nil, nil, "namespace2#uri2", nil, nil] + // In this case, new_doc_store will assign namespace ids in ["namespace2"] + // order and "namespace1", "namespace3" will be never assigned, so + // namespace_id_old_to_new should reflect the change. 
+ ICING_ASSERT_OK(doc_store->Delete("namespace3", "uri4", + fake_clock_.GetSystemTimeMilliseconds())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result5, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result5.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, kInvalidDocumentId, 0, + kInvalidDocumentId, kInvalidDocumentId)); + EXPECT_THAT(optimize_result5.namespace_id_old_to_new, + ElementsAre(kInvalidNamespaceId, 0, kInvalidNamespaceId)); + EXPECT_THAT(optimize_result5.should_rebuild_index, IsFalse()); + int64_t optimized_size5 = + filesystem_.GetFileSize(optimized_document_log.c_str()); + EXPECT_THAT(optimized_size4, Gt(optimized_size5)); + + // Validates that the optimized document log has a smaller size if something + // is deleted. + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + // Delete DocumentId 2 with namespace2. + // - Before: [nil, nil, "namespace2#uri2", nil, nil] + // - After: [nil, nil, nil, nil, nil] + // In this case, all documents were deleted, so there will be no namespace ids + // either. namespace_id_old_to_new should reflect the change. 
+ ICING_ASSERT_OK(doc_store->Delete("namespace2", "uri2", + fake_clock_.GetSystemTimeMilliseconds())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result6, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT( + optimize_result6.document_id_old_to_new, + ElementsAre(kInvalidDocumentId, kInvalidDocumentId, kInvalidDocumentId, + kInvalidDocumentId, kInvalidDocumentId)); + EXPECT_THAT(optimize_result6.namespace_id_old_to_new, + ElementsAre(kInvalidNamespaceId, kInvalidNamespaceId, + kInvalidNamespaceId)); + EXPECT_THAT(optimize_result6.should_rebuild_index, IsFalse()); + int64_t optimized_size6 = + filesystem_.GetFileSize(optimized_document_log.c_str()); + EXPECT_THAT(optimized_size5, Gt(optimized_size6)); } -TEST_F(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) { +TEST_P(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1154,12 +1392,16 @@ TEST_F(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) { std::string optimized_dir = document_store_dir_ + "_optimize"; ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), - IsOkAndHolds(IsEmpty())); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::OptimizeResult optimize_result, + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); + EXPECT_THAT(optimize_result.document_id_old_to_new, IsEmpty()); + EXPECT_THAT(optimize_result.namespace_id_old_to_new, IsEmpty()); + EXPECT_THAT(optimize_result.should_rebuild_index, IsFalse()); } -TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { +TEST_P(DocumentStoreTest, ShouldRecoverFromDataLoss) { DocumentId document_id1, document_id2; { // Can put and 
delete fine. @@ -1250,7 +1492,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } -TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { +TEST_P(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { DocumentId document_id1, document_id2; { // Can put and delete fine. @@ -1361,7 +1603,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) { +TEST_P(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) { DocumentId document_id1, document_id2; { // Can put and delete fine. @@ -1459,7 +1701,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { +TEST_P(DocumentStoreTest, ShouldRecoverFromBadChecksum) { DocumentId document_id1, document_id2; { // Can put and delete fine. 
@@ -1537,7 +1779,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } -TEST_F(DocumentStoreTest, GetStorageInfo) { +TEST_P(DocumentStoreTest, GetStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1580,7 +1822,7 @@ TEST_F(DocumentStoreTest, GetStorageInfo) { EXPECT_THAT(doc_store_storage_info.document_store_size(), Eq(-1)); } -TEST_F(DocumentStoreTest, MaxDocumentId) { +TEST_P(DocumentStoreTest, MaxDocumentId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1605,7 +1847,7 @@ TEST_F(DocumentStoreTest, MaxDocumentId) { EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id2)); } -TEST_F(DocumentStoreTest, GetNamespaceId) { +TEST_P(DocumentStoreTest, GetNamespaceId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1638,7 +1880,7 @@ TEST_F(DocumentStoreTest, GetNamespaceId) { EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) { +TEST_P(DocumentStoreTest, GetDuplicateNamespaceId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1658,7 +1900,7 @@ TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) { EXPECT_THAT(doc_store->GetNamespaceId("namespace"), IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) { +TEST_P(DocumentStoreTest, NonexistentNamespaceNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1670,7 +1912,7 @@ TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) { 
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) { +TEST_P(DocumentStoreTest, GetCorpusDuplicateCorpusId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1691,7 +1933,7 @@ TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) { IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, GetCorpusId) { +TEST_P(DocumentStoreTest, GetCorpusId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1726,7 +1968,7 @@ TEST_F(DocumentStoreTest, GetCorpusId) { EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, NonexistentCorpusNotFound) { +TEST_P(DocumentStoreTest, NonexistentCorpusNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1750,7 +1992,7 @@ TEST_F(DocumentStoreTest, NonexistentCorpusNotFound) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } -TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { +TEST_P(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1775,7 +2017,7 @@ TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } -TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreData) { +TEST_P(DocumentStoreTest, GetCorpusAssociatedScoreData) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1813,7 +2055,7 @@ TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreData) { /*num_docs=*/1, /*sum_length_in_tokens=*/5))); } 
-TEST_F(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { +TEST_P(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1825,7 +2067,7 @@ TEST_F(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } -TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { +TEST_P(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1869,7 +2111,7 @@ TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { +TEST_P(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1913,7 +2155,7 @@ TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { +TEST_P(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1925,7 +2167,7 @@ TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { +TEST_P(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ 
-1937,7 +2179,7 @@ TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { /*document_id=*/0, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, DeleteClearsFilterCache) { +TEST_P(DocumentStoreTest, DeleteClearsFilterCache) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1964,7 +2206,7 @@ TEST_F(DocumentStoreTest, DeleteClearsFilterCache) { document_id, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, DeleteClearsScoreCache) { +TEST_P(DocumentStoreTest, DeleteClearsScoreCache) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1993,7 +2235,7 @@ TEST_F(DocumentStoreTest, DeleteClearsScoreCache) { /*length_in_tokens=*/0))); } -TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) { +TEST_P(DocumentStoreTest, DeleteShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2032,7 +2274,7 @@ TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) { document_id, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) { +TEST_P(DocumentStoreTest, ExpirationShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2082,7 +2324,7 @@ TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) { document_id, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, +TEST_P(DocumentStoreTest, ExpirationTimestampIsSumOfNonZeroTtlAndCreationTimestamp) { DocumentProto document = DocumentBuilder() .SetKey("namespace1", "1") @@ -2109,7 +2351,7 @@ TEST_F(DocumentStoreTest, /*expiration_timestamp_ms=*/1100))); } -TEST_F(DocumentStoreTest, 
ExpirationTimestampIsInt64MaxIfTtlIsZero) { +TEST_P(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) { DocumentProto document = DocumentBuilder() .SetKey("namespace1", "1") .SetSchema("email") @@ -2139,7 +2381,7 @@ TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) { /*expiration_timestamp_ms=*/std::numeric_limits<int64_t>::max()))); } -TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) { +TEST_P(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) { DocumentProto document = DocumentBuilder() .SetKey("namespace1", "1") @@ -2170,7 +2412,7 @@ TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) { /*expiration_timestamp_ms=*/std::numeric_limits<int64_t>::max()))); } -TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) { +TEST_P(DocumentStoreTest, CreationTimestampShouldBePopulated) { // Creates a document without a given creation timestamp DocumentProto document_without_creation_timestamp = DocumentBuilder() @@ -2201,7 +2443,7 @@ TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) { Eq(fake_real_time)); } -TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { +TEST_P(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { DocumentProto document1 = DocumentBuilder() .SetKey("icing", "email/1") .SetSchema("email") @@ -2240,7 +2482,7 @@ TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { /*length_in_tokens=*/0))); } -TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { +TEST_P(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2255,7 +2497,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum)); } -TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { +TEST_P(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { 
ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2276,7 +2518,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum)); } -TEST_F(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { +TEST_P(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2292,7 +2534,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { IsOkAndHolds(Not(Eq(checksum)))); } -TEST_F(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { +TEST_P(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2310,7 +2552,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); } -TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { +TEST_P(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { const std::string schema_store_dir = schema_store_dir_ + "_custom"; DocumentId email_document_id; @@ -2445,7 +2687,7 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { Eq(message_expiration_timestamp)); } -TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { +TEST_P(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2541,7 +2783,7 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { EXPECT_THAT(message_data.schema_type_id(), 
Eq(new_message_schema_type_id)); } -TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { +TEST_P(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2617,7 +2859,7 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { IsOkAndHolds(EqualsProto(email_with_subject))); } -TEST_F(DocumentStoreTest, +TEST_P(DocumentStoreTest, UpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -2691,7 +2933,7 @@ TEST_F(DocumentStoreTest, IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { +TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2790,7 +3032,7 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id)); } -TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { +TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2869,7 +3111,7 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { IsOkAndHolds(EqualsProto(email_with_subject))); } -TEST_F(DocumentStoreTest, +TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) { const 
std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -2945,7 +3187,7 @@ TEST_F(DocumentStoreTest, IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, GetOptimizeInfo) { +TEST_P(DocumentStoreTest, GetOptimizeInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2983,8 +3225,7 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) { EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); ICING_ASSERT_OK( - document_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false)); + document_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); document_store.reset(); ICING_ASSERT_OK_AND_ASSIGN( create_result, CreateDocumentStore(&filesystem_, optimized_dir, @@ -2999,7 +3240,7 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) { EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); } -TEST_F(DocumentStoreTest, GetAllNamespaces) { +TEST_P(DocumentStoreTest, GetAllNamespaces) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3071,7 +3312,7 @@ TEST_F(DocumentStoreTest, GetAllNamespaces) { UnorderedElementsAre("namespace1")); } -TEST_F(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) { +TEST_P(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3163,7 +3404,7 @@ TEST_F(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) { 
+TEST_P(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3213,7 +3454,7 @@ TEST_F(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { +TEST_P(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { UsageStore::UsageScores expected_scores; DocumentId document_id; { @@ -3258,7 +3499,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { +TEST_P(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { UsageStore::UsageScores expected_scores; DocumentId document_id; { @@ -3314,7 +3555,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { +TEST_P(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3355,7 +3596,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { +TEST_P(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3390,8 +3631,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { std::string optimized_dir = document_store_dir_ + "/optimize_test"; 
filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()); ICING_ASSERT_OK( - document_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false)); + document_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); // Get optimized document store ICING_ASSERT_OK_AND_ASSIGN( @@ -3409,7 +3649,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, DetectPartialDataLoss) { +TEST_P(DocumentStoreTest, DetectPartialDataLoss) { { // Can put and delete fine. ICING_ASSERT_OK_AND_ASSIGN( @@ -3419,6 +3659,7 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_THAT(create_result.derived_files_regenerated, IsFalse()); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(DocumentProto(test_document1_))); @@ -3447,10 +3688,11 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { schema_store_.get())); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); + EXPECT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); + EXPECT_THAT(create_result.derived_files_regenerated, IsTrue()); } -TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { +TEST_P(DocumentStoreTest, DetectCompleteDataLoss) { int64_t corruptible_offset; const std::string document_log_file = absl_ports::StrCat( document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); @@ -3463,6 +3705,7 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_THAT(create_result.derived_files_regenerated, IsFalse()); // There's some space at the beginning of the file (e.g. 
header, kmagic, // etc) that is necessary to initialize the FileBackedProtoLog. We can't @@ -3512,10 +3755,11 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { schema_store_.get())); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); + EXPECT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); + EXPECT_THAT(create_result.derived_files_regenerated, IsTrue()); } -TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { +TEST_P(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // The directory testdata/score_cache_without_length_in_tokens/document_store // contains only the scoring_cache and the document_store_header (holding the // crc for the scoring_cache). If the current code is compatible with the @@ -3557,18 +3801,23 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { DocumentStore::Create( &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // The document log is using the legacy v0 format so that a migration is // needed, which will also trigger regeneration. - EXPECT_EQ(initialize_stats.document_store_recovery_cause(), - InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT)); + // There should be no data loss, but we still need to regenerate derived files + // since we migrated document log from v0 to v1. 
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_THAT(create_result.derived_files_regenerated, IsTrue()); } -TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { +TEST_P(DocumentStoreTest, DocumentStoreStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3678,7 +3927,7 @@ TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { Eq(0)); } -TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { +TEST_P(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { // Start fresh and set the schema with one type. filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -3768,13 +4017,14 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); @@ -3789,7 +4039,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { } } -TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { +TEST_P(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { // Start fresh and set the schema with one type. 
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -3892,7 +4142,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { } } -TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { +TEST_P(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { // Start fresh and set the schema with one type. filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -3987,13 +4237,14 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { CorruptDocStoreHeaderChecksumFile(); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); @@ -4005,7 +4256,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { } } -TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { +TEST_P(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { // Start fresh and set the schema with one type. 
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -4114,7 +4365,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { } } -TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { +TEST_P(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { // Set up schema. SchemaProto schema = SchemaBuilder() @@ -4182,7 +4433,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { DocumentStore::Create( &filesystem_, document_store_dir, &fake_clock_, schema_store.get(), /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + GetParam().pre_mapping_fbv, GetParam().use_persistent_hash_map, + GetParam().namespace_id_fingerprint, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, &initialize_stats)); std::unique_ptr<DocumentStore> document_store = @@ -4215,8 +4467,10 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { .Build(); // Check that we didn't lose anything. A migration also doesn't technically - // count as a recovery. + // count as data loss, but we still have to regenerate derived files after + // migration. 
EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_THAT(create_result.derived_files_regenerated, IsTrue()); EXPECT_EQ(initialize_stats.document_store_recovery_cause(), InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); @@ -4240,7 +4494,7 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { IsOkAndHolds(EqualsProto(document3))); } -TEST_F(DocumentStoreTest, GetDebugInfo) { +TEST_P(DocumentStoreTest, GetDebugInfo) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder() @@ -4365,7 +4619,7 @@ TEST_F(DocumentStoreTest, GetDebugInfo) { EXPECT_THAT(out3.corpus_info(), IsEmpty()); } -TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { +TEST_P(DocumentStoreTest, GetDebugInfoWithoutSchema) { std::string schema_store_dir = schema_store_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -4389,7 +4643,7 @@ TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { EXPECT_THAT(out.corpus_info(), IsEmpty()); } -TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { +TEST_P(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -4406,6 +4660,238 @@ TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { EXPECT_THAT(out.corpus_info(), IsEmpty()); } +TEST_P(DocumentStoreTest, SwitchKeyMapperTypeShouldRegenerateDerivedFiles) { + std::string dynamic_trie_uri_mapper_dir = + document_store_dir_ + "/key_mapper_dir"; + std::string persistent_hash_map_uri_mapper_dir = + document_store_dir_ + "/uri_mapper"; + DocumentId document_id1; + { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + 
GetParam().namespace_id_fingerprint, + GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(document_id1, doc_store->Put(test_document1_)); + + if (GetParam().use_persistent_hash_map) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } + + // Switch key mapper. We should get I/O error and derived files should be + // regenerated. + { + bool switch_key_mapper_flag = !GetParam().use_persistent_hash_map; + InitializeStatsProto initialize_stats; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + /*use_persistent_hash_map=*/switch_key_mapper_flag, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + &initialize_stats)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + EXPECT_THAT(doc_store->GetDocumentId(test_document1_.namespace_(), + test_document1_.uri()), + IsOkAndHolds(document_id1)); + + if (switch_key_mapper_flag) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + 
filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } +} + +TEST_P(DocumentStoreTest, SameKeyMapperTypeShouldNotRegenerateDerivedFiles) { + std::string dynamic_trie_uri_mapper_dir = + document_store_dir_ + "/key_mapper_dir"; + std::string persistent_hash_map_uri_mapper_dir = + document_store_dir_ + "/uri_mapper"; + DocumentId document_id1; + { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, + GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(document_id1, doc_store->Put(test_document1_)); + + if (GetParam().use_persistent_hash_map) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } + + // Use the same key mapper type. Derived files should not be regenerated. 
+ { + InitializeStatsProto initialize_stats; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, + GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + &initialize_stats)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + EXPECT_THAT(doc_store->GetDocumentId(test_document1_.namespace_(), + test_document1_.uri()), + IsOkAndHolds(document_id1)); + + if (GetParam().use_persistent_hash_map) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } +} + +TEST_P(DocumentStoreTest, GetDocumentIdByNamespaceFingerprintIdentifier) { + std::string dynamic_trie_uri_mapper_dir = + document_store_dir_ + "/key_mapper_dir"; + std::string persistent_hash_map_uri_mapper_dir = + document_store_dir_ + "/uri_mapper"; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + + std::unique_ptr<DocumentStore> doc_store = + 
std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + doc_store->Put(test_document1_)); + + ICING_ASSERT_OK_AND_ASSIGN( + NamespaceId namespace_id, + doc_store->GetNamespaceId(test_document1_.namespace_())); + NamespaceFingerprintIdentifier ns_fingerprint( + namespace_id, + /*target_str=*/test_document1_.uri()); + if (GetParam().namespace_id_fingerprint) { + EXPECT_THAT(doc_store->GetDocumentId(ns_fingerprint), + IsOkAndHolds(document_id)); + + NamespaceFingerprintIdentifier non_existing_ns_fingerprint( + namespace_id + 1, /*target_str=*/test_document1_.uri()); + EXPECT_THAT(doc_store->GetDocumentId(non_existing_ns_fingerprint), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + } else { + EXPECT_THAT(doc_store->GetDocumentId(ns_fingerprint), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + } +} + +INSTANTIATE_TEST_SUITE_P( + DocumentStoreTest, DocumentStoreTest, + testing::Values( + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/true), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/true), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/true), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + 
/*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/true))); + } // namespace } // namespace lib diff --git a/icing/store/namespace-fingerprint-identifier.cc b/icing/store/namespace-fingerprint-identifier.cc new file mode 100644 index 0000000..3910105 --- /dev/null +++ b/icing/store/namespace-fingerprint-identifier.cc @@ -0,0 +1,73 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/store/namespace-fingerprint-identifier.h" + +#include <cstdint> +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/hash/farmhash.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/store/namespace-id.h" +#include "icing/util/encode-util.h" + +namespace icing { +namespace lib { + +/* static */ libtextclassifier3::StatusOr<NamespaceFingerprintIdentifier> +NamespaceFingerprintIdentifier::DecodeFromCString( + std::string_view encoded_cstr) { + if (encoded_cstr.size() < kMinEncodedLength) { + return absl_ports::InvalidArgumentError("Invalid length"); + } + + NamespaceId namespace_id = encode_util::DecodeIntFromCString( + encoded_cstr.substr(0, kEncodedNamespaceIdLength)); + uint64_t fingerprint = encode_util::DecodeIntFromCString( + encoded_cstr.substr(kEncodedNamespaceIdLength)); + return NamespaceFingerprintIdentifier(namespace_id, fingerprint); +} + 
+NamespaceFingerprintIdentifier::NamespaceFingerprintIdentifier( + NamespaceId namespace_id, std::string_view target_str) + : namespace_id_(namespace_id), + fingerprint_(tc3farmhash::Fingerprint64(target_str)) {} + +std::string NamespaceFingerprintIdentifier::EncodeToCString() const { + // encoded_namespace_id_str should be 1 to 3 bytes based on the value of + // namespace_id. + std::string encoded_namespace_id_str = + encode_util::EncodeIntToCString(namespace_id_); + // Make encoded_namespace_id_str to fixed kEncodedNamespaceIdLength bytes. + while (encoded_namespace_id_str.size() < kEncodedNamespaceIdLength) { + // C string cannot contain 0 bytes, so we append it using 1, just like what + // we do in encode_util::EncodeIntToCString. + // + // The reason that this works is because DecodeIntToString decodes a byte + // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded + // namespace id that is less than 3 bytes, it means that the id contains + // unencoded leading 0x00. So here we're explicitly encoding those bytes as + // 0x01. + encoded_namespace_id_str.push_back(1); + } + + return absl_ports::StrCat(encoded_namespace_id_str, + encode_util::EncodeIntToCString(fingerprint_)); +} + +} // namespace lib +} // namespace icing diff --git a/icing/store/namespace-fingerprint-identifier.h b/icing/store/namespace-fingerprint-identifier.h new file mode 100644 index 0000000..d91ef94 --- /dev/null +++ b/icing/store/namespace-fingerprint-identifier.h @@ -0,0 +1,72 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_STORE_NAMESPACE_FINGERPRINT_IDENTIFIER_H_ +#define ICING_STORE_NAMESPACE_FINGERPRINT_IDENTIFIER_H_ + +#include <cstdint> +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/store/namespace-id.h" + +namespace icing { +namespace lib { + +class NamespaceFingerprintIdentifier { + public: + static constexpr int kEncodedNamespaceIdLength = 3; + static constexpr int kMinEncodedLength = kEncodedNamespaceIdLength + 1; + + static libtextclassifier3::StatusOr<NamespaceFingerprintIdentifier> + DecodeFromCString(std::string_view encoded_cstr); + + explicit NamespaceFingerprintIdentifier() + : namespace_id_(0), fingerprint_(0) {} + + explicit NamespaceFingerprintIdentifier(NamespaceId namespace_id, + uint64_t fingerprint) + : namespace_id_(namespace_id), fingerprint_(fingerprint) {} + + explicit NamespaceFingerprintIdentifier(NamespaceId namespace_id, + std::string_view target_str); + + std::string EncodeToCString() const; + + bool operator<(const NamespaceFingerprintIdentifier& other) const { + if (namespace_id_ != other.namespace_id_) { + return namespace_id_ < other.namespace_id_; + } + return fingerprint_ < other.fingerprint_; + } + + bool operator==(const NamespaceFingerprintIdentifier& other) const { + return namespace_id_ == other.namespace_id_ && + fingerprint_ == other.fingerprint_; + } + + NamespaceId namespace_id() const { return namespace_id_; } + uint64_t fingerprint() const { return fingerprint_; } + + private: + NamespaceId namespace_id_; + uint64_t fingerprint_; +} __attribute__((packed)); +static_assert(sizeof(NamespaceFingerprintIdentifier) == 10, ""); + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_NAMESPACE_FINGERPRINT_IDENTIFIER_H_ diff --git a/icing/store/namespace-fingerprint-identifier_test.cc 
b/icing/store/namespace-fingerprint-identifier_test.cc new file mode 100644 index 0000000..5f86156 --- /dev/null +++ b/icing/store/namespace-fingerprint-identifier_test.cc @@ -0,0 +1,148 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/store/namespace-fingerprint-identifier.h" + +#include <cstdint> +#include <limits> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/store/namespace-id.h" +#include "icing/testing/common-matchers.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; + +TEST(NamespaceFingerprintIdentifierTest, EncodeToCString) { + NamespaceFingerprintIdentifier identifier1(/*namespace_id=*/0, + /*fingerprint=*/0); + EXPECT_THAT(identifier1.EncodeToCString(), Eq("\x01\x01\x01\x01")); + + NamespaceFingerprintIdentifier identifier2(/*namespace_id=*/0, + /*fingerprint=*/1); + EXPECT_THAT(identifier2.EncodeToCString(), Eq("\x01\x01\x01\x02")); + + NamespaceFingerprintIdentifier identifier3( + /*namespace_id=*/0, /*fingerprint=*/std::numeric_limits<uint64_t>::max()); + EXPECT_THAT(identifier3.EncodeToCString(), + Eq("\x01\x01\x01\x80\x80\x80\x80\x80\x80\x80\x80\x80\x02")); + + NamespaceFingerprintIdentifier identifier4(/*namespace_id=*/1, + /*fingerprint=*/0); + EXPECT_THAT(identifier4.EncodeToCString(), Eq("\x02\x01\x01\x01")); + + NamespaceFingerprintIdentifier 
identifier5(/*namespace_id=*/1, + /*fingerprint=*/1); + EXPECT_THAT(identifier5.EncodeToCString(), Eq("\x02\x01\x01\x02")); + + NamespaceFingerprintIdentifier identifier6( + /*namespace_id=*/1, /*fingerprint=*/std::numeric_limits<uint64_t>::max()); + EXPECT_THAT(identifier6.EncodeToCString(), + Eq("\x02\x01\x01\x80\x80\x80\x80\x80\x80\x80\x80\x80\x02")); + + NamespaceFingerprintIdentifier identifier7( + /*namespace_id=*/std::numeric_limits<NamespaceId>::max(), + /*fingerprint=*/0); + EXPECT_THAT(identifier7.EncodeToCString(), Eq("\x80\x80\x02\x01")); + + NamespaceFingerprintIdentifier identifier8( + /*namespace_id=*/std::numeric_limits<NamespaceId>::max(), + /*fingerprint=*/1); + EXPECT_THAT(identifier8.EncodeToCString(), Eq("\x80\x80\x02\x02")); + + NamespaceFingerprintIdentifier identifier9( + /*namespace_id=*/std::numeric_limits<NamespaceId>::max(), + /*fingerprint=*/std::numeric_limits<uint64_t>::max()); + EXPECT_THAT(identifier9.EncodeToCString(), + Eq("\x80\x80\x02\x80\x80\x80\x80\x80\x80\x80\x80\x80\x02")); +} + +TEST(NamespaceFingerprintIdentifierTest, + MultipleCStringConversionsAreReversible) { + NamespaceFingerprintIdentifier identifier1(/*namespace_id=*/0, + /*fingerprint=*/0); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier1.EncodeToCString()), + IsOkAndHolds(identifier1)); + + NamespaceFingerprintIdentifier identifier2(/*namespace_id=*/0, + /*fingerprint=*/1); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier2.EncodeToCString()), + IsOkAndHolds(identifier2)); + + NamespaceFingerprintIdentifier identifier3( + /*namespace_id=*/0, /*fingerprint=*/std::numeric_limits<uint64_t>::max()); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier3.EncodeToCString()), + IsOkAndHolds(identifier3)); + + NamespaceFingerprintIdentifier identifier4(/*namespace_id=*/1, + /*fingerprint=*/0); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier4.EncodeToCString()), + 
IsOkAndHolds(identifier4)); + + NamespaceFingerprintIdentifier identifier5(/*namespace_id=*/1, + /*fingerprint=*/1); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier5.EncodeToCString()), + IsOkAndHolds(identifier5)); + + NamespaceFingerprintIdentifier identifier6( + /*namespace_id=*/1, /*fingerprint=*/std::numeric_limits<uint64_t>::max()); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier6.EncodeToCString()), + IsOkAndHolds(identifier6)); + + NamespaceFingerprintIdentifier identifier7( + /*namespace_id=*/std::numeric_limits<NamespaceId>::max(), + /*fingerprint=*/0); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier7.EncodeToCString()), + IsOkAndHolds(identifier7)); + + NamespaceFingerprintIdentifier identifier8( + /*namespace_id=*/std::numeric_limits<NamespaceId>::max(), + /*fingerprint=*/1); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier8.EncodeToCString()), + IsOkAndHolds(identifier8)); + + NamespaceFingerprintIdentifier identifier9( + /*namespace_id=*/std::numeric_limits<NamespaceId>::max(), + /*fingerprint=*/std::numeric_limits<uint64_t>::max()); + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString( + identifier9.EncodeToCString()), + IsOkAndHolds(identifier9)); +} + +TEST(NamespaceFingerprintIdentifierTest, + DecodeFromCStringInvalidLengthShouldReturnError) { + std::string invalid_str = "\x01\x01\x01"; + EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(invalid_str), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc index 2b17f13..07fe2c5 100644 --- a/icing/store/usage-store_test.cc +++ b/icing/store/usage-store_test.cc @@ -154,7 +154,8 @@ TEST_F(UsageStoreTest, AddUsageReportShouldUpdateLastUsedTimestamp) { UsageStore::Create(&filesystem_, test_dir_)); // Report a 
usage with timestamp 5. - usage_store->AddUsageReport(usage_report_time5, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_time5, /*document_id=*/1)); UsageStore::UsageScores expected_scores = CreateUsageScores( /*type1_timestamp=*/5, /*type2_timestamp=*/0, /*type3_timestamp=*/0, /*type1_count=*/1, /*type2_count=*/0, /*type3_count=*/0); @@ -162,13 +163,15 @@ TEST_F(UsageStoreTest, AddUsageReportShouldUpdateLastUsedTimestamp) { IsOkAndHolds(expected_scores)); // Report a usage with timestamp 1. The timestamp won't be updated. - usage_store->AddUsageReport(usage_report_time1, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_time1, /*document_id=*/1)); ++expected_scores.usage_type1_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); // Report a usage with timestamp 10. The timestamp should be updated. - usage_store->AddUsageReport(usage_report_time10, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_time10, /*document_id=*/1)); expected_scores.usage_type1_last_used_timestamp_s = 10; ++expected_scores.usage_type1_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), @@ -188,7 +191,8 @@ TEST_F(UsageStoreTest, AddUsageReportShouldUpdateCounts) { UsageStore::Create(&filesystem_, test_dir_)); // Report a usage with type 1. - usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1)); UsageStore::UsageScores expected_scores = CreateUsageScores( /*type1_timestamp=*/0, /*type2_timestamp=*/0, /*type3_timestamp=*/0, /*type1_count=*/1, /*type2_count=*/0, /*type3_count=*/0); @@ -196,29 +200,34 @@ TEST_F(UsageStoreTest, AddUsageReportShouldUpdateCounts) { EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); // Report another usage with type 1. 
- usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1)); ++expected_scores.usage_type1_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); // Report a usage with type 2. - usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1)); ++expected_scores.usage_type2_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); // Report another usage with type 2. - usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1)); ++expected_scores.usage_type2_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); // Report a usage with type 3. - usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1)); ++expected_scores.usage_type3_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); // Report another usage with type 3. - usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1)); ++expected_scores.usage_type3_count; EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), IsOkAndHolds(expected_scores)); @@ -457,7 +466,7 @@ TEST_F(UsageStoreTest, TimestampInSecondsShouldNotOverflow) { UsageStore::Create(&filesystem_, test_dir_)); // The stored timestamp in seconds should be the max value of uint32. 
- usage_store->AddUsageReport(usage_report, /*document_id=*/1); + ICING_ASSERT_OK(usage_store->AddUsageReport(usage_report, /*document_id=*/1)); UsageStore::UsageScores expected_scores = CreateUsageScores( /*type1_timestamp=*/std::numeric_limits<uint32_t>::max(), /*type2_timestamp=*/0, /*type3_timestamp=*/0, @@ -483,7 +492,7 @@ TEST_F(UsageStoreTest, CountsShouldNotOverflow) { // Report another usage with type 1. UsageReport usage_report = CreateUsageReport( "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1); - usage_store->AddUsageReport(usage_report, /*document_id=*/1); + ICING_ASSERT_OK(usage_store->AddUsageReport(usage_report, /*document_id=*/1)); // usage_type1_count should not change because it's already the max value. EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), @@ -571,7 +580,7 @@ TEST_F(UsageStoreTest, GetElementsFileSize) { UsageReport usage_report = CreateUsageReport( "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); - usage_store->AddUsageReport(usage_report, /*document_id=*/1); + ICING_ASSERT_OK(usage_store->AddUsageReport(usage_report, /*document_id=*/1)); EXPECT_THAT(usage_store->GetElementsFileSize(), IsOkAndHolds(Gt(empty_file_size))); @@ -602,12 +611,13 @@ TEST_F(UsageStoreTest, GetDiskUsageNonEmpty) { UsageReport usage_report = CreateUsageReport( "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); for (int i = 0; i < 200; ++i) { - usage_store->AddUsageReport(usage_report, /*document_id=*/i); + ICING_ASSERT_OK( + usage_store->AddUsageReport(usage_report, /*document_id=*/i)); } // We need to persist since iOS won't see the new disk allocations until after // everything gets written. 
- usage_store->PersistToDisk(); + ICING_ASSERT_OK(usage_store->PersistToDisk()); EXPECT_THAT(usage_store->GetDiskUsage(), IsOkAndHolds(Gt(empty_disk_usage))); } diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h index bbc1a59..7d8e0cb 100644 --- a/icing/testing/common-matchers.h +++ b/icing/testing/common-matchers.h @@ -29,6 +29,7 @@ #include "icing/index/hit/doc-hit-info.h" #include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/portable/equals-proto.h" #include "icing/proto/search.pb.h" @@ -70,6 +71,39 @@ MATCHER_P2(EqualsDocHitInfo, document_id, section_ids, "") { actual.hit_section_ids_mask() == section_mask; } +// Used to match a DocHitInfoIterator::CallStats +MATCHER_P5(EqualsDocHitInfoIteratorCallStats, num_leaf_advance_calls_lite_index, + num_leaf_advance_calls_main_index, + num_leaf_advance_calls_integer_index, + num_leaf_advance_calls_no_index, num_blocks_inspected, "") { + const DocHitInfoIterator::CallStats& actual = arg; + *result_listener << IcingStringUtil::StringPrintf( + "(actual is {num_leaf_advance_calls_lite_index=%d, " + "num_leaf_advance_calls_main_index=%d, " + "num_leaf_advance_calls_integer_index=%d, " + "num_leaf_advance_calls_no_index=%d, num_blocks_inspected=%d}, but " + "expected was {num_leaf_advance_calls_lite_index=%d, " + "num_leaf_advance_calls_main_index=%d, " + "num_leaf_advance_calls_integer_index=%d, " + "num_leaf_advance_calls_no_index=%d, num_blocks_inspected=%d}.)", + actual.num_leaf_advance_calls_lite_index, + actual.num_leaf_advance_calls_main_index, + actual.num_leaf_advance_calls_integer_index, + actual.num_leaf_advance_calls_no_index, actual.num_blocks_inspected, + num_leaf_advance_calls_lite_index, num_leaf_advance_calls_main_index, + num_leaf_advance_calls_integer_index, num_leaf_advance_calls_no_index, + num_blocks_inspected); 
+ return actual.num_leaf_advance_calls_lite_index == + num_leaf_advance_calls_lite_index && + actual.num_leaf_advance_calls_main_index == + num_leaf_advance_calls_main_index && + actual.num_leaf_advance_calls_integer_index == + num_leaf_advance_calls_integer_index && + actual.num_leaf_advance_calls_no_index == + num_leaf_advance_calls_no_index && + actual.num_blocks_inspected == num_blocks_inspected; +} + struct ExtractTermFrequenciesResult { std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies = {0}; SectionIdMask section_mask = kSectionIdMaskNone; @@ -241,7 +275,9 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { actual.schema_types_changed_fully_compatible_by_name == expected.schema_types_changed_fully_compatible_by_name && actual.schema_types_index_incompatible_by_name == - expected.schema_types_index_incompatible_by_name) { + expected.schema_types_index_incompatible_by_name && + actual.schema_types_join_incompatible_by_name == + expected.schema_types_join_incompatible_by_name) { return true; } @@ -338,6 +374,21 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { ","), "]"); + // Format schema_types_join_incompatible_by_name + std::string actual_schema_types_join_incompatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin(actual.schema_types_join_incompatible_by_name, + ","), + "]"); + + std::string expected_schema_types_join_incompatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin(expected.schema_types_join_incompatible_by_name, + ","), + "]"); + *result_listener << IcingStringUtil::StringPrintf( "\nExpected {\n" "\tsuccess=%d,\n" @@ -347,8 +398,9 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" "\tschema_types_new_by_name=%s,\n" - "\tschema_types_index_incompatible_by_name=%s,\n" "\tschema_types_changed_fully_compatible_by_name=%s\n" + "\tschema_types_index_incompatible_by_name=%s,\n" + 
"\tschema_types_join_incompatible_by_name=%s\n" "}\n" "Actual {\n" "\tsuccess=%d,\n" @@ -358,8 +410,9 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" "\tschema_types_new_by_name=%s,\n" - "\tschema_types_index_incompatible_by_name=%s,\n" "\tschema_types_changed_fully_compatible_by_name=%s\n" + "\tschema_types_index_incompatible_by_name=%s,\n" + "\tschema_types_join_incompatible_by_name=%s\n" "}\n", expected.success, expected_old_schema_type_ids_changed.c_str(), expected_schema_types_deleted_by_name.c_str(), @@ -368,7 +421,8 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { expected_schema_types_incompatible_by_id.c_str(), expected_schema_types_new_by_name.c_str(), expected_schema_types_changed_fully_compatible_by_name.c_str(), - expected_schema_types_index_incompatible_by_name.c_str(), actual.success, + expected_schema_types_index_incompatible_by_name.c_str(), + expected_schema_types_join_incompatible_by_name.c_str(), actual.success, actual_old_schema_type_ids_changed.c_str(), actual_schema_types_deleted_by_name.c_str(), actual_schema_types_deleted_by_id.c_str(), @@ -376,7 +430,8 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { actual_schema_types_incompatible_by_id.c_str(), actual_schema_types_new_by_name.c_str(), actual_schema_types_changed_fully_compatible_by_name.c_str(), - actual_schema_types_index_incompatible_by_name.c_str()); + actual_schema_types_index_incompatible_by_name.c_str(), + actual_schema_types_join_incompatible_by_name.c_str()); return false; } diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc index 8314e91..0e400e2 100644 --- a/icing/tokenization/combined-tokenizer_test.cc +++ b/icing/tokenization/combined-tokenizer_test.cc @@ -178,7 +178,7 @@ TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) { CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, lang_segmenter_.get())); - if 
(IsIcu72PlusTokenization()) { + if (GetIcuTokenizationVersion() >= 72) { // In ICU 72+ and above, ':' are no longer considered word connectors. The // query tokenizer should still consider them to be property restricts. constexpr std::string_view kText = "foo:bar"; diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 3bacbc6..a7f7419 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -296,12 +296,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { // 2. '@' became a word connector // 3. <numeric><word-connector><numeric> such as "3'14" is now considered as // a single token. - if (IsIcu72PlusTokenization()) { + if (GetIcuTokenizationVersion() >= 72) { EXPECT_THAT( language_segmenter->GetAllTerms("com:google:android"), IsOkAndHolds(ElementsAre("com", ":", "google", ":", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), - IsOkAndHolds(ElementsAre("com@google@android"))); + // In ICU 74, the rules for '@' were reverted. + if (GetIcuTokenizationVersion() >= 74) { + EXPECT_THAT( + language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); + } else { + EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com@google@android"))); + } EXPECT_THAT(language_segmenter->GetAllTerms("3'14"), IsOkAndHolds(ElementsAre("3'14"))); } else { diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index a00f2f7..39cc0ed 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -349,7 +349,7 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { // connector pre-ICU 72. For ICU 72 and above, it's no longer considered a // connector. 
// TODO(b/254874614): Handle colon word breaks in ICU 72+ - if (IsIcu72PlusTokenization()) { + if (GetIcuTokenizationVersion() >= 72) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"), IsOkAndHolds(ElementsAre( EqualsToken(Token::Type::QUERY_PROPERTY, "property"), diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc index f32e541..58d4956 100644 --- a/icing/transform/icu/icu-normalizer.cc +++ b/icing/transform/icu/icu-normalizer.cc @@ -50,6 +50,7 @@ constexpr UChar kTransformRulesUtf16[] = "Latin-ASCII; " // Map Latin characters to ASCII characters "Hiragana-Katakana; " // Map hiragana to katakana "[:Latin:] NFD; " // Decompose Latin letters + "[:Greek:] NFD; " // Decompose Greek letters "[:Nonspacing Mark:] Remove; " // Remove accent / diacritic marks "NFKC"; // Decompose and compose everything diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index 7c64506..f6f2b78 100644 --- a/icing/transform/icu/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -33,7 +33,8 @@ namespace lib { // 2. Transforms full-width Latin characters to ASCII characters if possible. // 3. Transforms hiragana to katakana. // 4. Removes accent / diacritic marks on Latin characters -// 5. Normalized text must be less than or equal to max_term_byte_size, +// 5. Removes accent / diacritic marks on Greek characters +// 6. Normalized text must be less than or equal to max_term_byte_size, // otherwise it will be truncated. 
// // There're some other rules from ICU not listed here, please see .cc file for diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc index fe8289a..89d5f1e 100644 --- a/icing/transform/icu/icu-normalizer_benchmark.cc +++ b/icing/transform/icu/icu-normalizer_benchmark.cc @@ -39,8 +39,8 @@ // blaze-bin/icing/transform/icu/icu-normalizer_benchmark // /data/local/tmp/ // -// $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmark_filter=all -// --adb +// $ adb shell /data/local/tmp/icu-normalizer_benchmark +// --benchmark_filter=all --adb // Flag to tell the benchmark that it'll be run on an Android device via adb, // the benchmark will set up data files accordingly. @@ -61,7 +61,6 @@ void BM_NormalizeUppercase(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string(state.range(0), 'A'); @@ -95,7 +94,6 @@ void BM_NormalizeAccent(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -123,7 +121,7 @@ BENCHMARK(BM_NormalizeAccent) ->Arg(2048000) ->Arg(4096000); -void BM_NormalizeHiragana(benchmark::State& state) { +void BM_NormalizeGreekAccent(benchmark::State& state) { bool run_via_adb = absl::GetFlag(FLAGS_adb); if (!run_via_adb) { ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( @@ -133,7 +131,43 @@ void BM_NormalizeHiragana(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + while (input_string.length() < state.range(0)) { + input_string.append("άὰᾶἀἄ"); + } + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + 
} +} +BENCHMARK(BM_NormalizeGreekAccent) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_NormalizeHiragana(benchmark::State& state) { + bool run_via_adb = absl::GetFlag(FLAGS_adb); + if (!run_via_adb) { + ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -171,7 +205,6 @@ void BM_UppercaseSubTokenLength(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string(state.range(0), 'A'); @@ -207,7 +240,6 @@ void BM_AccentSubTokenLength(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -248,7 +280,6 @@ void BM_HiraganaSubTokenLength(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc index 719f7be..0df23fc 100644 --- a/icing/transform/icu/icu-normalizer_test.cc +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -83,14 +83,12 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff")); EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); - EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), - 
Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh")); EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), Eq("iiiiiiiiiiiiiiiii")); EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj")); EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); - EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), - Eq("lllllllllllll")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll")); EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), Eq("nnnnnnnnnnnnnnnn")); @@ -109,19 +107,38 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy")); - EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), - Eq("zzzzzzzzzzzz")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz")); EXPECT_THAT(normalizer_->NormalizeTerm("Barış"), Eq("baris")); } +TEST_F(IcuNormalizerTest, GreekLetterRemoveAccent) { + EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημερα")); + EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφη")); + EXPECT_THAT(normalizer_->NormalizeTerm( + "ἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ"), + Eq("αααααααααααααααααααααααααααααααααααααααααααααα")); + EXPECT_THAT(normalizer_->NormalizeTerm("ἘἙἚἛἜἝῈΈἐἑἒἓἔἕὲέ"), + Eq("εεεεεεεεεεεεεεεε")); + EXPECT_THAT( + normalizer_->NormalizeTerm("ἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ"), + Eq("ηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηη")); + EXPECT_THAT(normalizer_->NormalizeTerm("ἸἹἺἻἼἽἾἿῘῙῚΊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ"), + Eq("ιιιιιιιιιιιιιιιιιιιιιιιιιιιι")); + EXPECT_THAT(normalizer_->NormalizeTerm("ὈὉὊὋὌὍῸΌὀὁὂὃὄὅὸό"), + Eq("οοοοοοοοοοοοοοοο")); + EXPECT_THAT(normalizer_->NormalizeTerm("ὙὛὝὟῨῩῪΎὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ"), + 
Eq("υυυυυυυυυυυυυυυυυυυυυυυυ")); + EXPECT_THAT( + normalizer_->NormalizeTerm("ὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"), + Eq("ωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωω")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ῥῤῥ"), Eq("ρρρ")); +} + // Accent / diacritic marks won't be removed in non-latin chars, e.g. in -// Japanese and Greek +// Japanese TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { // Katakana EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); - // Greek - EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); - EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); // Our current ICU rules can't handle Hebrew properly, e.g. the accents in // "אָלֶף־בֵּית עִבְרִי" @@ -287,6 +304,27 @@ TEST_F(IcuNormalizerTest, PrefixMatchLength) { term = "ÀĄḁáIcing"; match_end = normalizer->FindNormalizedMatchEndPosition(term, "aaaa"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ÀĄḁá")); + + // Greek accents + term = "άνθρωπος"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ")); + + term = "καλημέρα"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημε"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ")); + + term = "όχι"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ")); + + term = "πότε"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ")); + + term = "ἈἉἊἋIcing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "αααα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ")); } TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { @@ -340,6 +378,27 @@ TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { term = "BarışIcing"; match_end = normalizer->FindNormalizedMatchEndPosition(term, 
"barismdi"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış")); + + // Greek accents + term = "άνθρωπος"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθν"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ")); + + term = "καλημέρα"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημεος"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ")); + + term = "όχι"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχκα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ")); + + term = "πότε"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτρα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ")); + + term = "ἈἉἊἋIcing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ααααmdi"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ")); } } // namespace diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc index 9d5fea7..e0880ea 100644 --- a/icing/util/document-validator.cc +++ b/icing/util/document-validator.cc @@ -172,7 +172,7 @@ libtextclassifier3::Status DocumentValidator::Validate( } } if (num_required_properties_actual < - parsed_property_configs.num_required_properties) { + parsed_property_configs.required_properties.size()) { return absl_ports::InvalidArgumentError( absl_ports::StrCat("One or more required fields missing for key: (", document.namespace_(), ", ", document.uri(), ").")); diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index ec327ad..ada9ef2 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -38,7 +38,7 @@ namespace { // (https://www.fileformat.info/info/unicode/category/index.htm). The set of // characters that are regarded as punctuation is not the same for std::ispunct // and u_ispunct. 
-const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])"; +constexpr std::string_view kAsciiIcuPunctuation = "!\"#%&'*,./:;?@\\_-([{}])"; } // namespace @@ -129,7 +129,7 @@ bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { if (char_len_out != nullptr) { *char_len_out = 1; } - return ascii_icu_punctuation.find(input[position]) != std::string::npos; + return kAsciiIcuPunctuation.find(input[position]) != std::string_view::npos; } UChar32 c = GetUChar32At(input.data(), input.length(), position); if (char_len_out != nullptr) { diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java index 47b94a5..79fcdb8 100644 --- a/java/src/com/google/android/icing/IcingSearchEngine.java +++ b/java/src/com/google/android/icing/IcingSearchEngine.java @@ -77,6 +77,7 @@ public class IcingSearchEngine implements IcingSearchEngineInterface { icingSearchEngineImpl.close(); } + @SuppressWarnings("deprecation") @Override protected void finalize() throws Throwable { icingSearchEngineImpl.close(); diff --git a/java/src/com/google/android/icing/IcingSearchEngineImpl.java b/java/src/com/google/android/icing/IcingSearchEngineImpl.java index 8e79a88..57744c4 100644 --- a/java/src/com/google/android/icing/IcingSearchEngineImpl.java +++ b/java/src/com/google/android/icing/IcingSearchEngineImpl.java @@ -71,6 +71,7 @@ public class IcingSearchEngineImpl implements Closeable { closed = true; } + @SuppressWarnings("deprecation") @Override protected void finalize() throws Throwable { close(); diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java index 1ed2d9a..2bbd621 100644 --- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java +++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java @@ -139,26 +139,6 
@@ public final class IcingSearchEngineTest { } @Test - public void testSetAndGetSchema() throws Exception { - assertStatusOk(icingSearchEngine.initialize().getStatus()); - - SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig(); - SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build(); - SetSchemaResultProto setSchemaResultProto = - icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false); - assertStatusOk(setSchemaResultProto.getStatus()); - - GetSchemaResultProto getSchemaResultProto = icingSearchEngine.getSchema(); - assertStatusOk(getSchemaResultProto.getStatus()); - assertThat(getSchemaResultProto.getSchema()).isEqualTo(schema); - - GetSchemaTypeResultProto getSchemaTypeResultProto = - icingSearchEngine.getSchemaType(emailTypeConfig.getSchemaType()); - assertStatusOk(getSchemaTypeResultProto.getStatus()); - assertThat(getSchemaTypeResultProto.getSchemaTypeConfig()).isEqualTo(emailTypeConfig); - } - - @Test public void testPutAndGetDocuments() throws Exception { assertStatusOk(icingSearchEngine.initialize().getStatus()); diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index d4b1aee..9dd9e88 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -23,7 +23,7 @@ option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; option objc_class_prefix = "ICNG"; -// Next tag: 11 +// Next tag: 16 message IcingSearchEngineOptions { // Directory to persist files for Icing. Required. // If Icing was previously initialized with this directory, it will reload @@ -104,6 +104,38 @@ message IcingSearchEngineOptions { // to dynamic trie key mapper). optional bool use_persistent_hash_map = 10; + // Integer index bucket split threshold. + optional int32 integer_index_bucket_split_threshold = 11 [default = 65536]; + + // Whether Icing should sort and merge its lite index HitBuffer unsorted tail + // at indexing time. 
+ // + // If set to true, the HitBuffer will be sorted at indexing time after + // exceeding the sort threshold. If false, the HifBuffer will be sorted at + // querying time, before the first query after inserting new elements into the + // HitBuffer. + // + // The default value is false. + optional bool lite_index_sort_at_indexing = 12; + + // Size (in bytes) at which Icing's lite index should sort and merge the + // HitBuffer's unsorted tail into the sorted head for sorting at indexing + // time. Size specified here is the maximum byte size to allow for the + // unsorted tail section. + // + // Setting a lower sort size reduces querying latency at the expense of + // indexing latency. + optional int32 lite_index_sort_size = 13 [default = 8192]; // 8 KiB + + optional bool use_new_qualified_id_join_index = 14; + + // Whether to build the metadata hits used for property existence check, which + // is required to support the hasProperty function in advanced query. + // + // TODO(b/309826655): Implement the feature flag derived files rebuild + // mechanism to handle index rebuild, instead of using index's magic value. + optional bool build_property_existence_metadata_hits = 15; + reserved 2; } diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index ca795cd..fcedeed 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -52,6 +52,9 @@ message InitializeStatsProto { // The current code version is different from existing data version. VERSION_CHANGED = 6; + + // Any dependencies have changed. + DEPENDENCIES_CHANGED = 7; } // Possible recovery causes for document store: @@ -76,7 +79,7 @@ message InitializeStatsProto { // Time used to restore the index. optional int32 index_restoration_latency_ms = 6; - // Time used to restore the index. + // Time used to restore the schema store. optional int32 schema_store_recovery_latency_ms = 7; // Status regarding how much data is lost during the initialization. 
@@ -117,7 +120,7 @@ message InitializeStatsProto { } // Stats of the top-level function IcingSearchEngine::Put(). -// Next tag: 10 +// Next tag: 12 message PutDocumentStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -138,12 +141,17 @@ message PutDocumentStatsProto { // Number of tokens added to the index. optional int32 num_tokens_indexed = 1; + // Number of metadata tokens added to the index, which can only be added by + // PropertyExistenceIndexingHandler currently. + optional int32 num_metadata_tokens_indexed = 3; + reserved 2; } optional TokenizationStats tokenization_stats = 6; - // Time used to index all indexable string terms in the document. It does not - // include the time to merge indices. + // Time used to index all indexable string terms and property existence + // metadata terms in the document. It does not include the time to merge + // indices or the time to sort the lite index. optional int32 term_index_latency_ms = 7; // Time used to index all indexable integers in the document. @@ -151,24 +159,36 @@ message PutDocumentStatsProto { // Time used to index all qualified id join strings in the document. optional int32 qualified_id_join_index_latency_ms = 9; + + // Time used to sort the LiteIndex's HitBuffer. + optional int32 lite_index_sort_latency_ms = 10; + + // Time used to index all metadata terms in the document, which can only be + // added by PropertyExistenceIndexingHandler currently. + optional int32 metadata_term_index_latency_ms = 11; } // Stats of the top-level function IcingSearchEngine::Search() and // IcingSearchEngine::GetNextPage(). -// Next tag: 23 +// Next tag: 26 message QueryStatsProto { + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // The UTF-8 length of the query string optional int32 query_length = 16; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Number of terms in the query string. 
optional int32 num_terms = 1; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Number of namespaces filtered. optional int32 num_namespaces_filtered = 2; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Number of schema types filtered. optional int32 num_schema_types_filtered = 3; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Strategy of scoring and ranking. optional ScoringSpecProto.RankingStrategy.Code ranking_strategy = 4; @@ -183,6 +203,7 @@ message QueryStatsProto { // The actual number of results returned in the current page. optional int32 num_results_returned_current_page = 7; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Number of documents scored. optional int32 num_documents_scored = 8; @@ -192,10 +213,12 @@ message QueryStatsProto { // Overall time used for the function call. optional int32 latency_ms = 10; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Time used to parse the query, including 2 parts: tokenizing and // transforming tokens into an iterator tree. optional int32 parse_query_latency_ms = 11; + // TODO(b/305098009): deprecate. Use parent_search_stats instead. // Time used to score the raw results. optional int32 scoring_latency_ms = 12; @@ -225,6 +248,56 @@ message QueryStatsProto { // Number of documents scored. optional int32 num_joined_results_returned_current_page = 22; + // Whether it contains join query or not. + optional bool is_join_query = 23; + + // Stats of the search. Only valid for first page. + // Next tag: 13 + message SearchStats { + // The UTF-8 length of the query string + optional int32 query_length = 1; + + // Number of terms in the query string. + optional int32 num_terms = 2; + + // Number of namespaces filtered. + optional int32 num_namespaces_filtered = 3; + + // Number of schema types filtered. + optional int32 num_schema_types_filtered = 4; + + // Strategy of scoring and ranking. 
+ optional ScoringSpecProto.RankingStrategy.Code ranking_strategy = 5; + + // Number of documents scored. + optional int32 num_documents_scored = 6; + + // Time used to parse the query, including 2 parts: tokenizing and + // transforming tokens into an iterator tree. + optional int32 parse_query_latency_ms = 7; + + // Time used to score the raw results. + optional int32 scoring_latency_ms = 8; + + // Whether it contains numeric query or not. + optional bool is_numeric_query = 9; + + // Number of hits fetched by lite index before applying any filters. + optional int32 num_fetched_hits_lite_index = 10; + + // Number of hits fetched by main index before applying any filters. + optional int32 num_fetched_hits_main_index = 11; + + // Number of hits fetched by integer index before applying any filters. + optional int32 num_fetched_hits_integer_index = 12; + } + + // Search stats for parent. Only valid for first page. + optional SearchStats parent_search_stats = 24; + + // Search stats for child. + optional SearchStats child_search_stats = 25; + reserved 9; } diff --git a/proto/icing/proto/optimize.proto b/proto/icing/proto/optimize.proto index 0accb9a..675f980 100644 --- a/proto/icing/proto/optimize.proto +++ b/proto/icing/proto/optimize.proto @@ -63,7 +63,7 @@ message GetOptimizeInfoResultProto { optional int64 time_since_last_optimize_ms = 4; } -// Next tag: 11 +// Next tag: 13 message OptimizeStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -102,4 +102,10 @@ message OptimizeStatsProto { FULL_INDEX_REBUILD = 1; } optional IndexRestorationMode index_restoration_mode = 10; + + // Number of namespaces before the optimization. + optional int32 num_original_namespaces = 11; + + // Number of namespaces deleted. 
+ optional int32 num_deleted_namespaces = 12; } diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index b972ece..c716dba 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -138,15 +138,22 @@ message StringIndexingConfig { } // Describes how a document property should be indexed. -// Next tag: 2 +// Next tag: 3 message DocumentIndexingConfig { // OPTIONAL: Whether nested properties within the document property should be - // indexed. If true, then the nested properties will be indexed according to + // indexed. If true, then all nested properties will be indexed according to // the property's own indexing configurations. If false, nested documents' // properties will not be indexed even if they have an indexing configuration. // // The default value is false. optional bool index_nested_properties = 1; + + // List of nested properties within the document to index. Only the + // provided list of properties will be indexed according to the property's + // indexing configurations. + // + // index_nested_properties must be false in order to use this feature. + repeated string indexable_nested_properties_list = 2; } // Describes how a int64 property should be indexed. diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index fca669a..7f4fb3e 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -27,7 +27,7 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Client-supplied specifications on what documents to retrieve. -// Next tag: 10 +// Next tag: 11 message SearchSpecProto { // REQUIRED: The "raw" query string that users may type. For example, "cat" // will search for documents with the term cat in it. @@ -102,11 +102,21 @@ message SearchSpecProto { // Finer-grained locks are implemented around code paths that write changes to // Icing during Search. 
optional bool use_read_only_search = 9 [default = true]; + + // TODO(b/294266822): Handle multiple property filter lists for same schema + // type. + // How to specify a subset of properties to be searched. If no type property + // filter has been specified for a schema type (no TypePropertyMask for the + // given schema type), then *all* properties of that schema type will be + // searched. If an empty property filter is specified for a given schema type + // (TypePropertyMask for the given schema type has empty paths field), no + // properties of that schema type will be searched. + repeated TypePropertyMask type_property_filters = 10; } // Client-supplied specifications on what to include/how to format the search // results. -// Next tag: 9 +// Next tag: 10 message ResultSpecProto { // The results will be returned in pages, and num_per_page specifies the // number of documents in one page. @@ -211,6 +221,18 @@ message ResultSpecProto { // The max # of child documents will be attached and returned in the result // for each parent. It is only used for join API. optional int32 max_joined_children_per_parent_to_return = 8; + + // The max # of results being scored and ranked. + // Running time of ScoringProcessor and Ranker is O(num_to_score) according to + // results of //icing/scoring:score-and-rank_benchmark. Note that + // the process includes scoring, building a heap, and popping results from the + // heap. + // + // 30000 results can be scored and ranked within 3 ms on a Pixel 3 XL + // according to results of + // //icing/scoring:score-and-rank_benchmark, so set it as the + // default value. + optional int32 num_to_score = 9 [default = 30000]; } // The representation of a single match within a DocumentProto property. 
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index afb1234..dd08fd1 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=537223436) +set(synced_AOSP_CL_number=587883838) |