Diffstat (limited to 'icing/store/document-store.cc')
-rw-r--r--  icing/store/document-store.cc  158
1 file changed, 109 insertions, 49 deletions
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 30de410..094eea1 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -53,6 +53,7 @@
#include "icing/store/document-id.h"
#include "icing/store/document-log-creator.h"
#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
#include "icing/store/namespace-id.h"
#include "icing/store/persistent-hash-map-key-mapper.h"
#include "icing/store/usage-store.h"
@@ -142,25 +143,6 @@ std::string MakeCorpusMapperFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
}
-// This function will encode a namespace id into a fixed 3 bytes string.
-std::string EncodeNamespaceId(NamespaceId namespace_id) {
- // encoding should be 1 to 3 bytes based on the value of namespace_id.
- std::string encoding = encode_util::EncodeIntToCString(namespace_id);
- // Make encoding to fixed 3 bytes.
- while (encoding.size() < 3) {
- // DynamicTrie cannot handle keys with 0 as bytes, so we append it using 1,
- // just like what we do in encode_util::EncodeIntToCString.
- //
- // The reason that this works is because DecodeIntToString decodes a byte
- value of 0x01 as 0x00. When EncodeIntToCString returns a namespace id
- // encoding that is less than 3 bytes, it means that the id contains
- // unencoded leading 0x00. So here we're explicitly encoding those bytes as
- // 0x01.
- encoding.push_back(1);
- }
- return encoding;
-}
-
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
int64_t ttl_ms) {
if (ttl_ms == 0) {
@@ -269,9 +251,8 @@ std::string DocumentStore::MakeFingerprint(
absl_ports::StrCat(namespace_, uri_or_schema));
return fingerprint_util::GetFingerprintString(fprint);
} else {
- return absl_ports::StrCat(EncodeNamespaceId(namespace_id),
- encode_util::EncodeIntToCString(
- tc3farmhash::Fingerprint64(uri_or_schema)));
+ return NamespaceFingerprintIdentifier(namespace_id, uri_or_schema)
+ .EncodeToCString();
}
}
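For illustration only, a minimal sketch of the new key-construction path (the NamespaceFingerprintIdentifier constructor and EncodeToCString() are taken from this diff; the wrapper function itself is hypothetical):

    // Hypothetical wrapper, sketching how a namespace-id-based document key is
    // now built: the helper class replaces the removed EncodeNamespaceId()
    // plus farmhash concatenation.
    std::string MakeNewStyleKey(NamespaceId namespace_id,
                                const std::string& uri_or_schema) {
      return NamespaceFingerprintIdentifier(namespace_id, uri_or_schema)
          .EncodeToCString();
    }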
@@ -328,13 +309,15 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
filesystem, base_dir, clock, schema_store, namespace_id_fingerprint,
pre_mapping_fbv, use_persistent_hash_map, compression_level));
ICING_ASSIGN_OR_RETURN(
- DataLoss data_loss,
+ InitializeResult initialize_result,
document_store->Initialize(force_recovery_and_revalidate_documents,
initialize_stats));
CreateResult create_result;
create_result.document_store = std::move(document_store);
- create_result.data_loss = data_loss;
+ create_result.data_loss = initialize_result.data_loss;
+ create_result.derived_files_regenerated =
+ initialize_result.derived_files_regenerated;
return create_result;
}
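A hedged sketch of how a caller might react to the new derived_files_regenerated field (only the CreateResult fields are taken from this diff; the elided Create() arguments and ScheduleIndexRestoration() are hypothetical, and the DataLoss::NONE comparison assumes the existing DataLoss enum):

    // Sketch: when DocumentStore had to regenerate its derived files, dependent
    // structures keyed off document state (e.g. the index) may also need repair.
    ICING_ASSIGN_OR_RETURN(DocumentStore::CreateResult create_result,
                           DocumentStore::Create(/* ...args elided... */));
    std::unique_ptr<DocumentStore> document_store =
        std::move(create_result.document_store);
    if (create_result.derived_files_regenerated ||
        create_result.data_loss != DataLoss::NONE) {
      ScheduleIndexRestoration();  // Hypothetical caller-side reaction.
    }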
@@ -380,9 +363,9 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
- bool force_recovery_and_revalidate_documents,
- InitializeStatsProto* initialize_stats) {
+libtextclassifier3::StatusOr<DocumentStore::InitializeResult>
+DocumentStore::Initialize(bool force_recovery_and_revalidate_documents,
+ InitializeStatsProto* initialize_stats) {
auto create_result_or =
DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
@@ -400,6 +383,7 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
InitializeStatsProto::RecoveryCause recovery_cause =
GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
+ bool derived_files_regenerated = false;
if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
<< recovery_cause << ", and create result { new_file="
@@ -416,16 +400,18 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status =
RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
- if (initialize_stats != nullptr &&
- recovery_cause != InitializeStatsProto::NONE) {
+ if (recovery_cause != InitializeStatsProto::NONE) {
// Only consider it a recovery if the client forced a recovery or there
// was data loss. Otherwise, this could just be the first time we're
// initializing and generating derived files.
- initialize_stats->set_document_store_recovery_latency_ms(
- document_recovery_timer->GetElapsedMilliseconds());
- initialize_stats->set_document_store_recovery_cause(recovery_cause);
- initialize_stats->set_document_store_data_status(
- GetDataStatus(create_result.log_create_result.data_loss));
+ derived_files_regenerated = true;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_latency_ms(
+ document_recovery_timer->GetElapsedMilliseconds());
+ initialize_stats->set_document_store_recovery_cause(recovery_cause);
+ initialize_stats->set_document_store_data_status(
+ GetDataStatus(create_result.log_create_result.data_loss));
+ }
}
if (!status.ok()) {
ICING_LOG(ERROR)
@@ -438,6 +424,7 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for DocumentStore.";
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
+ derived_files_regenerated = true;
libtextclassifier3::Status status = RegenerateDerivedFiles(
/*force_recovery_and_revalidate_documents=*/false);
if (initialize_stats != nullptr) {
@@ -459,7 +446,10 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
initialize_stats->set_num_documents(document_id_mapper_->num_elements());
}
- return create_result.log_create_result.data_loss;
+ InitializeResult initialize_result = {
+ .data_loss = create_result.log_create_result.data_loss,
+ .derived_files_regenerated = derived_files_regenerated};
+ return initialize_result;
}
libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
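The InitializeResult type is declared in document-store.h rather than in this file; from the designated initializers above it presumably has roughly this shape (an assumption, not shown in this diff):

    // Assumed shape, inferred from the .data_loss / .derived_files_regenerated
    // initializers above; the real declaration lives in document-store.h.
    struct InitializeResult {
      DataLoss data_loss;
      bool derived_files_regenerated = false;
    };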
@@ -1177,6 +1167,25 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
"Failed to find DocumentId by key: ", name_space, ", ", uri));
}
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
+ const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier)
+ const {
+ if (!namespace_id_fingerprint_) {
+ return absl_ports::FailedPreconditionError(
+ "Cannot lookup document id by namespace id + fingerprint without "
+ "enabling it on uri_mapper");
+ }
+
+ auto document_id_or = document_key_mapper_->Get(
+ namespace_fingerprint_identifier.EncodeToCString());
+ if (document_id_or.ok()) {
+ return document_id_or.ValueOrDie();
+ }
+ return absl_ports::Annotate(
+ std::move(document_id_or).status(),
+ "Failed to find DocumentId by namespace id + fingerprint");
+}
+
std::vector<std::string> DocumentStore::GetAllNamespaces() const {
std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
GetNamespaceIdsToNamespaces(namespace_mapper_.get());
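A hedged usage sketch of the new GetDocumentId overload (the overload and its FailedPrecondition behaviour when namespace_id_fingerprint_ is disabled come from this diff; the concrete namespace id and uri values are illustrative):

    // Sketch: resolve a DocumentId directly from (namespace id, uri) without
    // concatenating namespace and uri strings as the older overload does.
    NamespaceFingerprintIdentifier identifier(/*namespace_id=*/0, "uri1");
    libtextclassifier3::StatusOr<DocumentId> document_id_or =
        document_store->GetDocumentId(identifier);
    if (!document_id_or.ok()) {
      // NotFound if the document is absent; FailedPrecondition if the store
      // was created without namespace_id_fingerprint enabled.
      return document_id_or.status();
    }
    DocumentId document_id = document_id_or.ValueOrDie();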
@@ -1829,10 +1838,10 @@ libtextclassifier3::Status DocumentStore::Optimize() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<std::vector<DocumentId>>
+libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
DocumentStore::OptimizeInto(const std::string& new_directory,
const LanguageSegmenter* lang_segmenter,
- OptimizeStatsProto* stats) {
+ OptimizeStatsProto* stats) const {
// Validates directory
if (new_directory == base_dir_) {
return absl_ports::InvalidArgumentError(
@@ -1850,20 +1859,22 @@ DocumentStore::OptimizeInto(const std::string& new_directory,
std::move(doc_store_create_result.document_store);
// Writes all valid docs into new document store (new directory)
- int size = document_id_mapper_->num_elements();
- int num_deleted = 0;
- int num_expired = 0;
+ int document_cnt = document_id_mapper_->num_elements();
+ int num_deleted_documents = 0;
+ int num_expired_documents = 0;
UsageStore::UsageScores default_usage;
- std::vector<DocumentId> document_id_old_to_new(size, kInvalidDocumentId);
+
+ OptimizeResult result;
+ result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId);
int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
- for (DocumentId document_id = 0; document_id < size; document_id++) {
+ for (DocumentId document_id = 0; document_id < document_cnt; document_id++) {
auto document_or = Get(document_id, /*clear_internal_fields=*/false);
if (absl_ports::IsNotFound(document_or.status())) {
if (IsDeleted(document_id)) {
- ++num_deleted;
+ ++num_deleted_documents;
} else if (!GetNonExpiredDocumentFilterData(document_id,
current_time_ms)) {
- ++num_expired;
+ ++num_expired_documents;
}
continue;
} else if (!document_or.ok()) {
@@ -1903,7 +1914,8 @@ DocumentStore::OptimizeInto(const std::string& new_directory,
return new_document_id_or.status();
}
- document_id_old_to_new[document_id] = new_document_id_or.ValueOrDie();
+ result.document_id_old_to_new[document_id] =
+ new_document_id_or.ValueOrDie();
// Copy over usage scores.
ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
@@ -1917,13 +1929,61 @@ DocumentStore::OptimizeInto(const std::string& new_directory,
new_doc_store->SetUsageScores(new_document_id, usage_scores));
}
}
+
+ // Construct namespace_id_old_to_new
+ int namespace_cnt = namespace_mapper_->num_keys();
+ std::unordered_map<NamespaceId, std::string> old_namespaces =
+ GetNamespaceIdsToNamespaces(namespace_mapper_.get());
+ if (namespace_cnt != old_namespaces.size()) {
+ // This really shouldn't happen. If it does:
+ // - It won't block DocumentStore optimization, so don't return an error here.
+ // - Instead, log a warning and hint that the caller should rebuild the index.
+ ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt
+ << " vs " << old_namespaces.size();
+ result.should_rebuild_index = true;
+ } else {
+ result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId);
+ for (const auto& [old_namespace_id, ns] : old_namespaces) {
+ if (old_namespace_id >= result.namespace_id_old_to_new.size()) {
+ // This really shouldn't happen. If it does:
+ // - It won't block DocumentStore optimization, so don't return an error
+ // here.
+ // - Instead, log a warning and hint that the caller should rebuild the
+ // index.
+ ICING_LOG(WARNING) << "Found unexpected namespace id "
+ << old_namespace_id << ". Should be in range 0 to "
+ << result.namespace_id_old_to_new.size()
+ << " (exclusive).";
+ result.namespace_id_old_to_new.clear();
+ result.should_rebuild_index = true;
+ break;
+ }
+
+ auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns);
+ if (!new_namespace_id_or.ok()) {
+ if (absl_ports::IsNotFound(new_namespace_id_or.status())) {
+ continue;
+ }
+ // Real error, return it.
+ return std::move(new_namespace_id_or).status();
+ }
+
+ NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie();
+ // Safe to assign via operator[] since we've checked the range above.
+ result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id;
+ }
+ }
+
if (stats != nullptr) {
- stats->set_num_original_documents(size);
- stats->set_num_deleted_documents(num_deleted);
- stats->set_num_expired_documents(num_expired);
+ stats->set_num_original_documents(document_cnt);
+ stats->set_num_deleted_documents(num_deleted_documents);
+ stats->set_num_expired_documents(num_expired_documents);
+ stats->set_num_original_namespaces(namespace_cnt);
+ stats->set_num_deleted_namespaces(
+ namespace_cnt - new_doc_store->namespace_mapper_->num_keys());
}
ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
- return document_id_old_to_new;
+ return result;
}
libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>