about | summary | refs | log | tree | commit | diff
path: root/icing/transform/icu/icu-normalizer_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/transform/icu/icu-normalizer_test.cc')
-rw-r--r-- icing/transform/icu/icu-normalizer_test.cc 79
1 files changed, 69 insertions, 10 deletions
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index 719f7be..0df23fc 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -83,14 +83,12 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff"));
EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"),
- Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
Eq("iiiiiiiiiiiiiiiii"));
EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj"));
EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"),
- Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll"));
EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
Eq("nnnnnnnnnnnnnnnn"));
@@ -109,19 +107,38 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"),
- Eq("zzzzzzzzzzzz"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz"));
EXPECT_THAT(normalizer_->NormalizeTerm("Barış"), Eq("baris"));
}
+TEST_F(IcuNormalizerTest, GreekLetterRemoveAccent) {  // Expects NormalizeTerm() to map accented Greek letters (upper- and lowercase) to the plain lowercase base letter.
+ EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημερα"));  // NOTE(review): the leading 'k' in both literals is Latin, not Greek kappa - confirm this is intended.
+ EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφη"));
+ EXPECT_THAT(normalizer_->NormalizeTerm(  // Alpha variants -> α
+ "ἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ"),
+ Eq("αααααααααααααααααααααααααααααααααααααααααααααα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ἘἙἚἛἜἝῈΈἐἑἒἓἔἕὲέ"),  // Epsilon variants -> ε
+ Eq("εεεεεεεεεεεεεεεε"));
+ EXPECT_THAT(  // Eta variants -> η
+ normalizer_->NormalizeTerm("ἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ"),
+ Eq("ηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηη"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ἸἹἺἻἼἽἾἿῘῙῚΊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ"),  // Iota variants -> ι
+ Eq("ιιιιιιιιιιιιιιιιιιιιιιιιιιιι"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ὈὉὊὋὌὍῸΌὀὁὂὃὄὅὸό"),  // Omicron variants -> ο
+ Eq("οοοοοοοοοοοοοοοο"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ὙὛὝὟῨῩῪΎὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ"),  // Upsilon variants -> υ
+ Eq("υυυυυυυυυυυυυυυυυυυυυυυυ"));
+ EXPECT_THAT(  // Omega variants -> ω
+ normalizer_->NormalizeTerm("ὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"),
+ Eq("ωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωω"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ῥῤῥ"), Eq("ρρρ"));  // Rho variants -> ρ
+}
+
// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
-// Japanese and Greek
+// Japanese. Greek is the exception: see GreekLetterRemoveAccent.
TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) {
// Katakana
EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
- // Greek
- EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
- EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
// Our current ICU rules can't handle Hebrew properly, e.g. the accents in
// "אָלֶף־בֵּית עִבְרִי"
@@ -287,6 +304,27 @@ TEST_F(IcuNormalizerTest, PrefixMatchLength) {
term = "ÀĄḁáIcing";
match_end = normalizer->FindNormalizedMatchEndPosition(term, "aaaa");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ÀĄḁá"));
+
+ // Greek accents
+ term = "άνθρωπος";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ"));
+
+ term = "καλημέρα";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημε");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ"));
+
+ term = "όχι";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ"));
+
+ term = "πότε";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ"));
+
+ term = "ἈἉἊἋIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "αααα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ"));
}
TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
@@ -340,6 +378,27 @@ TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
term = "BarışIcing";
match_end = normalizer->FindNormalizedMatchEndPosition(term, "barismdi");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış"));
+
+ // Greek accents
+ term = "άνθρωπος";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθν");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ"));
+
+ term = "καλημέρα";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημεος");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ"));
+
+ term = "όχι";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχκα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ"));
+
+ term = "πότε";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτρα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ"));
+
+ term = "ἈἉἊἋIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ααααmdi");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ"));
}
} // namespace