summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Victor Chang <vichang@google.com> 2024-04-25 10:36:20 +0100
committer Victor Chang <vichang@google.com> 2024-04-25 10:43:23 +0100
commit 2d40414736f295d14f159284388b8c73f6c6ffaa (patch)
tree 8ae187c87ea4410b9f831c6091f1e45553b21e74
parent 690a1de1dc8cee61c822c44c88f88fd6f66c4668 (diff)
download icu-2d40414736f295d14f159284388b8c73f6c6ffaa.tar.gz
Cherry-pick: ICU-22742 Fix handling of XA,XB,XC for addLikelySubtags
Upstream commit: https://github.com/unicode-org/icu/pull/2977/commits/78502b61366731c5483e48d2e746742e5962632e
Upstream bug: https://unicode-org.atlassian.net/browse/ICU-22742
Add more tests.
ICU-22742 Add PS... variants
ICU-22742 Add java tests
ICU-22742 extend tests
ICU-22742 wrap java
Bug: 331740612
Test: atest CtsIcuTestCases CtsIcu4cTestCases
Change-Id: I91546706f5918ac353c0bf779ba6f16c0a0a1c5d
-rw-r--r-- android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java 73
-rw-r--r-- android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java 132
-rw-r--r-- icu4c/source/common/loclikelysubtags.cpp 67
-rw-r--r-- icu4c/source/test/cintltst/cloctst.c 16
-rw-r--r-- icu4c/source/test/intltest/loctest.cpp 159
-rw-r--r-- icu4c/source/test/intltest/loctest.h 1
-rw-r--r-- icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java 132
-rw-r--r-- icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java 73
8 files changed, 502 insertions, 151 deletions
diff --git a/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java b/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java
index ffc3a7413..6c4b699c9 100644
--- a/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java
+++ b/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java
@@ -220,49 +220,42 @@ public final class LikelySubtags {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
- if (region.length() == 2 && region.charAt(0) == 'X') {
- switch (region.charAt(1)) {
- case 'A':
- if (returnInputIfUnmatch) {
- return new LSR(language, script, region, LSR.EXPLICIT_LSR);
- }
- return new LSR(PSEUDO_ACCENTS_PREFIX + language,
- PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
- case 'B':
- if (returnInputIfUnmatch) {
- return new LSR(language, script, region, LSR.EXPLICIT_LSR);
- }
- return new LSR(PSEUDO_BIDI_PREFIX + language,
- PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
- case 'C':
- if (returnInputIfUnmatch) {
- return new LSR(language, script, region, LSR.EXPLICIT_LSR);
+ if (!returnInputIfUnmatch) {
+ if (region.length() == 2 && region.charAt(0) == 'X') {
+ switch (region.charAt(1)) {
+ case 'A':
+ return new LSR(PSEUDO_ACCENTS_PREFIX + language,
+ PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ case 'B':
+ return new LSR(PSEUDO_BIDI_PREFIX + language,
+ PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ case 'C':
+ return new LSR(PSEUDO_CRACKED_PREFIX + language,
+ PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ default: // normal locale
+ break;
}
- return new LSR(PSEUDO_CRACKED_PREFIX + language,
- PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
- default: // normal locale
- break;
}
- }
- if (variant.startsWith("PS")) {
- int lsrFlags = region.isEmpty() ?
- LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
- switch (variant) {
- case "PSACCENT":
- return new LSR(PSEUDO_ACCENTS_PREFIX + language,
- PSEUDO_ACCENTS_PREFIX + script,
- region.isEmpty() ? "XA" : region, lsrFlags);
- case "PSBIDI":
- return new LSR(PSEUDO_BIDI_PREFIX + language,
- PSEUDO_BIDI_PREFIX + script,
- region.isEmpty() ? "XB" : region, lsrFlags);
- case "PSCRACK":
- return new LSR(PSEUDO_CRACKED_PREFIX + language,
- PSEUDO_CRACKED_PREFIX + script,
- region.isEmpty() ? "XC" : region, lsrFlags);
- default: // normal locale
- break;
+ if (variant.startsWith("PS")) {
+ int lsrFlags = region.isEmpty() ?
+ LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
+ switch (variant) {
+ case "PSACCENT":
+ return new LSR(PSEUDO_ACCENTS_PREFIX + language,
+ PSEUDO_ACCENTS_PREFIX + script,
+ region.isEmpty() ? "XA" : region, lsrFlags);
+ case "PSBIDI":
+ return new LSR(PSEUDO_BIDI_PREFIX + language,
+ PSEUDO_BIDI_PREFIX + script,
+ region.isEmpty() ? "XB" : region, lsrFlags);
+ case "PSCRACK":
+ return new LSR(PSEUDO_CRACKED_PREFIX + language,
+ PSEUDO_CRACKED_PREFIX + script,
+ region.isEmpty() ? "XC" : region, lsrFlags);
+ default: // normal locale
+ break;
+ }
}
}
diff --git a/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java b/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java
index a8fb04a62..be223f7c8 100644
--- a/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java
+++ b/android_icu4j/src/main/tests/android/icu/dev/test/util/ULocaleTest.java
@@ -1975,17 +1975,36 @@ public class ULocaleTest extends CoreTestFmwk {
"zh_Hani",
"zh_Hani_CN" // If change, please also update ULocale.java
}, {
- // ICU-22545
- "en_XA",
+ // ICU-22545 & ICU-22742
"en_XA",
+ "en_Latn_XA",
+ }, {
+ // ICU-22545 & ICU-22742
+ "ar_XB",
+ "ar_Arab_XB",
+ }, {
+ // ICU-22545 & ICU-22742
+ "ru_XC",
+ "ru_Cyrl_XC",
+ }, {
+ // ICU-22742
+ "en_PSACCENT",
+ "en_Latn_US_PSACCENT",
+ }, {
+ "ar_PSBIDI",
+ "ar_Arab_EG_PSBIDI",
+ }, {
+ "ru_PSCRACK",
+ "ru_Cyrl_RU_PSCRACK",
}, {
- // ICU-22545
- "en_XB",
- "en_XB",
+ "ar_PSACCENT",
+ "ar_Arab_EG_PSACCENT",
}, {
- // ICU-22545
- "en_XC",
- "en_XC",
+ "ru_PSBIDI",
+ "ru_Cyrl_RU_PSBIDI",
+ }, {
+ "en_PSCRACK",
+ "en_Latn_US_PSCRACK",
}
};
@@ -5595,6 +5614,103 @@ public class ULocaleTest extends CoreTestFmwk {
return tests;
}
+ // ICU-22742, test addLikelySubtags with pseudo-locales
+ @Test
+ public void TestPseudoLocales() {
+ // input locale tag, expected locale tag
+ String[][] testCases = new String[][] {
+ // language + region, en
+ { "en-XA", "en-Latn-XA" },
+ { "en-XB", "en-Latn-XB" },
+ { "en-XC", "en-Latn-XC" },
+
+ // language + region, ar
+ { "ar-XA", "ar-Arab-XA" },
+ { "ar-XB", "ar-Arab-XB" },
+ { "ar-XC", "ar-Arab-XC" },
+
+ // language + region, something other than en, ar
+ { "ru-XA", "ru-Cyrl-XA" },
+ { "el-XB", "el-Grek-XB" },
+
+ // undefined language - region
+ { "und-XA", "en-Latn-XA" },
+ { "und-XB", "en-Latn-XB" },
+ { "und-XC", "en-Latn-XC" },
+
+ // language + script + region
+ { "und-Latn-XA", "en-Latn-XA" },
+ { "und-Latn-XB", "en-Latn-XB" },
+ { "und-Latn-XC", "en-Latn-XC" },
+ { "und-Arab-XA", "ar-Arab-XA" },
+ { "und-Arab-XB", "ar-Arab-XB" },
+ { "und-Arab-XC", "ar-Arab-XC" },
+ { "und-Cyrl-XA", "ru-Cyrl-XA" },
+ { "und-Grek-XB", "el-Grek-XB" },
+
+ // Make sure the script is not damaged, when correct
+ { "ru-Cyrl-XA", "ru-Cyrl-XA" },
+ { "el-Grek-XB", "el-Grek-XB" },
+
+ // Make sure the script is not damaged, even if it is wrong
+ { "ru-Grek-XA", "ru-Grek-XA" },
+ { "el-Cyrl-XB", "el-Cyrl-XB" },
+
+ // PS Variants
+ { "en-XA-PSACCENT", "en-Latn-XA-psaccent" },
+ { "en-XA-PSBIDI", "en-Latn-XA-psbidi" },
+ { "en-XA-PSCRACK", "en-Latn-XA-pscrack" },
+ { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" },
+ { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" },
+ { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" },
+ { "en-XC-PSACCENT", "en-Latn-XC-psaccent" },
+ { "en-XC-PSBIDI", "en-Latn-XC-psbidi" },
+ { "en-XC-PSCRACK", "en-Latn-XC-pscrack" },
+
+ { "en-US-PSACCENT", "en-Latn-US-psaccent" },
+ { "en-US-PSBIDI", "en-Latn-US-psbidi" },
+ { "en-US-PSCRACK", "en-Latn-US-pscrack" },
+ { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "en-PSACCENT", "en-Latn-US-psaccent" },
+ { "en-PSBIDI", "en-Latn-US-psbidi" },
+ { "en-PSCRACK", "en-Latn-US-pscrack" },
+ { "ar-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "ar-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "ar-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "und-US-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-US-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-US-PSCRACK", "en-Latn-US-pscrack" },
+ { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "und-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-PSCRACK", "en-Latn-US-pscrack" },
+ { "und-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-PSCRACK", "en-Latn-US-pscrack" },
+ };
+ String extensions = "-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-something-more";
+
+ for (String[] testCase : testCases) {
+ String inputTag = testCase[0];
+ String expectedTag = testCase[1];
+ ULocale result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag));
+ ULocale expected = ULocale.forLanguageTag(expectedTag);
+ assertEquals("pseudo-locales(" + inputTag + ")", expected, result);
+
+ // Make sure this also works with extensions. Kind of hacky...
+ result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag + extensions));
+ expected = ULocale.forLanguageTag(expectedTag + extensions);
+ assertEquals("pseudo-locales(" + inputTag + ")", expected, result);
+ }
+ }
+
@Test
@Parameters(method = "readLikelySubtagsTestCases")
public void likelySubtagsDataDriven(TestCase test) {
diff --git a/icu4c/source/common/loclikelysubtags.cpp b/icu4c/source/common/loclikelysubtags.cpp
index c18219105..a750bb7b1 100644
--- a/icu4c/source/common/loclikelysubtags.cpp
+++ b/icu4c/source/common/loclikelysubtags.cpp
@@ -564,47 +564,40 @@ LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, co
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
- char c1;
- if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
- switch (c1) {
- case 'A':
- if (returnInputIfUnmatch) {
- return LSR(language, script, region, LSR::EXPLICIT_LSR);
- }
- return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
- LSR::EXPLICIT_LSR, errorCode);
- case 'B':
- if (returnInputIfUnmatch) {
- return LSR(language, script, region, LSR::EXPLICIT_LSR);
- }
- return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
- LSR::EXPLICIT_LSR, errorCode);
- case 'C':
- if (returnInputIfUnmatch) {
- return LSR(language, script, region, LSR::EXPLICIT_LSR);
+ if (!returnInputIfUnmatch) {
+ char c1;
+ if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
+ switch (c1) {
+ case 'A':
+ return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
+ LSR::EXPLICIT_LSR, errorCode);
+ case 'B':
+ return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
+ LSR::EXPLICIT_LSR, errorCode);
+ case 'C':
+ return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
+ LSR::EXPLICIT_LSR, errorCode);
+ default: // normal locale
+ break;
}
- return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
- LSR::EXPLICIT_LSR, errorCode);
- default: // normal locale
- break;
}
- }
- if (variant[0] == 'P' && variant[1] == 'S') {
- int32_t lsrFlags = *region == 0 ?
- LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
- if (uprv_strcmp(variant, "PSACCENT") == 0) {
- return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
- *region == 0 ? "XA" : region, lsrFlags, errorCode);
- } else if (uprv_strcmp(variant, "PSBIDI") == 0) {
- return LSR(PSEUDO_BIDI_PREFIX, language, script,
- *region == 0 ? "XB" : region, lsrFlags, errorCode);
- } else if (uprv_strcmp(variant, "PSCRACK") == 0) {
- return LSR(PSEUDO_CRACKED_PREFIX, language, script,
- *region == 0 ? "XC" : region, lsrFlags, errorCode);
+ if (variant[0] == 'P' && variant[1] == 'S') {
+ int32_t lsrFlags = *region == 0 ?
+ LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
+ if (uprv_strcmp(variant, "PSACCENT") == 0) {
+ return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
+ *region == 0 ? "XA" : region, lsrFlags, errorCode);
+ } else if (uprv_strcmp(variant, "PSBIDI") == 0) {
+ return LSR(PSEUDO_BIDI_PREFIX, language, script,
+ *region == 0 ? "XB" : region, lsrFlags, errorCode);
+ } else if (uprv_strcmp(variant, "PSCRACK") == 0) {
+ return LSR(PSEUDO_CRACKED_PREFIX, language, script,
+ *region == 0 ? "XC" : region, lsrFlags, errorCode);
+ }
+ // else normal locale
}
- // else normal locale
- }
+ } // end of if (!returnInputIfUnmatch)
language = getCanonical(languageAliases, language);
// (We have no script mappings.)
diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c
index 04efd5f37..5815b14f2 100644
--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@@ -3920,17 +3920,17 @@ const char* const basic_maximize_data[][2] = {
"zh_Hani",
"zh_Hani_CN" // If change, please also update common/unicode/uloc.h
}, {
- // ICU-22545
+ // ICU-22545 & ICU-22742
"en_XA",
- "en_XA"
+ "en_Latn_XA"
}, {
- // ICU-22545
- "en_XB",
- "en_XB"
+ // ICU-22545 & ICU-22742
+ "ar_XB",
+ "ar_Arab_XB"
}, {
- // ICU-22545
- "en_XC",
- "en_XC"
+ // ICU-22545 & ICU-22742
+ "ru_XC",
+ "ru_Cyrl_XC"
}
};
diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp
index f39fb1ad5..dc0947bfd 100644
--- a/icu4c/source/test/intltest/loctest.cpp
+++ b/icu4c/source/test/intltest/loctest.cpp
@@ -233,6 +233,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
#endif
TESTCASE_AUTO(TestSetIsBogus);
TESTCASE_AUTO(TestParallelAPIValues);
+ TESTCASE_AUTO(TestPseudoLocales);
TESTCASE_AUTO(TestAddLikelySubtags);
TESTCASE_AUTO(TestMinimizeSubtags);
TESTCASE_AUTO(TestAddLikelyAndMinimizeSubtags);
@@ -1741,6 +1742,119 @@ LocaleTest::TestSetIsBogus() {
}
+void LocaleTest::TestPseudoLocales() {
+ // input locale tag, expected locale tag
+ static const struct {
+ const char* const input;
+ const char* const expected;
+ } test_cases[] = {
+ // language + region, en
+ { "en-XA", "en-Latn-XA" },
+ { "en-XB", "en-Latn-XB" },
+ { "en-XC", "en-Latn-XC" },
+
+ // language + region, ar
+ { "ar-XA", "ar-Arab-XA" },
+ { "ar-XB", "ar-Arab-XB" },
+ { "ar-XC", "ar-Arab-XC" },
+
+ // language + region, something other than en, ar
+ { "ru-XA", "ru-Cyrl-XA" },
+ { "el-XB", "el-Grek-XB" },
+
+ // undefined language - region
+ { "und-XA", "en-Latn-XA" },
+ { "und-XB", "en-Latn-XB" },
+ { "und-XC", "en-Latn-XC" },
+
+ // language + script + region
+ { "und-Latn-XA", "en-Latn-XA" },
+ { "und-Latn-XB", "en-Latn-XB" },
+ { "und-Latn-XC", "en-Latn-XC" },
+ { "und-Arab-XA", "ar-Arab-XA" },
+ { "und-Arab-XB", "ar-Arab-XB" },
+ { "und-Arab-XC", "ar-Arab-XC" },
+ { "und-Cyrl-XA", "ru-Cyrl-XA" },
+ { "und-Grek-XB", "el-Grek-XB" },
+
+ // Make sure the script is not damaged, when correct
+ { "ru-Cyrl-XA", "ru-Cyrl-XA" },
+ { "el-Grek-XB", "el-Grek-XB" },
+
+ // Make sure the script is not damaged, even if it is wrong
+ { "ru-Grek-XA", "ru-Grek-XA" },
+ { "el-Cyrl-XB", "el-Cyrl-XB" },
+
+ // PS Variants
+ { "en-XA-PSACCENT", "en-Latn-XA-psaccent" },
+ { "en-XA-PSBIDI", "en-Latn-XA-psbidi" },
+ { "en-XA-PSCRACK", "en-Latn-XA-pscrack" },
+ { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" },
+ { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" },
+ { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" },
+ { "en-XC-PSACCENT", "en-Latn-XC-psaccent" },
+ { "en-XC-PSBIDI", "en-Latn-XC-psbidi" },
+ { "en-XC-PSCRACK", "en-Latn-XC-pscrack" },
+
+ { "en-US-PSACCENT", "en-Latn-US-psaccent" },
+ { "en-US-PSBIDI", "en-Latn-US-psbidi" },
+ { "en-US-PSCRACK", "en-Latn-US-pscrack" },
+ { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "en-PSACCENT", "en-Latn-US-psaccent" },
+ { "en-PSBIDI", "en-Latn-US-psbidi" },
+ { "en-PSCRACK", "en-Latn-US-pscrack" },
+ { "ar-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "ar-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "ar-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "und-US-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-US-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-US-PSCRACK", "en-Latn-US-pscrack" },
+ { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "und-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-PSCRACK", "en-Latn-US-pscrack" },
+ { "und-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-PSCRACK", "en-Latn-US-pscrack" },
+ };
+
+ std::string extensions("-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-somethin-more");
+
+ IcuTestErrorCode status(*this, "TestPseudoLocales()");
+ for (const auto& item : test_cases) {
+ const char* const inputTag = item.input;
+ const char* const expectedTag = item.expected;
+ Locale result = Locale::forLanguageTag(inputTag, status);
+ result.addLikelySubtags(status);
+ status.errIfFailureAndReset("\"%s\"", inputTag);
+ Locale expected = Locale::forLanguageTag(expectedTag, status);
+ status.errIfFailureAndReset("\"%s\"", expectedTag);
+ assertEquals(inputTag, expected.getName(), result.getName());
+
+ // Test extension
+ std::string extendedTag(inputTag);
+ extendedTag.append(extensions);
+
+ result = Locale::forLanguageTag(extendedTag, status);
+ result.addLikelySubtags(status);
+ status.errIfFailureAndReset(extendedTag.c_str());
+
+ std::string expectedExtendedTag(expectedTag);
+ expectedExtendedTag.append(extensions);
+
+ expected = Locale::forLanguageTag(expectedExtendedTag, status);
+ status.errIfFailureAndReset(expectedExtendedTag.c_str());
+ assertEquals(extendedTag.c_str(), expected.getName(), result.getName());
+ }
+}
+
void
LocaleTest::TestAddLikelySubtags() {
IcuTestErrorCode status(*this, "TestAddLikelySubtags()");
@@ -3974,20 +4088,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
"zh_Hani_CN", // If change, please also update common/unicode/locid.h
"zh_Hani"
}, {
- // ICU-22545
- "en_XA",
+ // ICU-22545 & ICU-22742
"en_XA",
+ "en_Latn_XA",
"en_XA",
}, {
- // ICU-22545
- "en_XB",
- "en_XB",
- "en_XB",
+ // ICU-22545 & ICU-22742
+ "ar_XB",
+ "ar_Arab_XB",
+ "ar_XB",
+ }, {
+ // ICU-22545 & ICU-22742
+ "ru_XC",
+ "ru_Cyrl_XC",
+ "ru_XC",
+ }, {
+ // ICU-22742
+ "en_PSACCENT",
+ "en_Latn_US_PSACCENT",
+ "en__PSACCENT"
+ }, {
+ "ar_PSBIDI",
+ "ar_Arab_EG_PSBIDI",
+ "ar__PSBIDI"
+ }, {
+ "ru_PSCRACK",
+ "ru_Cyrl_RU_PSCRACK",
+ "ru__PSCRACK"
+ }, {
+ "ar_PSACCENT",
+ "ar_Arab_EG_PSACCENT",
+ "ar__PSACCENT"
+ }, {
+ "ru_PSBIDI",
+ "ru_Cyrl_RU_PSBIDI",
+ "ru__PSBIDI"
}, {
- // ICU-22545
- "en_XC",
- "en_XC",
- "en_XC",
+ "en_PSCRACK",
+ "en_Latn_US_PSCRACK",
+ "en__PSCRACK"
}
};
diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h
index cb79b456d..b3410242e 100644
--- a/icu4c/source/test/intltest/loctest.h
+++ b/icu4c/source/test/intltest/loctest.h
@@ -129,6 +129,7 @@ public:
void TestKnownCanonicalizedListCorrect();
void TestConstructorAcceptsBCP47();
+ void TestPseudoLocales();
void TestAddLikelySubtags();
void TestMinimizeSubtags();
void TestAddLikelyAndMinimizeSubtags();
diff --git a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java
index 39b981370..3361782ec 100644
--- a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java
+++ b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java
@@ -1972,17 +1972,36 @@ public class ULocaleTest extends CoreTestFmwk {
"zh_Hani",
"zh_Hani_CN" // If change, please also update ULocale.java
}, {
- // ICU-22545
- "en_XA",
+ // ICU-22545 & ICU-22742
"en_XA",
+ "en_Latn_XA",
+ }, {
+ // ICU-22545 & ICU-22742
+ "ar_XB",
+ "ar_Arab_XB",
+ }, {
+ // ICU-22545 & ICU-22742
+ "ru_XC",
+ "ru_Cyrl_XC",
+ }, {
+ // ICU-22742
+ "en_PSACCENT",
+ "en_Latn_US_PSACCENT",
+ }, {
+ "ar_PSBIDI",
+ "ar_Arab_EG_PSBIDI",
+ }, {
+ "ru_PSCRACK",
+ "ru_Cyrl_RU_PSCRACK",
}, {
- // ICU-22545
- "en_XB",
- "en_XB",
+ "ar_PSACCENT",
+ "ar_Arab_EG_PSACCENT",
}, {
- // ICU-22545
- "en_XC",
- "en_XC",
+ "ru_PSBIDI",
+ "ru_Cyrl_RU_PSBIDI",
+ }, {
+ "en_PSCRACK",
+ "en_Latn_US_PSCRACK",
}
};
@@ -5592,6 +5611,103 @@ public class ULocaleTest extends CoreTestFmwk {
return tests;
}
+ // ICU-22742, test addLikelySubtags with pseudo-locales
+ @Test
+ public void TestPseudoLocales() {
+ // input locale tag, expected locale tag
+ String[][] testCases = new String[][] {
+ // language + region, en
+ { "en-XA", "en-Latn-XA" },
+ { "en-XB", "en-Latn-XB" },
+ { "en-XC", "en-Latn-XC" },
+
+ // language + region, ar
+ { "ar-XA", "ar-Arab-XA" },
+ { "ar-XB", "ar-Arab-XB" },
+ { "ar-XC", "ar-Arab-XC" },
+
+ // language + region, something other than en, ar
+ { "ru-XA", "ru-Cyrl-XA" },
+ { "el-XB", "el-Grek-XB" },
+
+ // undefined language - region
+ { "und-XA", "en-Latn-XA" },
+ { "und-XB", "en-Latn-XB" },
+ { "und-XC", "en-Latn-XC" },
+
+ // language + script + region
+ { "und-Latn-XA", "en-Latn-XA" },
+ { "und-Latn-XB", "en-Latn-XB" },
+ { "und-Latn-XC", "en-Latn-XC" },
+ { "und-Arab-XA", "ar-Arab-XA" },
+ { "und-Arab-XB", "ar-Arab-XB" },
+ { "und-Arab-XC", "ar-Arab-XC" },
+ { "und-Cyrl-XA", "ru-Cyrl-XA" },
+ { "und-Grek-XB", "el-Grek-XB" },
+
+ // Make sure the script is not damaged, when correct
+ { "ru-Cyrl-XA", "ru-Cyrl-XA" },
+ { "el-Grek-XB", "el-Grek-XB" },
+
+ // Make sure the script is not damaged, even if it is wrong
+ { "ru-Grek-XA", "ru-Grek-XA" },
+ { "el-Cyrl-XB", "el-Cyrl-XB" },
+
+ // PS Variants
+ { "en-XA-PSACCENT", "en-Latn-XA-psaccent" },
+ { "en-XA-PSBIDI", "en-Latn-XA-psbidi" },
+ { "en-XA-PSCRACK", "en-Latn-XA-pscrack" },
+ { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" },
+ { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" },
+ { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" },
+ { "en-XC-PSACCENT", "en-Latn-XC-psaccent" },
+ { "en-XC-PSBIDI", "en-Latn-XC-psbidi" },
+ { "en-XC-PSCRACK", "en-Latn-XC-pscrack" },
+
+ { "en-US-PSACCENT", "en-Latn-US-psaccent" },
+ { "en-US-PSBIDI", "en-Latn-US-psbidi" },
+ { "en-US-PSCRACK", "en-Latn-US-pscrack" },
+ { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "en-PSACCENT", "en-Latn-US-psaccent" },
+ { "en-PSBIDI", "en-Latn-US-psbidi" },
+ { "en-PSCRACK", "en-Latn-US-pscrack" },
+ { "ar-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "ar-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "ar-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "und-US-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-US-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-US-PSCRACK", "en-Latn-US-pscrack" },
+ { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" },
+ { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" },
+ { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" },
+
+ { "und-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-PSCRACK", "en-Latn-US-pscrack" },
+ { "und-PSACCENT", "en-Latn-US-psaccent" },
+ { "und-PSBIDI", "en-Latn-US-psbidi" },
+ { "und-PSCRACK", "en-Latn-US-pscrack" },
+ };
+ String extensions = "-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-something-more";
+
+ for (String[] testCase : testCases) {
+ String inputTag = testCase[0];
+ String expectedTag = testCase[1];
+ ULocale result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag));
+ ULocale expected = ULocale.forLanguageTag(expectedTag);
+ assertEquals("pseudo-locales(" + inputTag + ")", expected, result);
+
+ // Make sure this also works with extensions. Kind of hacky...
+ result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag + extensions));
+ expected = ULocale.forLanguageTag(expectedTag + extensions);
+ assertEquals("pseudo-locales(" + inputTag + ")", expected, result);
+ }
+ }
+
@Test
@Parameters(method = "readLikelySubtagsTestCases")
public void likelySubtagsDataDriven(TestCase test) {
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java b/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java
index 6d5a35379..2a42c60c4 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/impl/locale/LikelySubtags.java
@@ -213,49 +213,42 @@ public final class LikelySubtags {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
- if (region.length() == 2 && region.charAt(0) == 'X') {
- switch (region.charAt(1)) {
- case 'A':
- if (returnInputIfUnmatch) {
- return new LSR(language, script, region, LSR.EXPLICIT_LSR);
- }
- return new LSR(PSEUDO_ACCENTS_PREFIX + language,
- PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
- case 'B':
- if (returnInputIfUnmatch) {
- return new LSR(language, script, region, LSR.EXPLICIT_LSR);
- }
- return new LSR(PSEUDO_BIDI_PREFIX + language,
- PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
- case 'C':
- if (returnInputIfUnmatch) {
- return new LSR(language, script, region, LSR.EXPLICIT_LSR);
+ if (!returnInputIfUnmatch) {
+ if (region.length() == 2 && region.charAt(0) == 'X') {
+ switch (region.charAt(1)) {
+ case 'A':
+ return new LSR(PSEUDO_ACCENTS_PREFIX + language,
+ PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ case 'B':
+ return new LSR(PSEUDO_BIDI_PREFIX + language,
+ PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ case 'C':
+ return new LSR(PSEUDO_CRACKED_PREFIX + language,
+ PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ default: // normal locale
+ break;
}
- return new LSR(PSEUDO_CRACKED_PREFIX + language,
- PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
- default: // normal locale
- break;
}
- }
- if (variant.startsWith("PS")) {
- int lsrFlags = region.isEmpty() ?
- LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
- switch (variant) {
- case "PSACCENT":
- return new LSR(PSEUDO_ACCENTS_PREFIX + language,
- PSEUDO_ACCENTS_PREFIX + script,
- region.isEmpty() ? "XA" : region, lsrFlags);
- case "PSBIDI":
- return new LSR(PSEUDO_BIDI_PREFIX + language,
- PSEUDO_BIDI_PREFIX + script,
- region.isEmpty() ? "XB" : region, lsrFlags);
- case "PSCRACK":
- return new LSR(PSEUDO_CRACKED_PREFIX + language,
- PSEUDO_CRACKED_PREFIX + script,
- region.isEmpty() ? "XC" : region, lsrFlags);
- default: // normal locale
- break;
+ if (variant.startsWith("PS")) {
+ int lsrFlags = region.isEmpty() ?
+ LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
+ switch (variant) {
+ case "PSACCENT":
+ return new LSR(PSEUDO_ACCENTS_PREFIX + language,
+ PSEUDO_ACCENTS_PREFIX + script,
+ region.isEmpty() ? "XA" : region, lsrFlags);
+ case "PSBIDI":
+ return new LSR(PSEUDO_BIDI_PREFIX + language,
+ PSEUDO_BIDI_PREFIX + script,
+ region.isEmpty() ? "XB" : region, lsrFlags);
+ case "PSCRACK":
+ return new LSR(PSEUDO_CRACKED_PREFIX + language,
+ PSEUDO_CRACKED_PREFIX + script,
+ region.isEmpty() ? "XC" : region, lsrFlags);
+ default: // normal locale
+ break;
+ }
}
}