// absl/strings/internal/utf8_test.cc - external/github.com/abseil/abseil-cpp - Git at Google

 // Copyright 2017 The Abseil Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "absl/strings/internal/utf8.h"

 #include <cstddef>
 #include <cstdint>
 #include <cstring>
 #include <string>
 #include <type_traits>
 #include <utility>
 #include <vector>

 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "absl/base/port.h"
 #include "absl/strings/string_view.h"

 namespace {

 using ::absl::strings_internal::kMaxEncodedUTF8Size;
 using ::absl::strings_internal::ShiftState;
 using ::absl::strings_internal::WideToUtf8;
 using ::testing::StartsWith;
 using ::testing::TestParamInfo;
 using ::testing::TestWithParam;
 using ::testing::ValuesIn;

 #if !defined(__cpp_char8_t)
 #if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wc++2a-compat"
 #endif
 // Encodes a handful of code points and compares both the reported length and
 // the produced bytes against the equivalent `u8` string literal.
 TEST(EncodeUTF8Char, BasicFunction) {
   using ::absl::strings_internal::EncodeUTF8Char;

   // Code points paired with the UTF-8 bytes each one is expected to produce.
   const std::pair<char32_t, std::string> kExpectations[] = {
       {0x0030, u8"\u0030"},
       {0x00A3, u8"\u00A3"},
       {0x00010000, u8"\U00010000"},
       {0x0000FFFF, u8"\U0000FFFF"},
       {0x0010FFFD, u8"\U0010FFFD"},
   };
   for (const auto& expectation : kExpectations) {
     const char32_t code_point = expectation.first;
     const std::string& expected = expectation.second;

     // The same code point is encoded into two buffers that start out with
     // different fill bytes (0x00 and 0xFF).
     char zeros[7] = {};
     char ones[7];
     std::memset(ones, 0xFF, sizeof(ones));

     const char* const zeros_end = zeros + EncodeUTF8Char(zeros, code_point);
     const char* const ones_end = ones + EncodeUTF8Char(ones, code_point);

     // Walk back from the end while both buffers still hold their own fill
     // byte; what is left in front is the apparent length of the output.
     int apparent_length = 7;
     while (apparent_length > 0 && zeros[apparent_length - 1] == '\x00' &&
            ones[apparent_length - 1] == '\xFF') {
       --apparent_length;
     }

     EXPECT_EQ(apparent_length, zeros_end - zeros);
     EXPECT_EQ(apparent_length, ones_end - ones);
     EXPECT_EQ(apparent_length, expected.length());
     EXPECT_EQ(std::string(zeros, apparent_length), expected);
     EXPECT_EQ(std::string(ones, apparent_length), expected);
   }

   // For these inputs only the reported length is checked: it must not exceed
   // kMaxEncodedUTF8Size.
   char buf[32] = "Don't Tread On Me";
   EXPECT_LE(EncodeUTF8Char(buf, 0x00110000), kMaxEncodedUTF8Size);
   char buf2[32] = "Negative is invalid but sane";
   EXPECT_LE(EncodeUTF8Char(buf2, -1), kMaxEncodedUTF8Size);
 }
 #if defined(__clang__)
 #pragma clang diagnostic pop
 #endif
 #endif  // !defined(__cpp_char8_t)

 // One parameterized scenario for a single WideToUtf8() call.
 struct WideToUtf8TestCase {
   // Name returned by the name generator when the suite is instantiated.
   std::string description;
   // Wide character handed to WideToUtf8().
   wchar_t input;
   // Bytes the output buffer is expected to start with after the call.
   std::string expected_utf8_str;
   // Value WideToUtf8() is expected to return.
   size_t expected_bytes_written;
   // Shift state passed into the call.
   ShiftState initial_state{false, 0};
   // Shift state expected once the call has returned.
   ShiftState expected_state{false, 0};
 };

 std::vector<WideToUtf8TestCase> GetWideToUtf8TestCases() {
   constexpr size_t kError = static_cast<size_t>(-1);
   std::vector<WideToUtf8TestCase> cases = {
       {"ASCII_A", L'A', "A", 1},
       {"NullChar", L'\0', std::string("\0", 1), 1},
       {"ASCII_Max_7F", L'\x7F', "\x7F", 1},

       {"TwoByte_Min_80", L'\u0080', "\xC2\x80", 2},
       {"PoundSign_A3", L'\u00A3', "\xC2\xA3", 2},
       {"TwoByte_Max_7FF", L'\u07FF', "\xDF\xBF", 2},

       {"ThreeByte_Min_800", L'\u0800', "\xE0\xA0\x80", 3},
       {"EuroSign_20AC", L'\u20AC', "\xE2\x82\xAC", 3},
       {"BMP_MaxBeforeSurrogates_D7FF", L'\uD7FF', "\xED\x9F\xBF", 3},
       {"BMP_FFFF", L'\uFFFF', "\xEF\xBF\xBF", 3},

       {"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {}, {true, 0}},
       {"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {}, {true, 3}},

       {"HighSurr_D800_after_HighD800",
        L'\xD800',
        "\xF0\x90",
        2,
        {true, 0},
        {true, 0}},
       {"HighSurr_DBFF_after_HighDBFF",
        L'\xDBFF',
        "\xF4\x8F",
        2,
        {true, 3},
        {true, 3}},

       {"LowSurr_DC00_after_HighD800", L'\xDC00', "\x80\x80", 2, {true, 0}, {}},
       {"LowSurr_DFFD_after_HighDBFF", L'\xDFFD', "\xBF\xBD", 2, {true, 3}, {}},
       {"LowSurr_DC00_with_InitialState_saw_high_bits_1",
        L'\xDC00',
        "\x90\x80",
        2,
        {true, 1},
        {}},

       // Final state = initial on error.
       {"Error_IsolatedLowSurr_DC00_NoPriorHigh", L'\xDC00', "", kError, {}, {}},
       {"Error_IsolatedLowSurr_DFFF_NoPriorHigh", L'\xDFFF', "", kError, {}, {}},

 #if (defined(WCHAR_MAX) && WCHAR_MAX > 0xFFFF)
       {"DirectSupplementaryChars_U10000", static_cast<wchar_t>(0x10000),
        "\xF0\x90\x80\x80", 4},
       {"DirectSupplementaryChars_U10FFFD", static_cast<wchar_t>(0x10FFFD),
        "\xF4\x8F\xBF\xBD", 4},
 #endif
   };

   wchar_t minus_one = static_cast<wchar_t>(-1);
   if constexpr (sizeof(wchar_t) == 2) {
     cases.push_back({"WChar_MinusOne_as_FFFF", minus_one, "\xEF\xBF\xBF", 3});
   } else {
     cases.push_back(
         {"Error_WChar_MinusOne_as_FFFFFFFF", minus_one, "", kError, {}, {}});
   }

   if constexpr (sizeof(wchar_t) >= 4) {
 #ifdef WCHAR_MAX
     if (static_cast<uintmax_t>(WCHAR_MAX) >= 0x110000UL) {
       cases.push_back({"Error_OutOfRange_110000",
                        static_cast<wchar_t>(0x110000UL),
                        "",
                        kError,
                        {},
                        {}});
     }
 #else
     cases.push_back({"Error_OutOfRange_110000_fallback",
                      static_cast<wchar_t>(0x110000UL),
                      "",
                      kError,
                      {},
                      {}});
 #endif
   }
   return cases;
 }

 class WideToUtf8ParamTest : public TestWithParam<WideToUtf8TestCase> {};

 // Converts the parameter's single wide character and checks the return value,
 // the bytes written, and the resulting shift state.
 TEST_P(WideToUtf8ParamTest, SingleCharConversion) {
   const WideToUtf8TestCase& param = GetParam();
   const std::string& expected_prefix = param.expected_utf8_str;

   // The output starts out filled with a sentinel byte so that an untouched
   // position can be recognized afterwards.
   constexpr char kSentinel = '\xAB';
   std::string output(32, kSentinel);
   ShiftState shift_state = param.initial_state;

   const size_t result = WideToUtf8(param.input, output.data(), shift_state);
   EXPECT_EQ(result, param.expected_bytes_written);

   // The output has to begin with the expected bytes...
   EXPECT_THAT(output, StartsWith(expected_prefix));

   // ...and the byte directly after them must still be the sentinel.
   ASSERT_LT(expected_prefix.length(), output.size());
   EXPECT_EQ(output[expected_prefix.length()], kSentinel);

   // Both fields of the shift state must match the expectation.
   EXPECT_EQ(shift_state.saw_high_surrogate,
             param.expected_state.saw_high_surrogate);
   EXPECT_EQ(shift_state.bits, param.expected_state.bits);
 }

 // Instantiates the suite over every generated case; each instance is named
 // after the case's `description`.
 INSTANTIATE_TEST_SUITE_P(
     WideCharToUtf8Conversion, WideToUtf8ParamTest,
     ValuesIn(GetWideToUtf8TestCases()),
     [](const TestParamInfo<WideToUtf8TestCase>& param_info) {
       return param_info.param.description;
     });

 // Comprehensive test string for validating wchar_t to UTF-8 conversion.
 // This string is designed to cover a variety of Unicode character types and
 // sequences:
 // 1. Basic ASCII characters (within names, numbers, and spacing).
 // 2. Common 2-byte UTF-8 sequences:
 //    - Accented Latin characters (e.g., 'á' in "Holá").
 //    - Hebrew text with combining vowel points (e.g., "שָׁלוֹם").
 // 3. Common 3-byte UTF-8 sequences:
 //    - Currency symbols (e.g., '€').
 //    - CJK characters (e.g., "你好", "中").
 //    - Components of complex emojis like the Zero Width Joiner (ZWJ) and
 //      Heart symbol.
 // 4. Various 4-byte UTF-8 sequences (representing Supplementary Plane
 // characters):
 //    - An emoji with a skin tone modifier ("👍🏻").
 //    - A flag emoji composed of regional indicators ("🇺🇸").
 //    - A complex ZWJ emoji sequence ("👩‍❤️‍💋‍👨") combining
 //      SP characters (👩, 💋, 👨) with BMP characters (ZWJ and ❤️).
 //    - These are critical for testing the correct handling of surrogate pairs
 //      when wchar_t is 2 bytes (e.g., on Windows).
 // The goal is to ensure accurate conversion across a diverse set of
 // characters.
 //
 // clang-format off
 #define WIDE_STRING_LITERAL L"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩‍❤️‍💋‍👨 中"
 #define UTF8_STRING_LITERAL u8"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩‍❤️‍💋‍👨 中"
 // clang-format on

 // Returns UTF8_STRING_LITERAL as a view over plain `char` storage.
 absl::string_view GetUtf8TestString() {
   // The `u8` prefix forces UTF-8 encoding; without it MSVC will default to
   // e.g. CP1252 (and warn). The element type of a `u8` literal is `char`
   // before C++20 and `char8_t` from C++20 on, so the type is deduced from a
   // literal, an array of that type is initialized with the UTF-8 data, and the
   // bytes are then copied with `memcpy()` into a `char` array.
   static const absl::string_view kUtf8View = []() -> absl::string_view {
     using Utf8CodeUnit = std::remove_reference_t<decltype(*u8"a")>;
     constexpr Utf8CodeUnit kEncoded[] = UTF8_STRING_LITERAL;
     static char bytes[sizeof(kEncoded)];
     std::memcpy(bytes, kEncoded, sizeof(kEncoded));
     return absl::string_view(bytes);
   }();

   return kUtf8View;
 }

 // Converts WIDE_STRING_LITERAL one wchar_t at a time (terminator included) and
 // checks that the produced bytes start with the UTF-8 form of the same text.
 TEST(WideToUtf8, FullString) {
   // Room for kMaxEncodedUTF8Size bytes per wchar_t. `sizeof` counts the bytes
   // of the wide literal rather than its elements, so this over-allocates.
   std::string buffer(kMaxEncodedUTF8Size * sizeof(WIDE_STRING_LITERAL), '\0');
   char* buffer_ptr = buffer.data();

   // Same error value the single-character test cases expect from WideToUtf8().
   constexpr size_t kError = static_cast<size_t>(-1);

   ShiftState state;
   for (const wchar_t wc : WIDE_STRING_LITERAL) {
     const size_t bytes_written = WideToUtf8(wc, buffer_ptr, state);
     // Stop before the error value is added to the pointer: that addition would
     // move `buffer_ptr` to an invalid position instead of failing visibly.
     ASSERT_NE(bytes_written, kError)
         << "WideToUtf8 reported an error for wide code unit "
         << static_cast<unsigned long>(wc);
     buffer_ptr += bytes_written;
   }

   EXPECT_THAT(buffer, StartsWith(GetUtf8TestString()));
 }

 #undef WIDE_STRING_LITERAL
 #undef UTF8_STRING_LITERAL

 }  // namespace
	// Copyright 2017 The Abseil Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "absl/strings/internal/utf8.h"

	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <string>
	#include <type_traits>
	#include <utility>
	#include <vector>

	#include "gmock/gmock.h"
	#include "gtest/gtest.h"
	#include "absl/base/port.h"
	#include "absl/strings/string_view.h"

	namespace {

	using ::absl::strings_internal::kMaxEncodedUTF8Size;
	using ::absl::strings_internal::ShiftState;
	using ::absl::strings_internal::WideToUtf8;
	using ::testing::StartsWith;
	using ::testing::TestParamInfo;
	using ::testing::TestWithParam;
	using ::testing::ValuesIn;

	#if !defined(__cpp_char8_t)
	#if defined(__clang__)
	#pragma clang diagnostic push
	#pragma clang diagnostic ignored "-Wc++2a-compat"
	#endif
	// Encodes a handful of code points and compares both the reported length and
	// the produced bytes against the equivalent `u8` string literal.
	TEST(EncodeUTF8Char, BasicFunction) {
	  using ::absl::strings_internal::EncodeUTF8Char;

	  // Code points paired with the UTF-8 bytes each one is expected to produce.
	  const std::pair<char32_t, std::string> kExpectations[] = {
	      {0x0030, u8"\u0030"},
	      {0x00A3, u8"\u00A3"},
	      {0x00010000, u8"\U00010000"},
	      {0x0000FFFF, u8"\U0000FFFF"},
	      {0x0010FFFD, u8"\U0010FFFD"},
	  };
	  for (const auto& expectation : kExpectations) {
	    const char32_t code_point = expectation.first;
	    const std::string& expected = expectation.second;

	    // The same code point is encoded into two buffers that start out with
	    // different fill bytes (0x00 and 0xFF).
	    char zeros[7] = {};
	    char ones[7];
	    std::memset(ones, 0xFF, sizeof(ones));

	    const char* const zeros_end = zeros + EncodeUTF8Char(zeros, code_point);
	    const char* const ones_end = ones + EncodeUTF8Char(ones, code_point);

	    // Walk back from the end while both buffers still hold their own fill
	    // byte; what is left in front is the apparent length of the output.
	    int apparent_length = 7;
	    while (apparent_length > 0 && zeros[apparent_length - 1] == '\x00' &&
	           ones[apparent_length - 1] == '\xFF') {
	      --apparent_length;
	    }

	    EXPECT_EQ(apparent_length, zeros_end - zeros);
	    EXPECT_EQ(apparent_length, ones_end - ones);
	    EXPECT_EQ(apparent_length, expected.length());
	    EXPECT_EQ(std::string(zeros, apparent_length), expected);
	    EXPECT_EQ(std::string(ones, apparent_length), expected);
	  }

	  // For these inputs only the reported length is checked: it must not exceed
	  // kMaxEncodedUTF8Size.
	  char buf[32] = "Don't Tread On Me";
	  EXPECT_LE(EncodeUTF8Char(buf, 0x00110000), kMaxEncodedUTF8Size);
	  char buf2[32] = "Negative is invalid but sane";
	  EXPECT_LE(EncodeUTF8Char(buf2, -1), kMaxEncodedUTF8Size);
	}
	#if defined(__clang__)
	#pragma clang diagnostic pop
	#endif
	#endif // !defined(__cpp_char8_t)

	// One parameterized scenario for a single WideToUtf8() call.
	struct WideToUtf8TestCase {
	  // Name returned by the name generator when the suite is instantiated.
	  std::string description;
	  // Wide character handed to WideToUtf8().
	  wchar_t input;
	  // Bytes the output buffer is expected to start with after the call.
	  std::string expected_utf8_str;
	  // Value WideToUtf8() is expected to return.
	  size_t expected_bytes_written;
	  // Shift state passed into the call.
	  ShiftState initial_state{false, 0};
	  // Shift state expected once the call has returned.
	  ShiftState expected_state{false, 0};
	};

	std::vector<WideToUtf8TestCase> GetWideToUtf8TestCases() {
	constexpr size_t kError = static_cast<size_t>(-1);
	std::vector<WideToUtf8TestCase> cases = {
	{"ASCII_A", L'A', "A", 1},
	{"NullChar", L'\0', std::string("\0", 1), 1},
	{"ASCII_Max_7F", L'\x7F', "\x7F", 1},

	{"TwoByte_Min_80", L'\u0080', "\xC2\x80", 2},
	{"PoundSign_A3", L'\u00A3', "\xC2\xA3", 2},
	{"TwoByte_Max_7FF", L'\u07FF', "\xDF\xBF", 2},

	{"ThreeByte_Min_800", L'\u0800', "\xE0\xA0\x80", 3},
	{"EuroSign_20AC", L'\u20AC', "\xE2\x82\xAC", 3},
	{"BMP_MaxBeforeSurrogates_D7FF", L'\uD7FF', "\xED\x9F\xBF", 3},
	{"BMP_FFFF", L'\uFFFF', "\xEF\xBF\xBF", 3},

	{"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {}, {true, 0}},
	{"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {}, {true, 3}},

	{"HighSurr_D800_after_HighD800",
	L'\xD800',
	"\xF0\x90",
	2,
	{true, 0},
	{true, 0}},
	{"HighSurr_DBFF_after_HighDBFF",
	L'\xDBFF',
	"\xF4\x8F",
	2,
	{true, 3},
	{true, 3}},

	{"LowSurr_DC00_after_HighD800", L'\xDC00', "\x80\x80", 2, {true, 0}, {}},
	{"LowSurr_DFFD_after_HighDBFF", L'\xDFFD', "\xBF\xBD", 2, {true, 3}, {}},
	{"LowSurr_DC00_with_InitialState_saw_high_bits_1",
	L'\xDC00',
	"\x90\x80",
	2,
	{true, 1},
	{}},

	// Final state = initial on error.
	{"Error_IsolatedLowSurr_DC00_NoPriorHigh", L'\xDC00', "", kError, {}, {}},
	{"Error_IsolatedLowSurr_DFFF_NoPriorHigh", L'\xDFFF', "", kError, {}, {}},

	#if (defined(WCHAR_MAX) && WCHAR_MAX > 0xFFFF)
	{"DirectSupplementaryChars_U10000", static_cast<wchar_t>(0x10000),
	"\xF0\x90\x80\x80", 4},
	{"DirectSupplementaryChars_U10FFFD", static_cast<wchar_t>(0x10FFFD),
	"\xF4\x8F\xBF\xBD", 4},
	#endif
	};

	wchar_t minus_one = static_cast<wchar_t>(-1);
	if constexpr (sizeof(wchar_t) == 2) {
	cases.push_back({"WChar_MinusOne_as_FFFF", minus_one, "\xEF\xBF\xBF", 3});
	} else {
	cases.push_back(
	{"Error_WChar_MinusOne_as_FFFFFFFF", minus_one, "", kError, {}, {}});
	}

	if constexpr (sizeof(wchar_t) >= 4) {
	#ifdef WCHAR_MAX
	if (static_cast<uintmax_t>(WCHAR_MAX) >= 0x110000UL) {
	cases.push_back({"Error_OutOfRange_110000",
	static_cast<wchar_t>(0x110000UL),
	"",
	kError,
	{},
	{}});
	}
	#else
	cases.push_back({"Error_OutOfRange_110000_fallback",
	static_cast<wchar_t>(0x110000UL),
	"",
	kError,
	{},
	{}});
	#endif
	}
	return cases;
	}

	class WideToUtf8ParamTest : public TestWithParam<WideToUtf8TestCase> {};

	// Converts the parameter's single wide character and checks the return value,
	// the bytes written, and the resulting shift state.
	TEST_P(WideToUtf8ParamTest, SingleCharConversion) {
	  const WideToUtf8TestCase& param = GetParam();
	  const std::string& expected_prefix = param.expected_utf8_str;

	  // The output starts out filled with a sentinel byte so that an untouched
	  // position can be recognized afterwards.
	  constexpr char kSentinel = '\xAB';
	  std::string output(32, kSentinel);
	  ShiftState shift_state = param.initial_state;

	  const size_t result = WideToUtf8(param.input, output.data(), shift_state);
	  EXPECT_EQ(result, param.expected_bytes_written);

	  // The output has to begin with the expected bytes...
	  EXPECT_THAT(output, StartsWith(expected_prefix));

	  // ...and the byte directly after them must still be the sentinel.
	  ASSERT_LT(expected_prefix.length(), output.size());
	  EXPECT_EQ(output[expected_prefix.length()], kSentinel);

	  // Both fields of the shift state must match the expectation.
	  EXPECT_EQ(shift_state.saw_high_surrogate,
	            param.expected_state.saw_high_surrogate);
	  EXPECT_EQ(shift_state.bits, param.expected_state.bits);
	}

	// Instantiates the suite over every generated case; each instance is named
	// after the case's `description`.
	INSTANTIATE_TEST_SUITE_P(
	    WideCharToUtf8Conversion, WideToUtf8ParamTest,
	    ValuesIn(GetWideToUtf8TestCases()),
	    [](const TestParamInfo<WideToUtf8TestCase>& param_info) {
	      return param_info.param.description;
	    });

	// Comprehensive test string for validating wchar_t to UTF-8 conversion.
	// This string is designed to cover a variety of Unicode character types and
	// sequences:
	// 1. Basic ASCII characters (within names, numbers, and spacing).
	// 2. Common 2-byte UTF-8 sequences:
	// - Accented Latin characters (e.g., 'á' in "Holá").
	// - Hebrew text with combining vowel points (e.g., "שָׁלוֹם").
	// 3. Common 3-byte UTF-8 sequences:
	// - Currency symbols (e.g., '€').
	// - CJK characters (e.g., "你好", "中").
	// - Components of complex emojis like the Zero Width Joiner (ZWJ) and
	// Heart symbol.
	// 4. Various 4-byte UTF-8 sequences (representing Supplementary Plane
	// characters):
	// - An emoji with a skin tone modifier ("👍🏻").
	// - A flag emoji composed of regional indicators ("🇺🇸").
	// - A complex ZWJ emoji sequence ("👩‍❤️‍💋‍👨") combining
	// SP characters (👩, 💋, 👨) with BMP characters (ZWJ and ❤️).
	// - These are critical for testing the correct handling of surrogate pairs
	// when wchar_t is 2 bytes (e.g., on Windows).
	// The goal is to ensure accurate conversion across a diverse set of
	// characters.
	//
	// clang-format off
	#define WIDE_STRING_LITERAL L"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩‍❤️‍💋‍👨 中"
	#define UTF8_STRING_LITERAL u8"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩‍❤️‍💋‍👨 中"
	// clang-format on

	// Returns UTF8_STRING_LITERAL as a view over plain `char` storage.
	absl::string_view GetUtf8TestString() {
	  // The `u8` prefix forces UTF-8 encoding; without it MSVC will default to
	  // e.g. CP1252 (and warn). The element type of a `u8` literal is `char`
	  // before C++20 and `char8_t` from C++20 on, so the type is deduced from a
	  // literal, an array of that type is initialized with the UTF-8 data, and the
	  // bytes are then copied with `memcpy()` into a `char` array.
	  static const absl::string_view kUtf8View = []() -> absl::string_view {
	    using Utf8CodeUnit = std::remove_reference_t<decltype(*u8"a")>;
	    constexpr Utf8CodeUnit kEncoded[] = UTF8_STRING_LITERAL;
	    static char bytes[sizeof(kEncoded)];
	    std::memcpy(bytes, kEncoded, sizeof(kEncoded));
	    return absl::string_view(bytes);
	  }();

	  return kUtf8View;
	}

	// Converts WIDE_STRING_LITERAL one wchar_t at a time (terminator included) and
	// checks that the produced bytes start with the UTF-8 form of the same text.
	TEST(WideToUtf8, FullString) {
	  // Room for kMaxEncodedUTF8Size bytes per wchar_t. `sizeof` counts the bytes
	  // of the wide literal rather than its elements, so this over-allocates.
	  std::string buffer(kMaxEncodedUTF8Size * sizeof(WIDE_STRING_LITERAL), '\0');
	  char* buffer_ptr = buffer.data();

	  // Same error value the single-character test cases expect from WideToUtf8().
	  constexpr size_t kError = static_cast<size_t>(-1);

	  ShiftState state;
	  for (const wchar_t wc : WIDE_STRING_LITERAL) {
	    const size_t bytes_written = WideToUtf8(wc, buffer_ptr, state);
	    // Stop before the error value is added to the pointer: that addition would
	    // move `buffer_ptr` to an invalid position instead of failing visibly.
	    ASSERT_NE(bytes_written, kError)
	        << "WideToUtf8 reported an error for wide code unit "
	        << static_cast<unsigned long>(wc);
	    buffer_ptr += bytes_written;
	  }

	  EXPECT_THAT(buffer, StartsWith(GetUtf8TestString()));
	}

	#undef WIDE_STRING_LITERAL
	#undef UTF8_STRING_LITERAL

	} // namespace