// blob: b88d7bb88a40ece51a0178788b0eac19657d3331 [file] [log] [blame]
// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/strings/internal/utf8.h"
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/base/port.h"
#include "absl/strings/string_view.h"
namespace {
using ::absl::strings_internal::kMaxEncodedUTF8Size;
using ::absl::strings_internal::ShiftState;
using ::absl::strings_internal::WideToUtf8;
using ::testing::StartsWith;
using ::testing::TestParamInfo;
using ::testing::TestWithParam;
using ::testing::ValuesIn;
#if !defined(__cpp_char8_t)
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++2a-compat"
#endif
// Checks that EncodeUTF8Char() produces the same bytes as the compiler's own
// `u8` literals for a handful of code points, and that inputs outside that
// set never report more than kMaxEncodedUTF8Size bytes.
TEST(EncodeUTF8Char, BasicFunction) {
  using ::absl::strings_internal::EncodeUTF8Char;

  const std::pair<char32_t, std::string> expectations[] = {
      {0x0030, u8"\u0030"},
      {0x00A3, u8"\u00A3"},
      {0x00010000, u8"\U00010000"},
      {0x0000FFFF, u8"\U0000FFFF"},
      {0x0010FFFD, u8"\U0010FFFD"},
  };

  for (const auto& expectation : expectations) {
    const char32_t code_point = expectation.first;
    const std::string& encoded = expectation.second;

    // Encode into two buffers pre-filled with different bytes. A trailing
    // position that still holds its fill value in both buffers was not
    // written by the encoder.
    char zero_filled[7];
    char ones_filled[7];
    std::memset(zero_filled, '\x00', sizeof(zero_filled));
    std::memset(ones_filled, '\xFF', sizeof(ones_filled));

    const char* zero_end =
        zero_filled + EncodeUTF8Char(zero_filled, code_point);
    const char* ones_end =
        ones_filled + EncodeUTF8Char(ones_filled, code_point);

    // Walk back from the end of the buffers over the untouched tail.
    int apparent_length = 7;
    while (apparent_length > 0) {
      const bool untouched = zero_filled[apparent_length - 1] == '\x00' &&
                             ones_filled[apparent_length - 1] == '\xFF';
      if (!untouched) break;
      --apparent_length;
    }

    EXPECT_EQ(apparent_length, zero_end - zero_filled);
    EXPECT_EQ(apparent_length, ones_end - ones_filled);
    EXPECT_EQ(apparent_length, encoded.length());
    EXPECT_EQ(std::string(zero_filled, apparent_length), encoded);
    EXPECT_EQ(std::string(ones_filled, apparent_length), encoded);
  }

  // Values the table above does not cover only need to stay within the
  // maximum encoded size.
  char above_range[32] = "Don't Tread On Me";
  EXPECT_LE(EncodeUTF8Char(above_range, 0x00110000), kMaxEncodedUTF8Size);
  char negative[32] = "Negative is invalid but sane";
  EXPECT_LE(EncodeUTF8Char(negative, -1), kMaxEncodedUTF8Size);
}
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif // !defined(__cpp_char8_t)
// One row of the single-character WideToUtf8() conversion table. Rows are
// written with aggregate initialization, so the member order below is part of
// the table format.
struct WideToUtf8TestCase {
  // Label for the row; also used as the generated test name.
  std::string description;
  // Wide character handed to WideToUtf8().
  wchar_t input;
  // Bytes the output buffer is expected to begin with after the call.
  std::string expected_utf8_str;
  // Value WideToUtf8() is expected to return.
  size_t expected_bytes_written;
  // Shift state passed into the call.
  ShiftState initial_state{false, 0};
  // Shift state expected once the call returns.
  ShiftState expected_state{false, 0};
};
std::vector<WideToUtf8TestCase> GetWideToUtf8TestCases() {
constexpr size_t kError = static_cast<size_t>(-1);
std::vector<WideToUtf8TestCase> cases = {
{"ASCII_A", L'A', "A", 1},
{"NullChar", L'\0', std::string("\0", 1), 1},
{"ASCII_Max_7F", L'\x7F', "\x7F", 1},
{"TwoByte_Min_80", L'\u0080', "\xC2\x80", 2},
{"PoundSign_A3", L'\u00A3', "\xC2\xA3", 2},
{"TwoByte_Max_7FF", L'\u07FF', "\xDF\xBF", 2},
{"ThreeByte_Min_800", L'\u0800', "\xE0\xA0\x80", 3},
{"EuroSign_20AC", L'\u20AC', "\xE2\x82\xAC", 3},
{"BMP_MaxBeforeSurrogates_D7FF", L'\uD7FF', "\xED\x9F\xBF", 3},
{"BMP_FFFF", L'\uFFFF', "\xEF\xBF\xBF", 3},
{"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {}, {true, 0}},
{"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {}, {true, 3}},
{"HighSurr_D800_after_HighD800",
L'\xD800',
"\xF0\x90",
2,
{true, 0},
{true, 0}},
{"HighSurr_DBFF_after_HighDBFF",
L'\xDBFF',
"\xF4\x8F",
2,
{true, 3},
{true, 3}},
{"LowSurr_DC00_after_HighD800", L'\xDC00', "\x80\x80", 2, {true, 0}, {}},
{"LowSurr_DFFD_after_HighDBFF", L'\xDFFD', "\xBF\xBD", 2, {true, 3}, {}},
{"LowSurr_DC00_with_InitialState_saw_high_bits_1",
L'\xDC00',
"\x90\x80",
2,
{true, 1},
{}},
// Final state = initial on error.
{"Error_IsolatedLowSurr_DC00_NoPriorHigh", L'\xDC00', "", kError, {}, {}},
{"Error_IsolatedLowSurr_DFFF_NoPriorHigh", L'\xDFFF', "", kError, {}, {}},
#if (defined(WCHAR_MAX) && WCHAR_MAX > 0xFFFF)
{"DirectSupplementaryChars_U10000", static_cast<wchar_t>(0x10000),
"\xF0\x90\x80\x80", 4},
{"DirectSupplementaryChars_U10FFFD", static_cast<wchar_t>(0x10FFFD),
"\xF4\x8F\xBF\xBD", 4},
#endif
};
wchar_t minus_one = static_cast<wchar_t>(-1);
if constexpr (sizeof(wchar_t) == 2) {
cases.push_back({"WChar_MinusOne_as_FFFF", minus_one, "\xEF\xBF\xBF", 3});
} else {
cases.push_back(
{"Error_WChar_MinusOne_as_FFFFFFFF", minus_one, "", kError, {}, {}});
}
if constexpr (sizeof(wchar_t) >= 4) {
#ifdef WCHAR_MAX
if (static_cast<uintmax_t>(WCHAR_MAX) >= 0x110000UL) {
cases.push_back({"Error_OutOfRange_110000",
static_cast<wchar_t>(0x110000UL),
"",
kError,
{},
{}});
}
#else
cases.push_back({"Error_OutOfRange_110000_fallback",
static_cast<wchar_t>(0x110000UL),
"",
kError,
{},
{}});
#endif
}
return cases;
}
// Value-parameterized fixture; each instantiated test receives one
// WideToUtf8TestCase via GetParam(). The fixture itself holds no state.
class WideToUtf8ParamTest : public TestWithParam<WideToUtf8TestCase> {};
// Converts the single wide character of the current table row and checks the
// returned byte count, the bytes written, that nothing past them was touched,
// and the resulting shift state.
TEST_P(WideToUtf8ParamTest, SingleCharConversion) {
  constexpr char kSentinel = '\xAB';

  const WideToUtf8TestCase& row = GetParam();
  const std::string& expected_bytes = row.expected_utf8_str;
  const ShiftState& expected_state = row.expected_state;

  // The output buffer starts out full of sentinel bytes so that untouched
  // positions can be recognized afterwards.
  std::string output(32, kSentinel);
  ShiftState shift_state = row.initial_state;
  const size_t result = WideToUtf8(row.input, output.data(), shift_state);

  EXPECT_EQ(result, row.expected_bytes_written);
  EXPECT_THAT(output, StartsWith(expected_bytes));

  // The byte right after the expected output must still be the sentinel.
  ASSERT_LT(expected_bytes.length(), output.size());
  EXPECT_EQ(output[expected_bytes.length()], kSentinel);

  EXPECT_EQ(shift_state.saw_high_surrogate, expected_state.saw_high_surrogate);
  EXPECT_EQ(shift_state.bits, expected_state.bits);
}
// Runs the parameterized suite once per table row; each generated test is
// named after the row's `description`.
INSTANTIATE_TEST_SUITE_P(
    WideCharToUtf8Conversion, WideToUtf8ParamTest,
    ValuesIn(GetWideToUtf8TestCases()),
    [](const TestParamInfo<WideToUtf8TestCase>& param_info) {
      return param_info.param.description;
    });
// Comprehensive test string for validating wchar_t to UTF-8 conversion.
// This string is designed to cover a variety of Unicode character types and
// sequences:
// 1. Basic ASCII characters (within names, numbers, and spacing).
// 2. Common 2-byte UTF-8 sequences:
// - Accented Latin characters (e.g., 'á' in "Holá").
// - Hebrew text with combining vowel points (e.g., "שָׁלוֹם").
// 3. Common 3-byte UTF-8 sequences:
// - Currency symbols (e.g., '€').
// - CJK characters (e.g., "你好", "中").
// - Components of complex emojis like the Zero Width Joiner (ZWJ) and
// Heart symbol.
// 4. Various 4-byte UTF-8 sequences (representing Supplementary Plane
// characters):
// - An emoji with a skin tone modifier ("👍🏻").
// - A flag emoji composed of regional indicators ("🇺🇸").
// - A complex ZWJ emoji sequence ("👩‍❤️‍💋‍👨") combining
// SP characters (👩, 💋, 👨) with BMP characters (ZWJ and ❤️).
// - These are critical for testing the correct handling of surrogate pairs
// when wchar_t is 2 bytes (e.g., on Windows).
// The goal is to ensure accurate conversion across a diverse set of
// characters.
//
// clang-format off
// The same text as a wide literal (the conversion input) and as a UTF-8
// literal (the expected output). The two definitions must stay in sync; both
// macros are #undef'd at the end of this file.
#define WIDE_STRING_LITERAL L"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩‍❤️‍💋‍👨 中"
#define UTF8_STRING_LITERAL u8"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩‍❤️‍💋‍👨 中"
// clang-format on
// Returns the UTF-8 form of the test string as a view over plain `char`
// storage.
absl::string_view GetUtf8TestString() {
  // The `u8` prefix guarantees the literal is UTF-8 encoded; without it MSVC
  // would fall back to a legacy code page such as CP1252 (and warn). The
  // element type of a `u8` literal is `char` before C++20 and `char8_t` from
  // C++20 on, so the element type is deduced from a `u8` literal instead of
  // being spelled out, and the bytes are then copied into a `char` array.
  static absl::string_view utf8_view = []() -> absl::string_view {
    using Utf8CodeUnit = std::remove_reference_t<decltype(*u8"a")>;
    constexpr Utf8CodeUnit kEncoded[] = UTF8_STRING_LITERAL;
    // Static storage keeps the bytes alive for every view handed out later.
    static char storage[sizeof(kEncoded)];
    std::memcpy(storage, kEncoded, sizeof(kEncoded));
    return storage;
  }();
  return utf8_view;
}
// Converts the whole wide test string one wchar_t at a time, carrying the
// shift state from call to call, and checks that the bytes produced start
// with the UTF-8 form of the same text.
TEST(WideToUtf8, FullString) {
  // Return value the single-character tests in this file expect from
  // WideToUtf8() for rejected input.
  constexpr size_t kError = static_cast<size_t>(-1);

  // kMaxEncodedUTF8Size bytes for every byte of the wide literal (terminator
  // included), i.e. more than kMaxEncodedUTF8Size bytes per wide character.
  std::string buffer(kMaxEncodedUTF8Size * sizeof(WIDE_STRING_LITERAL), '\0');
  char* buffer_ptr = buffer.data();
  ShiftState state;
  // Iterating the literal as an array also visits its terminating L'\0'.
  for (const wchar_t wc : WIDE_STRING_LITERAL) {
    const size_t bytes_written = WideToUtf8(wc, buffer_ptr, state);
    // Stop on an error return: adding it to the write pointer would move the
    // pointer outside the buffer and corrupt every following write.
    ASSERT_NE(bytes_written, kError)
        << "WideToUtf8 reported an error for wide character value "
        << static_cast<uint32_t>(wc);
    buffer_ptr += bytes_written;
  }
  EXPECT_THAT(buffer, StartsWith(GetUtf8TestString()));
}
#undef WIDE_STRING_LITERAL
#undef UTF8_STRING_LITERAL
} // namespace