Fix UTF16LE support in TextStringToUCS4
Make the test a bit more complex by adding a checkbox character (☑, U+2611) to the test string
Also copy the test to the qt6 folder
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index ee0314f..9097b31 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -119,7 +119,7 @@
if (isUnicode)
utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
else // UnicodeLE
- utf16[i] = (s[2 + i * 2] & 0xff) | (s[3 + i * 2] & 0xff) >> 8;
+ utf16[i] = (s[3 + i * 2] & 0xff) << 8 | (s[2 + i * 2] & 0xff);
}
len = UTF16toUCS4(utf16, len, &u);
delete[] utf16;
diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp
index 1f04c2a..b153ae5 100644
--- a/qt5/tests/check_utf_conversion.cpp
+++ b/qt5/tests/check_utf_conversion.cpp
@@ -43,7 +43,17 @@
return false;
}
- return *a == (Unicode)*b;
+ return true;
+}
+
+static bool compare(const Unicode *a, const uint16_t *b, int len)
+{
+ for (int i = 0; i < len; i++) {
+ if (a[i] != b[i])
+ return false;
+ }
+
+ return true;
}
void TestUTFConversion::testUTF_data()
@@ -147,32 +157,34 @@
void TestUTFConversion::testUnicodeLittleEndian()
{
- uint16_t UTF16LE_hi[4] { 0xFFFE, 0x4800, 0x4900, 0x2100 }; // UTF16-LE "HI!"
- GooString GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), 4 * 2);
+ uint16_t UTF16LE_hi[5] { 0xFFFE, 0x4800, 0x4900, 0x2100, 0x1126 }; // UTF16-LE "HI!☑"
+ GooString GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), sizeof(UTF16LE_hi));
- uint16_t UTF16BE_hi[4] { 0xFEFF, 0x0048, 0x0049, 0x0021 }; // UTF16-BE "HI!"
- GooString GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), 4 * 2);
+ uint16_t UTF16BE_hi[5] { 0xFEFF, 0x0048, 0x0049, 0x0021, 0x2611 }; // UTF16-BE "HI!☑"
+ GooString GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), sizeof(UTF16BE_hi));
// Let's assert both GooString's are different
- Q_ASSERT(GooUTF16LE.cmp(&GooUTF16BE) != 0);
+ QVERIFY(GooUTF16LE.cmp(&GooUTF16BE));
Unicode *UCS4fromLE, *UCS4fromBE;
const int len1 = TextStringToUCS4(&GooUTF16LE, &UCS4fromLE);
const int len2 = TextStringToUCS4(&GooUTF16BE, &UCS4fromBE);
- // 3 as TextStringToUCS4() removes the two leading Byte Order Mark (BOM) code points
- Q_ASSERT(len1 == len2);
- Q_ASSERT(len1 == 3);
+ // len is 4 because TextStringToUCS4() removes the leading Byte Order Mark (BOM), i.e. the first two bytes
+ QCOMPARE(len1, len2);
+ QCOMPARE(len1, 4);
// Check that now after conversion, UCS4fromLE and UCS4fromBE are now the same
for (int i = 0; i < len1; i++) {
- Q_ASSERT(UCS4fromLE[i] == UCS4fromBE[i]);
+ QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
}
+ const QString expected = QStringLiteral("HI!☑");
+
// Do some final verifications, checking the strings to be "HI!"
QVERIFY(*UCS4fromLE == *UCS4fromBE);
- QVERIFY(compare(UCS4fromLE, "HI!", 3));
- QVERIFY(compare(UCS4fromBE, "HI!", 3));
+ QVERIFY(compare(UCS4fromLE, expected.utf16(), len1));
+ QVERIFY(compare(UCS4fromBE, expected.utf16(), len1));
}
QTEST_GUILESS_MAIN(TestUTFConversion)
diff --git a/qt6/tests/check_utf_conversion.cpp b/qt6/tests/check_utf_conversion.cpp
index f28829f..f2a6609 100644
--- a/qt6/tests/check_utf_conversion.cpp
+++ b/qt6/tests/check_utf_conversion.cpp
@@ -18,6 +18,7 @@
void testUTF_data();
void testUTF();
void testUnicodeToAscii7();
+ void testUnicodeLittleEndian();
};
static bool compare(const char *a, const char *b)
@@ -41,9 +42,18 @@
return false;
}
- return *a == (Unicode)*b;
+ return true;
}
+static bool compare(const Unicode *a, const uint16_t *b, int len)
+{
+ for (int i = 0; i < len; i++) {
+ if (a[i] != b[i])
+ return false;
+ }
+
+ return true;
+}
void TestUTFConversion::testUTF_data()
{
QTest::addColumn<QString>("s");
@@ -143,5 +153,37 @@
free(out_ascii_idx);
}
+void TestUTFConversion::testUnicodeLittleEndian()
+{
+ uint16_t UTF16LE_hi[5] { 0xFFFE, 0x4800, 0x4900, 0x2100, 0x1126 }; // UTF16-LE "HI!☑"
+ GooString GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), sizeof(UTF16LE_hi));
+
+ uint16_t UTF16BE_hi[5] { 0xFEFF, 0x0048, 0x0049, 0x0021, 0x2611 }; // UTF16-BE "HI!☑"
+ GooString GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), sizeof(UTF16BE_hi));
+
+ // Let's assert both GooString's are different
+ QVERIFY(GooUTF16LE.cmp(&GooUTF16BE));
+
+ Unicode *UCS4fromLE, *UCS4fromBE;
+ const int len1 = TextStringToUCS4(&GooUTF16LE, &UCS4fromLE);
+ const int len2 = TextStringToUCS4(&GooUTF16BE, &UCS4fromBE);
+
+ // len is 4 because TextStringToUCS4() removes the leading Byte Order Mark (BOM), i.e. the first two bytes
+ QCOMPARE(len1, len2);
+ QCOMPARE(len1, 4);
+
+ // Check that now after conversion, UCS4fromLE and UCS4fromBE are now the same
+ for (int i = 0; i < len1; i++) {
+ QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
+ }
+
+ const QString expected = QStringLiteral("HI!☑");
+
+ // Do some final verifications, checking the strings to be "HI!☑"
+ QVERIFY(*UCS4fromLE == *UCS4fromBE);
+ QVERIFY(compare(UCS4fromLE, expected.utf16(), len1));
+ QVERIFY(compare(UCS4fromBE, expected.utf16(), len1));
+}
+
QTEST_GUILESS_MAIN(TestUTFConversion)
#include "check_utf_conversion.moc"