// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
 *******************************************************************************
 * Copyright (C) 2002-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
| package com.ibm.icu.dev.test; |
| |
/**
 * Utility class for supplementary code point
 * support. This one is written purely for updating
 * Normalization sample from the unicode.org site.
 * If you want the real thing, use UTF16 class
 * from ICU4J
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {

    // ------------------------------------------------------------------
    // Constants. Declared first so that every derived constant below only
    // refers to constants that have already been declared.
    // ------------------------------------------------------------------

    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low 10 bits of a code point, i.e. the part that is
     * stored in the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value added to <code>codePoint &gt;&gt; 10</code> to obtain the lead
     * surrogate of a supplementary code point (0xD7C0).
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
            SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Offset to subtract from <code>(lead &lt;&lt; 10) + trail</code> to get
     * the supplementary code point. It is the negation of
     * {@link #SURROGATE_OFFSET_} and equals
     * <code>(0xd800 &lt;&lt; 10) + 0xdc00 - 0x10000</code>.
     */
    static final int suppOffset = -SURROGATE_OFFSET_;

    // ------------------------------------------------------------------
    // Reading code points
    // ------------------------------------------------------------------

    /**
     * Method nextCodePoint. Returns the next code point
     * in a string.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int codepoint at index i; an unpaired surrogate is returned
     *         as is
     */
    public static final int nextCodePoint(String s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point preceding
     * index i (predecrement).
     * @param s String in question
     * @param i index in string
     * @return int codepoint at index --i; an unpaired surrogate is returned
     *         as is
     */
    public static final int prevCodePoint(String s, int i) {
        return codePointEndingBefore(s, i);
    }

    /**
     * Method nextCodePoint. Returns the next code point
     * in a string.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int codepoint at index i; an unpaired surrogate is returned
     *         as is
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point preceding
     * index i (predecrement).
     * @param s StringBuffer in question
     * @param i index in string
     * @return int codepoint at index --i; an unpaired surrogate is returned
     *         as is
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return codePointEndingBefore(s, i);
    }

    /**
     * Shared implementation of both nextCodePoint overloads.
     * @param s text in question
     * @param i index of the first code unit of the code point
     * @return the supplementary code point if a lead surrogate at i is
     *         followed by a trail surrogate, otherwise the code unit at i
     */
    private static int codePointStartingAt(CharSequence s, int i) {
        char first = s.charAt(i);
        // Only look ahead when there is a following code unit to pair with.
        if (isLeadSurrogate(first) && ++i < s.length()) {
            char second = s.charAt(i);
            if (isTrailSurrogate(second)) {
                return getRawSupplementary(first, second);
            }
        }
        return first;
    }

    /**
     * Shared implementation of both prevCodePoint overloads.
     * @param s text in question
     * @param i index just past the last code unit of the code point
     * @return the supplementary code point if a trail surrogate at i-1 is
     *         preceded by a lead surrogate, otherwise the code unit at i-1
     */
    private static int codePointEndingBefore(CharSequence s, int i) {
        char last = s.charAt(--i);
        // Only look back when there is a preceding code unit to pair with.
        if (isTrailSurrogate(last) && --i >= 0) {
            char before = s.charAt(i);
            if (isLeadSurrogate(before)) {
                return getRawSupplementary(before, last);
            }
        }
        return last;
    }

    /**
     * Method codePointLength. Returns the length
     * in UTF-16 code units of a given code point
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    // ------------------------------------------------------------------
    // Writing code points
    // ------------------------------------------------------------------

    /**
     * Computes the lead surrogate for a supplementary code point.
     * No range check is done on the argument.
     */
    private static char leadSurrogateOf(int c) {
        return (char) (LEAD_SURROGATE_OFFSET_ + (c >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the trail surrogate for a supplementary code point.
     * No range check is done on the argument.
     */
    private static char trailSurrogateOf(int c) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (c & TRAIL_SURROGATE_MASK_));
    }

    /**
     * Method appendCodePoint. Appends a code point
     * to a StringBuffer
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadSurrogateOf(ch));
            buffer.append(trailSurrogateOf(ch));
        }
    }

    /**
     * Method insertCodePoint. Inserts a code point in
     * a StringBuffer
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.insert(i, (char) ch);
        } else {
            buffer.insert(i, leadSurrogateOf(ch));
            buffer.insert(i + 1, trailSurrogateOf(ch));
        }
    }

    /**
     * Method setCodePointAt. Changes a code point at a
     * given index. Can change the length of the string.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length:
     *         0 if old and new code point use the same number of code
     *         units, -1 if a surrogate pair was replaced by a single code
     *         unit, 1 if a single code unit was replaced by a surrogate pair
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        // Whether the code point currently at i occupies two code units.
        boolean replacingPair = nextCodePoint(buffer, i) >= SUPPLEMENTARY_MIN_VALUE;

        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.setCharAt(i, (char) ch);
            if (replacingPair) {
                // BMP instead of supplementary: drop the old trail, buffer shrinks.
                buffer.deleteCharAt(i + 1);
                return -1;
            }
            return 0;
        }

        buffer.setCharAt(i, leadSurrogateOf(ch));
        if (replacingPair) {
            buffer.setCharAt(i + 1, trailSurrogateOf(ch));
            return 0;
        }
        // Supplementary instead of BMP: make room for the trail, buffer grows.
        buffer.insert(i + 1, trailSurrogateOf(ch));
        return 1;
    }

    // ------------------------------------------------------------------
    // Counting
    // ------------------------------------------------------------------

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Shared implementation of both countCodePoint overloads. A trail
     * surrogate that directly follows a lead surrogate counts as zero;
     * every other code unit (including unpaired surrogates) counts as one.
     */
    private static int countCodePoints(CharSequence source) {
        int count = 0;
        boolean hadLeadSurrogate = false;

        for (int i = 0; i < source.length(); ++i) {
            char unit = source.charAt(i);
            if (hadLeadSurrogate && isTrailSurrogate(unit)) {
                hadLeadSurrogate = false; // valid trail completes the pair
            } else {
                hadLeadSurrogate = isLeadSurrogate(unit);
                ++count;
            }
        }
        return count;
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is performed on char32.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    // ------------------------------------------------------------------
    // Surrogate classification
    // ------------------------------------------------------------------

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    // ------------------------------------------------------------------
    // Random access on char arrays
    // ------------------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * {@link #getCharCount(int)}), as well as random access.
     * No validity check is performed on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned.
     * Only code units inside [start, limit) are considered when looking
     * for the other half of a pair.
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16.
     * @exception ArrayIndexOutOfBoundsException thrown if offset16 is not
     *            within the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit,
                             int offset16) {
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: the trail, if any, is the next code unit.
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(single, trail);
            }
        } else {
            // Trail surrogate: the lead, if any, is the previous code unit.
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

}