blob: a2653f82186ef6401f88d711b841061bd3d2735b [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 2002-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/UTF16Util.java,v $
* $Date: 2003/06/03 18:49:28 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.icu.dev.test;
/**
* Utility class for supplementary code point
* support. This one was written purely for updating
* the Normalization sample from the unicode.org site.
* If you want the real thing, use the UTF16 class
* from ICU4J.
* @author Vladimir Weinstein, Markus Scherer
*/
public class UTF16Util {

    // ------------------------------------------------------------------
    // Constants. They are declared first so that the derived constants
    // below can refer to them without illegal forward references.
    // ------------------------------------------------------------------

    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low 10 bits of a code point, i.e. the bits that
     * are carried by the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
        SUPPLEMENTARY_MIN_VALUE
        - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
        - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Value added to <code>codePoint &gt;&gt; 10</code> to obtain the lead
     * surrogate of a supplementary code point (0xD7C0). It folds the
     * subtraction of 0x10000 into the lead surrogate base.
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
        LEAD_SURROGATE_MIN_VALUE
        - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Amount to subtract from <code>(lead &lt;&lt; 10) + trail</code> to get
     * the supplementary code point. This is the negation of
     * SURROGATE_OFFSET_ and is kept (package-visible) for existing users.
     */
    static final int suppOffset = -SURROGATE_OFFSET_;

    // ------------------------------------------------------------------
    // Code point access on String / StringBuffer
    // ------------------------------------------------------------------

    /**
     * Method nextCodePoint. Returns the next code point
     * in a string.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int codepoint at index i; an unpaired surrogate is
     *         returned as is
     */
    public static final int nextCodePoint(String s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point preceding
     * index i (predecrement).
     * @param s String in question
     * @param i index in string
     * @return int codepoint at index --i; an unpaired surrogate is
     *         returned as is
     */
    public static final int prevCodePoint(String s, int i) {
        return codePointEndingAt(s, i);
    }

    /**
     * Method nextCodePoint. Returns the next code point
     * in a string.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int codepoint at index i; an unpaired surrogate is
     *         returned as is
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point preceding
     * index i (predecrement).
     * @param s StringBuffer in question
     * @param i index in string
     * @return int codepoint at index --i; an unpaired surrogate is
     *         returned as is
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return codePointEndingAt(s, i);
    }

    /**
     * Shared implementation of nextCodePoint for any character sequence.
     * Reads the code unit at index i and, if it is a lead surrogate
     * directly followed by a trail surrogate, combines the two.
     */
    private static int codePointStartingAt(CharSequence s, int i) {
        char first = s.charAt(i);
        if (isLeadSurrogate(first) && i + 1 < s.length()) {
            char second = s.charAt(i + 1);
            if (isTrailSurrogate(second)) {
                return getRawSupplementary(first, second);
            }
        }
        return first;
    }

    /**
     * Shared implementation of prevCodePoint for any character sequence.
     * Reads the code unit at index i - 1 and, if it is a trail surrogate
     * directly preceded by a lead surrogate, combines the two.
     */
    private static int codePointEndingAt(CharSequence s, int i) {
        char last = s.charAt(i - 1);
        if (isTrailSurrogate(last) && i - 2 >= 0) {
            char beforeLast = s.charAt(i - 2);
            if (isLeadSurrogate(beforeLast)) {
                return getRawSupplementary(beforeLast, last);
            }
        }
        return last;
    }

    /**
     * Method codePointLength. Returns the length
     * in UTF-16 code units of a given code point
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    // ------------------------------------------------------------------
    // Code point modification on StringBuffer
    // ------------------------------------------------------------------

    /**
     * Method appendCodePoint. Appends a code point
     * to a StringBuffer
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadSurrogateOf(ch));
            buffer.append(trailSurrogateOf(ch));
        }
    }

    /**
     * Method insertCodePoint. Inserts a code point in
     * a StringBuffer
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.insert(i, (char) ch);
        } else {
            // Lead first, so that an invalid index fails before anything
            // has been written to the buffer.
            buffer.insert(i, leadSurrogateOf(ch));
            buffer.insert(i + 1, trailSurrogateOf(ch));
        }
    }

    /**
     * Method setCodePointAt. Changes a code point at a
     * given index. Can change the length of the string.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length:
     *         -1 if a supplementary code point was replaced by a BMP one,
     *         1 if a BMP code point was replaced by a supplementary one,
     *         0 otherwise
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        boolean oldIsSupplementary =
            nextCodePoint(buffer, i) >= SUPPLEMENTARY_MIN_VALUE;

        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            // The replacement needs a single code unit.
            buffer.setCharAt(i, (char) ch);
            if (oldIsSupplementary) {
                buffer.deleteCharAt(i + 1); // drop the stale trail, buffer shrinks
                return -1;
            }
            return 0;
        }

        // The replacement needs a surrogate pair.
        buffer.setCharAt(i, leadSurrogateOf(ch));
        if (oldIsSupplementary) {
            buffer.setCharAt(i + 1, trailSurrogateOf(ch));
            return 0;
        }
        buffer.insert(i + 1, trailSurrogateOf(ch)); // make room, buffer grows
        return 1;
    }

    /**
     * Returns the lead surrogate for a supplementary code point.
     * No validity check is done on the argument.
     */
    private static char leadSurrogateOf(int supplementary) {
        return (char) (LEAD_SURROGATE_OFFSET_
                       + (supplementary >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Returns the trail surrogate for a supplementary code point.
     * No validity check is done on the argument.
     */
    private static char trailSurrogateOf(int supplementary) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE
                       + (supplementary & TRAIL_SURROGATE_MASK_));
    }

    // ------------------------------------------------------------------
    // Counting
    // ------------------------------------------------------------------

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Shared implementation of countCodePoint. A trail surrogate that
     * immediately follows a lead surrogate counts as zero; every other
     * code unit (including unpaired surrogates) counts as one.
     */
    private static int countCodePoints(CharSequence source) {
        int count = 0;
        boolean previousWasLead = false;
        for (int i = 0; i < source.length(); ++i) {
            char unit = source.charAt(i);
            if (previousWasLead && isTrailSurrogate(unit)) {
                previousWasLead = false; // second half of a pair, already counted
            } else {
                previousWasLead = isLeadSurrogate(unit);
                ++count;
            }
        }
        return count;
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>
     * <a href="../UCharacter.html#isLegal(char)">isLegal()</a></code> on
     * char32 before calling.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    // ------------------------------------------------------------------
    // Surrogate classification
    // ------------------------------------------------------------------

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    // ------------------------------------------------------------------
    // Random access on char arrays
    // ------------------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use
     * <code><a href="../UCharacter.html#isLegal(char)">UCharacter.isLegal()
     * </a></code> on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16. The boundaries of that codepoint are the same as in
     *         <code>bounds32()</code>.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit,
                             int offset16) {
        offset16 += start; // from here on offset16 is an absolute array index
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: look forward for its trail, staying below limit.
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(single, trail);
            }
        } else {
            // Trail surrogate: look backward for its lead, staying at or
            // above start.
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }
}