blob: f4784f0d3190cebba8678e8367b0a13780d03a7a [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /usr/cvs/icu4j/icu4j/src/com/ibm/icu/impl/UForwardCharacterIterator.java,v $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.1 $
*
*****************************************************************************************
*/
package com.ibm.icu.text;
/**
* Interface that defines an API for forward-only iteration
* on text objects.
* This is a minimal interface for iteration without random access
* or backwards iteration. It is especially useful for wrapping
* streams with converters into an object for collation or
* normalization.
*
* <p>Characters can be accessed in two ways: as code units or as
* code points.
* Unicode code points are 21-bit integers and are the scalar values
* of Unicode characters. ICU uses the type <code>int</code> for them.
* Unicode code units are the storage units of a given
* Unicode/UCS Transformation Format (a character encoding scheme).
* With UTF-16, all code points can be represented with either one
* or two code units ("surrogates").
* String storage is typically based on code units, while properties
* of characters are typically determined using code point values.
* Some processes may be designed to work with sequences of code units,
* or it may be known that all characters that are important to an
* algorithm can be represented with single code units.
* Other processes will need to use the code point access functions.</p>
*
* <p>UForwardCharacterIterator provides next() to access
* a code unit and advance an internal position into the text object,
* similar to a <code>return text[position++]</code>.<br>
* It provides nextCodePoint() to access a code point and advance an internal
* position.</p>
*
* <p>nextCodePoint() assumes that the current position is that of
* the beginning of a code point, i.e., of its first code unit.
* After nextCodePoint(), this will be true again.
* In general, access to code units and code points in the same
* iteration loop should not be mixed. In UTF-16, if the current position
* is on a second code unit (Low Surrogate), then only that code unit
* is returned even by nextCodePoint().</p>
*
* <p>Usage:</p>
* <pre>
* public void function1(UForwardCharacterIterator it) {
*     int c;
*     while ((c = it.next()) != UForwardCharacterIterator.DONE) {
*         // use c
*     }
* }
* </pre>
* @draft ICU 2.4
*
*/
public interface UForwardCharacterIterator {

    /**
     * Sentinel value returned by {@link #next()} and
     * {@link #nextCodePoint()} once the end of the UTF-16 text
     * has been reached.
     * @draft ICU 2.4
     */
    int DONE = -1;

    /**
     * Reads the UTF-16 code unit at the current index and then moves
     * the index forward by one code unit (post-increment semantics).
     * When the index is out of range, <code>DONE</code> is returned
     * and the iterator is reset to the limit of the text.
     * @return the next UTF-16 code unit, or <code>DONE</code> if the
     *         index is at the limit of the text.
     * @draft ICU 2.4
     */
    int next();

    /**
     * Reads the code point at the current index and then moves the
     * index forward to the next code point (post-increment semantics).
     * When the index points to a valid surrogate pair, the iterator
     * is advanced past the whole pair and the code point that the
     * pair represents is returned. In every other case the behavior
     * is the same as <code>next()</code>.
     * @return the next code point in the text, or <code>DONE</code>
     *         if the index is at the limit of the text.
     * @draft ICU 2.4
     */
    int nextCodePoint();
}