blob: 106a233752782e83e08cee091d109a0b49e6c054 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrep;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UCharacterIterator;
/**
* IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
* while extending that class to support IDNA2008/UTS #46 as well.
* @author Ram Viswanadha
*/
public final class IDNA2003 {
/* IDNA ACE Prefix is "xn--" */
private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
//private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length;
private static final int MAX_LABEL_LENGTH = 63;
private static final int HYPHEN = 0x002D;
private static final int CAPITAL_A = 0x0041;
private static final int CAPITAL_Z = 0x005A;
private static final int LOWER_CASE_DELTA = 0x0020;
private static final int FULL_STOP = 0x002E;
private static final int MAX_DOMAIN_NAME_LENGTH = 255;
// The NamePrep profile object
private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
private static boolean startsWithPrefix(StringBuffer src){
boolean startsWithPrefix = true;
if(src.length() < ACE_PREFIX.length){
return false;
}
for(int i=0; i<ACE_PREFIX.length;i++){
if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
startsWithPrefix = false;
}
}
return startsWithPrefix;
}
private static char toASCIILower(char ch){
if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
return (char)(ch + LOWER_CASE_DELTA);
}
return ch;
}
private static StringBuffer toASCIILower(CharSequence src){
StringBuffer dest = new StringBuffer();
for(int i=0; i<src.length();i++){
dest.append(toASCIILower(src.charAt(i)));
}
return dest;
}
private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
char c1,c2;
int rc;
for(int i =0;/* no condition */;i++) {
/* If we reach the ends of both strings then they match */
if(i == s1.length()) {
return 0;
}
c1 = s1.charAt(i);
c2 = s2.charAt(i);
/* Case-insensitive comparison */
if(c1!=c2) {
rc=toASCIILower(c1)-toASCIILower(c2);
if(rc!=0) {
return rc;
}
}
}
}
private static int getSeparatorIndex(char[] src,int start, int limit){
for(; start<limit;start++){
if(isLabelSeparator(src[start])){
return start;
}
}
// we have not found the separator just return length
return start;
}
/*
private static int getSeparatorIndex(UCharacterIterator iter){
int currentIndex = iter.getIndex();
int separatorIndex = 0;
int ch;
while((ch=iter.next())!= UCharacterIterator.DONE){
if(isLabelSeparator(ch)){
separatorIndex = iter.getIndex();
iter.setIndex(currentIndex);
return separatorIndex;
}
}
// reset index
iter.setIndex(currentIndex);
// we have not found the separator just return the length
}
*/
private static boolean isLDHChar(int ch){
// high runner case
if(ch>0x007A){
return false;
}
//[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
if( (ch==0x002D) ||
(0x0030 <= ch && ch <= 0x0039) ||
(0x0041 <= ch && ch <= 0x005A) ||
(0x0061 <= ch && ch <= 0x007A)
){
return true;
}
return false;
}
/**
* Ascertain if the given code point is a label separator as
* defined by the IDNA RFC
*
* @param ch The code point to be ascertained
* @return true if the char is a label separator
* @stable ICU 2.8
*/
private static boolean isLabelSeparator(int ch){
switch(ch){
case 0x002e:
case 0x3002:
case 0xFF0E:
case 0xFF61:
return true;
default:
return false;
}
}
public static StringBuffer convertToASCII(UCharacterIterator src, int options)
throws StringPrepParseException{
boolean[] caseFlags = null;
// the source contains all ascii codepoints
boolean srcIsASCII = true;
// assume the source contains all LDH codepoints
boolean srcIsLDH = true;
//get the options
boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
int ch;
// step 1
while((ch = src.next())!= UCharacterIterator.DONE){
if(ch> 0x7f){
srcIsASCII = false;
}
}
int failPos = -1;
src.setToStart();
StringBuffer processOut = null;
// step 2 is performed only if the source contains non ASCII
if(!srcIsASCII){
// step 2
processOut = namePrep.prepare(src, options);
}else{
processOut = new StringBuffer(src.getText());
}
int poLen = processOut.length();
if(poLen==0){
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
}
StringBuffer dest = new StringBuffer();
// reset the variable to verify if output of prepare is ASCII or not
srcIsASCII = true;
// step 3 & 4
for(int j=0;j<poLen;j++ ){
ch=processOut.charAt(j);
if(ch > 0x7F){
srcIsASCII = false;
}else if(isLDHChar(ch)==false){
// here we do not assemble surrogates
// since we know that LDH code points
// are in the ASCII range only
srcIsLDH = false;
failPos = j;
}
}
if(useSTD3ASCIIRules == true){
// verify 3a and 3b
if( srcIsLDH == false /* source contains some non-LDH characters */
|| processOut.charAt(0) == HYPHEN
|| processOut.charAt(processOut.length()-1) == HYPHEN){
/* populate the parseError struct */
if(srcIsLDH==false){
throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
StringPrepParseException.STD3_ASCII_RULES_ERROR,
processOut.toString(),
(failPos>0) ? (failPos-1) : failPos);
}else if(processOut.charAt(0) == HYPHEN){
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
}else{
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
StringPrepParseException.STD3_ASCII_RULES_ERROR,
processOut.toString(),
(poLen>0) ? poLen-1 : poLen);
}
}
}
if(srcIsASCII){
dest = processOut;
}else{
// step 5 : verify the sequence does not begin with ACE prefix
if(!startsWithPrefix(processOut)){
//step 6: encode the sequence with punycode
caseFlags = new boolean[poLen];
StringBuilder punyout = Punycode.encode(processOut,caseFlags);
// convert all codepoints to lower case ASCII
StringBuffer lowerOut = toASCIILower(punyout);
//Step 7: prepend the ACE prefix
dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
//Step 6: copy the contents in b2 into dest
dest.append(lowerOut);
}else{
throw new StringPrepParseException("The input does not start with the ACE Prefix.",
StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
}
}
if(dest.length() > MAX_LABEL_LENGTH){
throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
}
return dest;
}
public static StringBuffer convertIDNToASCII(String src,int options)
throws StringPrepParseException{
char[] srcArr = src.toCharArray();
StringBuffer result = new StringBuffer();
int sepIndex=0;
int oldSepIndex=0;
for(;;){
sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
//make sure this is not a root label separator.
if(!(label.length()==0 && sepIndex==srcArr.length)){
UCharacterIterator iter = UCharacterIterator.getInstance(label);
result.append(convertToASCII(iter,options));
}
if(sepIndex==srcArr.length){
break;
}
// increment the sepIndex to skip past the separator
sepIndex++;
oldSepIndex = sepIndex;
result.append((char)FULL_STOP);
}
if(result.length() > MAX_DOMAIN_NAME_LENGTH){
throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
}
return result;
}
public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
throws StringPrepParseException{
boolean[] caseFlags = null;
// the source contains all ascii codepoints
boolean srcIsASCII = true;
// assume the source contains all LDH codepoints
//boolean srcIsLDH = true;
//get the options
//boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
//int failPos = -1;
int ch;
int saveIndex = src.getIndex();
// step 1: find out if all the codepoints in src are ASCII
while((ch=src.next())!= UCharacterIterator.DONE){
if(ch>0x7F){
srcIsASCII = false;
}/*else if((srcIsLDH = isLDHChar(ch))==false){
failPos = src.getIndex();
}*/
}
StringBuffer processOut;
if(srcIsASCII == false){
try {
// step 2: process the string
src.setIndex(saveIndex);
processOut = namePrep.prepare(src,options);
} catch (StringPrepParseException ex) {
return new StringBuffer(src.getText());
}
}else{
//just point to source
processOut = new StringBuffer(src.getText());
}
// TODO:
// The RFC states that
// <quote>
// ToUnicode never fails. If any step fails, then the original input
// is returned immediately in that step.
// </quote>
//step 3: verify ACE Prefix
if(startsWithPrefix(processOut)){
StringBuffer decodeOut = null;
//step 4: Remove the ACE Prefix
String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
//step 5: Decode using punycode
try {
decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
} catch (StringPrepParseException e) {
decodeOut = null;
}
//step 6:Apply toASCII
if (decodeOut != null) {
StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
//step 7: verify
if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
// throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
// StringPrepParseException.VERIFICATION_ERROR);
decodeOut = null;
}
}
//step 8: return output of step 5
if (decodeOut != null) {
return decodeOut;
}
}
// }else{
// // verify that STD3 ASCII rules are satisfied
// if(useSTD3ASCIIRules == true){
// if( srcIsLDH == false /* source contains some non-LDH characters */
// || processOut.charAt(0) == HYPHEN
// || processOut.charAt(processOut.length()-1) == HYPHEN){
//
// if(srcIsLDH==false){
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
// StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
// (failPos>0) ? (failPos-1) : failPos);
// }else if(processOut.charAt(0) == HYPHEN){
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
// processOut.toString(),0);
//
// }else{
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
// processOut.toString(),
// processOut.length());
//
// }
// }
// }
// // just return the source
// return new StringBuffer(src.getText());
// }
return new StringBuffer(src.getText());
}
public static StringBuffer convertIDNToUnicode(String src, int options)
throws StringPrepParseException{
char[] srcArr = src.toCharArray();
StringBuffer result = new StringBuffer();
int sepIndex=0;
int oldSepIndex=0;
for(;;){
sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
if(label.length()==0 && sepIndex!=srcArr.length ){
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
}
UCharacterIterator iter = UCharacterIterator.getInstance(label);
result.append(convertToUnicode(iter,options));
if(sepIndex==srcArr.length){
break;
}
// Unlike the ToASCII operation we don't normalize the label separators
result.append(srcArr[sepIndex]);
// increment the sepIndex to skip past the separator
sepIndex++;
oldSepIndex =sepIndex;
}
if(result.length() > MAX_DOMAIN_NAME_LENGTH){
throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
}
return result;
}
public static int compare(String s1, String s2, int options) throws StringPrepParseException{
StringBuffer s1Out = convertIDNToASCII(s1, options);
StringBuffer s2Out = convertIDNToASCII(s2, options);
return compareCaseInsensitiveASCII(s1Out,s2Out);
}
}