| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2003-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.impl; |
| |
| import com.ibm.icu.text.IDNA; |
| import com.ibm.icu.text.StringPrep; |
| import com.ibm.icu.text.StringPrepParseException; |
| import com.ibm.icu.text.UCharacterIterator; |
| |
| /** |
| * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java |
| * while extending that class to support IDNA2008/UTS #46 as well. |
| * @author Ram Viswanadha |
| */ |
| public final class IDNA2003 { |
| /* IDNA ACE Prefix is "xn--" */ |
| private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ; |
| //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length; |
| |
| private static final int MAX_LABEL_LENGTH = 63; |
| private static final int HYPHEN = 0x002D; |
| private static final int CAPITAL_A = 0x0041; |
| private static final int CAPITAL_Z = 0x005A; |
| private static final int LOWER_CASE_DELTA = 0x0020; |
| private static final int FULL_STOP = 0x002E; |
| private static final int MAX_DOMAIN_NAME_LENGTH = 255; |
| |
| // The NamePrep profile object |
| private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP); |
| |
| private static boolean startsWithPrefix(StringBuffer src){ |
| if(src.length() < ACE_PREFIX.length){ |
| return false; |
| } |
| for(int i=0; i<ACE_PREFIX.length;i++){ |
| if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){ |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private static char toASCIILower(char ch){ |
| if(CAPITAL_A <= ch && ch <= CAPITAL_Z){ |
| return (char)(ch + LOWER_CASE_DELTA); |
| } |
| return ch; |
| } |
| |
| private static StringBuffer toASCIILower(CharSequence src){ |
| StringBuffer dest = new StringBuffer(); |
| for(int i=0; i<src.length();i++){ |
| dest.append(toASCIILower(src.charAt(i))); |
| } |
| return dest; |
| } |
| |
| private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){ |
| char c1,c2; |
| int rc; |
| for(int i =0;/* no condition */;i++) { |
| /* If we reach the ends of both strings then they match */ |
| if(i == s1.length()) { |
| return 0; |
| } |
| |
| c1 = s1.charAt(i); |
| c2 = s2.charAt(i); |
| |
| /* Case-insensitive comparison */ |
| if(c1!=c2) { |
| rc=toASCIILower(c1)-toASCIILower(c2); |
| if(rc!=0) { |
| return rc; |
| } |
| } |
| } |
| } |
| |
| private static int getSeparatorIndex(char[] src,int start, int limit){ |
| for(; start<limit;start++){ |
| if(isLabelSeparator(src[start])){ |
| return start; |
| } |
| } |
| // we have not found the separator just return length |
| return start; |
| } |
| |
| /* |
| private static int getSeparatorIndex(UCharacterIterator iter){ |
| int currentIndex = iter.getIndex(); |
| int separatorIndex = 0; |
| int ch; |
| while((ch=iter.next())!= UCharacterIterator.DONE){ |
| if(isLabelSeparator(ch)){ |
| separatorIndex = iter.getIndex(); |
| iter.setIndex(currentIndex); |
| return separatorIndex; |
| } |
| } |
| // reset index |
| iter.setIndex(currentIndex); |
| // we have not found the separator just return the length |
| |
| } |
| */ |
| |
| |
| private static boolean isLDHChar(int ch){ |
| // high runner case |
| if(ch>0x007A){ |
| return false; |
| } |
| //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] |
| if( (ch==0x002D) || |
| (0x0030 <= ch && ch <= 0x0039) || |
| (0x0041 <= ch && ch <= 0x005A) || |
| (0x0061 <= ch && ch <= 0x007A) |
| ){ |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Ascertain if the given code point is a label separator as |
| * defined by the IDNA RFC |
| * |
| * @param ch The code point to be ascertained |
| * @return true if the char is a label separator |
| * @stable ICU 2.8 |
| */ |
| private static boolean isLabelSeparator(int ch){ |
| switch(ch){ |
| case 0x002e: |
| case 0x3002: |
| case 0xFF0E: |
| case 0xFF61: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| public static StringBuffer convertToASCII(UCharacterIterator src, int options) |
| throws StringPrepParseException{ |
| |
| boolean[] caseFlags = null; |
| |
| // the source contains all ascii codepoints |
| boolean srcIsASCII = true; |
| // assume the source contains all LDH codepoints |
| boolean srcIsLDH = true; |
| |
| //get the options |
| boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0); |
| int ch; |
| // step 1 |
| while((ch = src.next())!= UCharacterIterator.DONE){ |
| if(ch> 0x7f){ |
| srcIsASCII = false; |
| break; |
| } |
| } |
| int failPos = -1; |
| src.setToStart(); |
| StringBuffer processOut = null; |
| // step 2 is performed only if the source contains non ASCII |
| if(!srcIsASCII){ |
| // step 2 |
| processOut = namePrep.prepare(src, options); |
| }else{ |
| processOut = new StringBuffer(src.getText()); |
| } |
| int poLen = processOut.length(); |
| |
| if(poLen==0){ |
| throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); |
| } |
| StringBuffer dest = new StringBuffer(); |
| |
| // reset the variable to verify if output of prepare is ASCII or not |
| srcIsASCII = true; |
| |
| // step 3 & 4 |
| for(int j=0;j<poLen;j++ ){ |
| ch=processOut.charAt(j); |
| if(ch > 0x7F){ |
| srcIsASCII = false; |
| }else if(isLDHChar(ch)==false){ |
| // here we do not assemble surrogates |
| // since we know that LDH code points |
| // are in the ASCII range only |
| srcIsLDH = false; |
| failPos = j; |
| } |
| } |
| |
| if(useSTD3ASCIIRules == true){ |
| // verify 3a and 3b |
| if( srcIsLDH == false /* source contains some non-LDH characters */ |
| || processOut.charAt(0) == HYPHEN |
| || processOut.charAt(processOut.length()-1) == HYPHEN){ |
| |
| /* populate the parseError struct */ |
| if(srcIsLDH==false){ |
| throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules", |
| StringPrepParseException.STD3_ASCII_RULES_ERROR, |
| processOut.toString(), |
| (failPos>0) ? (failPos-1) : failPos); |
| }else if(processOut.charAt(0) == HYPHEN){ |
| throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", |
| StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); |
| |
| }else{ |
| throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", |
| StringPrepParseException.STD3_ASCII_RULES_ERROR, |
| processOut.toString(), |
| (poLen>0) ? poLen-1 : poLen); |
| |
| } |
| } |
| } |
| if(srcIsASCII){ |
| dest = processOut; |
| }else{ |
| // step 5 : verify the sequence does not begin with ACE prefix |
| if(!startsWithPrefix(processOut)){ |
| |
| //step 6: encode the sequence with punycode |
| caseFlags = new boolean[poLen]; |
| |
| StringBuilder punyout = Punycode.encode(processOut,caseFlags); |
| |
| // convert all codepoints to lower case ASCII |
| StringBuffer lowerOut = toASCIILower(punyout); |
| |
| //Step 7: prepend the ACE prefix |
| dest.append(ACE_PREFIX,0,ACE_PREFIX.length); |
| //Step 6: copy the contents in b2 into dest |
| dest.append(lowerOut); |
| }else{ |
| |
| throw new StringPrepParseException("The input does not start with the ACE Prefix.", |
| StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0); |
| } |
| } |
| if(dest.length() > MAX_LABEL_LENGTH){ |
| throw new StringPrepParseException("The labels in the input are too long. Length > 63.", |
| StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); |
| } |
| return dest; |
| } |
| |
| public static StringBuffer convertIDNToASCII(String src,int options) |
| throws StringPrepParseException{ |
| |
| char[] srcArr = src.toCharArray(); |
| StringBuffer result = new StringBuffer(); |
| int sepIndex=0; |
| int oldSepIndex=0; |
| for(;;){ |
| sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); |
| String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); |
| //make sure this is not a root label separator. |
| if(!(label.length()==0 && sepIndex==srcArr.length)){ |
| UCharacterIterator iter = UCharacterIterator.getInstance(label); |
| result.append(convertToASCII(iter,options)); |
| } |
| if(sepIndex==srcArr.length){ |
| break; |
| } |
| |
| // increment the sepIndex to skip past the separator |
| sepIndex++; |
| oldSepIndex = sepIndex; |
| result.append((char)FULL_STOP); |
| } |
| if(result.length() > MAX_DOMAIN_NAME_LENGTH){ |
| throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); |
| } |
| return result; |
| } |
| |
| public static StringBuffer convertToUnicode(UCharacterIterator src, int options) |
| throws StringPrepParseException{ |
| |
| boolean[] caseFlags = null; |
| |
| // the source contains all ascii codepoints |
| boolean srcIsASCII = true; |
| // assume the source contains all LDH codepoints |
| //boolean srcIsLDH = true; |
| |
| //get the options |
| //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0); |
| |
| //int failPos = -1; |
| int ch; |
| int saveIndex = src.getIndex(); |
| // step 1: find out if all the codepoints in src are ASCII |
| while((ch=src.next())!= UCharacterIterator.DONE){ |
| if(ch>0x7F){ |
| srcIsASCII = false; |
| }/*else if((srcIsLDH = isLDHChar(ch))==false){ |
| failPos = src.getIndex(); |
| }*/ |
| } |
| StringBuffer processOut; |
| |
| if(srcIsASCII == false){ |
| try { |
| // step 2: process the string |
| src.setIndex(saveIndex); |
| processOut = namePrep.prepare(src,options); |
| } catch (StringPrepParseException ex) { |
| return new StringBuffer(src.getText()); |
| } |
| |
| }else{ |
| //just point to source |
| processOut = new StringBuffer(src.getText()); |
| } |
| // TODO: |
| // The RFC states that |
| // <quote> |
| // ToUnicode never fails. If any step fails, then the original input |
| // is returned immediately in that step. |
| // </quote> |
| |
| //step 3: verify ACE Prefix |
| if(startsWithPrefix(processOut)){ |
| StringBuffer decodeOut = null; |
| |
| //step 4: Remove the ACE Prefix |
| String temp = processOut.substring(ACE_PREFIX.length,processOut.length()); |
| |
| //step 5: Decode using punycode |
| try { |
| decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags)); |
| } catch (StringPrepParseException e) { |
| decodeOut = null; |
| } |
| |
| //step 6:Apply toASCII |
| if (decodeOut != null) { |
| StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options); |
| |
| //step 7: verify |
| if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ |
| // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed", |
| // StringPrepParseException.VERIFICATION_ERROR); |
| decodeOut = null; |
| } |
| } |
| |
| //step 8: return output of step 5 |
| if (decodeOut != null) { |
| return decodeOut; |
| } |
| } |
| |
| // }else{ |
| // // verify that STD3 ASCII rules are satisfied |
| // if(useSTD3ASCIIRules == true){ |
| // if( srcIsLDH == false /* source contains some non-LDH characters */ |
| // || processOut.charAt(0) == HYPHEN |
| // || processOut.charAt(processOut.length()-1) == HYPHEN){ |
| // |
| // if(srcIsLDH==false){ |
| // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", |
| // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), |
| // (failPos>0) ? (failPos-1) : failPos); |
| // }else if(processOut.charAt(0) == HYPHEN){ |
| // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", |
| // StringPrepParseException.STD3_ASCII_RULES_ERROR, |
| // processOut.toString(),0); |
| // |
| // }else{ |
| // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", |
| // StringPrepParseException.STD3_ASCII_RULES_ERROR, |
| // processOut.toString(), |
| // processOut.length()); |
| // |
| // } |
| // } |
| // } |
| // // just return the source |
| // return new StringBuffer(src.getText()); |
| // } |
| |
| return new StringBuffer(src.getText()); |
| } |
| |
| public static StringBuffer convertIDNToUnicode(String src, int options) |
| throws StringPrepParseException{ |
| |
| char[] srcArr = src.toCharArray(); |
| StringBuffer result = new StringBuffer(); |
| int sepIndex=0; |
| int oldSepIndex=0; |
| for(;;){ |
| sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); |
| String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); |
| if(label.length()==0 && sepIndex!=srcArr.length ){ |
| throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); |
| } |
| UCharacterIterator iter = UCharacterIterator.getInstance(label); |
| result.append(convertToUnicode(iter,options)); |
| if(sepIndex==srcArr.length){ |
| break; |
| } |
| // Unlike the ToASCII operation we don't normalize the label separators |
| result.append(srcArr[sepIndex]); |
| // increment the sepIndex to skip past the separator |
| sepIndex++; |
| oldSepIndex =sepIndex; |
| } |
| if(result.length() > MAX_DOMAIN_NAME_LENGTH){ |
| throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); |
| } |
| return result; |
| } |
| |
| public static int compare(String s1, String s2, int options) throws StringPrepParseException{ |
| StringBuffer s1Out = convertIDNToASCII(s1, options); |
| StringBuffer s2Out = convertIDNToASCII(s2, options); |
| return compareCaseInsensitiveASCII(s1Out,s2Out); |
| } |
| } |