// icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 *******************************************************************************
 * Copyright (C) 2010-2014, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */
 package com.ibm.icu.impl;

 import java.util.EnumSet;

 import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.lang.UCharacterDirection;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.IDNA;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.StringPrepParseException;
 import com.ibm.icu.util.ICUException;

 // Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
 //
 // The domain name length limit is 255 octets in an internal DNS representation
 // where the last ("root") label is the empty label
 // represented by length byte 0 alone.
 // In a conventional string, this translates to 253 characters, or 254
 // if there is a trailing dot for the root label.

 /**
  * UTS #46 (IDNA2008) implementation.
  * @author Markus Scherer
  * @since 2010jul09
  */
 public final class UTS46 extends IDNA {
     /**
      * Creates a UTS #46 processor that works with the given option bits.
      *
      * @param options bit set of IDNA option flags; within this class it is tested against
      *                USE_STD3_RULES, CHECK_BIDI, CHECK_CONTEXTJ, CHECK_CONTEXTO,
      *                NONTRANSITIONAL_TO_ASCII and NONTRANSITIONAL_TO_UNICODE
      */
     public UTS46(int options) {
         this.options = options;
     }

     /**
      * Processes a single label in ToASCII mode.
      * This is process() with isLabel=true and toASCII=true.
      *
      * @param label the input label
      * @param dest receives the result; its previous contents are discarded
      * @param info receives the errors found
      * @return dest
      * @throws IllegalArgumentException if dest and label are the same object
      */
     @Override
     public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
         final boolean isLabel=true;
         final boolean toASCII=true;
         return process(label, isLabel, toASCII, dest, info);
     }

     /**
      * Processes a single label in ToUnicode mode.
      * This is process() with isLabel=true and toASCII=false.
      *
      * @param label the input label
      * @param dest receives the result; its previous contents are discarded
      * @param info receives the errors found
      * @return dest
      * @throws IllegalArgumentException if dest and label are the same object
      */
     @Override
     public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
         final boolean isLabel=true;
         final boolean toASCII=false;
         return process(label, isLabel, toASCII, dest, info);
     }

     /**
      * Processes a whole domain name in ToASCII mode and then checks the overall
      * length of the converted result.
      * <p>
      * After process() has run (isLabel=false, toASCII=true), DOMAIN_NAME_TOO_LONG is added
      * when all of the following hold: the result has at least 254 chars, the error is
      * not set yet, the result is pure ASCII, and the result is either longer than
      * 254 chars or its 254th char is not a dot.
      *
      * @param name the input domain name
      * @param dest receives the result; its previous contents are discarded
      * @param info receives the errors found
      * @return dest
      * @throws IllegalArgumentException if dest and name are the same object
      */
     @Override
     public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
         process(name, false, true, dest, info);
         final int resultLength=dest.length();
         if(resultLength<254) {
             // Short enough regardless of a trailing dot.
             return dest;
         }
         if(info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) || !isASCIIString(dest)) {
             // Already reported, or the result still contains non-ASCII text.
             return dest;
         }
         // Exactly 254 chars are tolerated only if the last one is the trailing dot.
         if(resultLength>254 || dest.charAt(253)!='.') {
             addError(info, Error.DOMAIN_NAME_TOO_LONG);
         }
         return dest;
     }

     /**
      * Processes a whole domain name in ToUnicode mode.
      * This is process() with isLabel=false and toASCII=false.
      *
      * @param name the input domain name
      * @param dest receives the result; its previous contents are discarded
      * @param info receives the errors found
      * @return dest
      * @throws IllegalArgumentException if dest and name are the same object
      */
     @Override
     public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
         final boolean isLabel=false;
         final boolean toASCII=false;
         return process(name, isLabel, toASCII, dest, info);
     }

     // Shared normalizer created from the "uts46" data in COMPOSE mode.
     // In this class it maps and normalizes input text, re-normalizes after
     // deviation characters were mapped, validates decoded Punycode labels
     // (isNormalized) and supplies combining classes for the CONTEXTJ check.
     private static final Normalizer2 uts46Norm2=
         Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE);  // uts46.nrm
     // Option bits passed to the constructor; tested with "options&FLAG" throughout.
     final int options;

     // Severe errors which usually result in a U+FFFD replacement character in the result string.
     // Contextual checks (BiDi, CONTEXTJ, CONTEXTO) are skipped for a label that has any of these.
     private static final EnumSet<Error> severeErrors=EnumSet.of(
         Error.LEADING_COMBINING_MARK,
         Error.DISALLOWED,
         Error.PUNYCODE,
         Error.LABEL_HAS_DOT,
         Error.INVALID_ACE_LABEL);

     /**
      * Tells whether the given text consists only of chars in the range U+0000..U+007F.
      *
      * @param dest the text to inspect
      * @return true if no char is greater than 0x7f (which includes the empty text),
      *         false as soon as one such char is found
      */
     private static boolean
     isASCIIString(CharSequence dest) {
         final int limit=dest.length();
         int index=0;
         while(index<limit) {
             if(dest.charAt(index++)>0x7f) {
                 return false;
             }
         }
         return true;
     }

     // UTS #46 data for ASCII characters.
     // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
     // and passes through all other ASCII characters.
     // If USE_STD3_RULES is set, then non-LDH characters are disallowed
     // using this data.
     // The ASCII fastpath also uses this data.
     // Values: -1=disallowed  0==valid  1==mapped (lowercase)
     private static final byte asciiData[]={
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
         // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
         // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
         -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
         // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
         -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
     };

     /**
      * Shared implementation behind labelToASCII(), labelToUnicode(),
      * nameToASCII() and nameToUnicode().
      * <p>
      * Clears dest and resets info, then copies src to dest on an ASCII fastpath
      * that lowercases A-Z and checks hyphens, dots and label lengths on the way.
      * If the fastpath reaches the end of src, the result is returned directly.
      * Otherwise the rest of the input goes through processUnicode(), followed by
      * the BiDi check for the whole name.
      *
      * @param src the label or domain name to process
      * @param isLabel true if src is a single label, false if it is a whole domain name
      * @param toASCII true for ToASCII processing, false for ToUnicode processing
      * @param dest receives the result; its previous contents are discarded
      * @param info receives the errors found; it is reset first
      * @return dest
      * @throws IllegalArgumentException if dest and src are the same object
      */
     private StringBuilder
     process(CharSequence src,
             boolean isLabel, boolean toASCII,
             StringBuilder dest,
             Info info) {
         // uts46Norm2.normalize() would do all of this error checking and setup,
         // but with the ASCII fastpath we do not always call it, and do not
         // call it first.
         if(dest==src) {
             throw new IllegalArgumentException();
         }
         // Arguments are fine, reset output values.
         dest.delete(0, 0x7fffffff);
         resetInfo(info);
         int srcLength=src.length();
         if(srcLength==0) {
             addError(info, Error.EMPTY_LABEL);
             return dest;
         }
         // ASCII fastpath
         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
         int labelStart=0;
         int i;
         // The loop has no exit condition of its own: it either returns at the end
         // of src or breaks out to the slower processUnicode() path below.
         for(i=0;; ++i) {
             if(i==srcLength) {
                 // The whole input was handled by the fastpath.
                 if(toASCII) {
                     if((i-labelStart)>63) {
                         addLabelError(info, Error.LABEL_TOO_LONG);
                     }
                     // There is a trailing dot if labelStart==i.
                     if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                         addError(info, Error.DOMAIN_NAME_TOO_LONG);
                     }
                 }
                 promoteAndResetLabelErrors(info);
                 return dest;
             }
             char c=src.charAt(i);
             if(c>0x7f) {
                 // Non-ASCII: leave the fastpath with src[i] not yet copied.
                 break;
             }
             int cData=asciiData[c];
             if(cData>0) {
                 dest.append((char)(c+0x20));  // Lowercase an uppercase ASCII letter.
             } else if(cData<0 && disallowNonLDHDot) {
                 break;  // Replacing with U+FFFD can be complicated for toASCII.
             } else {
                 dest.append(c);
                 if(c=='-') {  // hyphen
                     if(i==(labelStart+3) && src.charAt(i-1)=='-') {
                         // "??--..." is Punycode or forbidden.
                         ++i;  // '-' was copied to dest already
                         break;
                     }
                     if(i==labelStart) {
                         // label starts with "-"
                         addLabelError(info, Error.LEADING_HYPHEN);
                     }
                     if((i+1)==srcLength || src.charAt(i+1)=='.') {
                         // label ends with "-"
                         addLabelError(info, Error.TRAILING_HYPHEN);
                     }
                 } else if(c=='.') {  // dot
                     if(isLabel) {
                         // Replacing with U+FFFD can be complicated for toASCII.
                         ++i;  // '.' was copied to dest already
                         break;
                     }
                     if(i==labelStart) {
                         addLabelError(info, Error.EMPTY_LABEL);
                     }
                     if(toASCII && (i-labelStart)>63) {
                         addLabelError(info, Error.LABEL_TOO_LONG);
                     }
                     // The label is complete: fold its errors into the overall result
                     // and start the next label after the dot.
                     promoteAndResetLabelErrors(info);
                     labelStart=i+1;
                 }
             }
         }
         // Slow path: dest holds the first i chars; the current label starts at labelStart.
         promoteAndResetLabelErrors(info);
         processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
         // The labels in dest before labelStart were only seen by the fastpath,
         // so their BiDi validity is checked here with the ASCII-only test.
         if( isBiDi(info) && !hasCertainErrors(info, severeErrors) &&
             (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart)))
         ) {
             addError(info, Error.BIDI);
         }
         return dest;
     }

     /**
      * Slow path of process(): maps and normalizes the rest of src with the
      * UTS #46 normalizer into dest, then walks dest from labelStart and
      * hands each label to processLabel().
      * <p>
      * While scanning, deviation characters (U+00DF, U+03C2, U+200C, U+200D) are
      * recorded via setTransitionalDifferent() and, in transitional processing,
      * mapped once by mapDevChars(); unpaired surrogates are replaced with U+FFFD.
      *
      * @param src the complete input
      * @param labelStart start index in dest of the label being processed
      * @param mappingStart index of the first src char that has not been copied to dest yet
      * @param isLabel true if the input is a single label (dots do not separate labels then)
      * @param toASCII true for ToASCII processing, false for ToUnicode processing
      * @param dest the partially filled result buffer; modified in place
      * @param info receives the errors found
      * @return dest
      */
     private StringBuilder
     processUnicode(CharSequence src,
                    int labelStart, int mappingStart,
                    boolean isLabel, boolean toASCII,
                    StringBuilder dest,
                    Info info) {
         if(mappingStart==0) {
             uts46Norm2.normalize(src, dest);
         } else {
             // Keep what the ASCII fastpath produced and append the normalized remainder.
             uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
         }
         // Deviation characters are mapped unless the matching NONTRANSITIONAL_* option is set.
         boolean doMapDevChars=
             toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 :
                       (options&NONTRANSITIONAL_TO_UNICODE)==0;
         int destLength=dest.length();
         int labelLimit=labelStart;
         while(labelLimit<destLength) {
             char c=dest.charAt(labelLimit);
             if(c=='.' && !isLabel) {
                 int labelLength=labelLimit-labelStart;
                 int newLength=processLabel(dest, labelStart, labelLength,
                                                 toASCII, info);
                 promoteAndResetLabelErrors(info);
                 // processLabel() may have changed the label length inside dest.
                 destLength+=newLength-labelLength;
                 // Continue right after the dot that ended this label.
                 labelLimit=labelStart+=newLength+1;
                 continue;
             } else if(c<0xdf) {
                 // pass
             } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
                 // One of the deviation characters U+00DF, U+03C2, U+200C, U+200D.
                 setTransitionalDifferent(info);
                 if(doMapDevChars) {
                     destLength=mapDevChars(dest, labelStart, labelLimit);
                     // All deviation characters have been mapped, no need to check for them again.
                     doMapDevChars=false;
                     // Do not increment labelLimit in case c was removed.
                     continue;
                 }
             } else if(Character.isSurrogate(c)) {
                 // A lead surrogate needs a following trail, a trail needs a preceding lead
                 // within the same label.
                 if(UTF16Plus.isSurrogateLead(c) ?
                         (labelLimit+1)==destLength ||
                             !Character.isLowSurrogate(dest.charAt(labelLimit+1)) :
                         labelLimit==labelStart ||
                             !Character.isHighSurrogate(dest.charAt(labelLimit-1))) {
                     // Map an unpaired surrogate to U+FFFD before normalization so that when
                     // that removes characters we do not turn two unpaired ones into a pair.
                     addLabelError(info, Error.DISALLOWED);
                     dest.setCharAt(labelLimit, '\ufffd');
                 }
             }
             ++labelLimit;
         }
         // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
         // but not an empty label elsewhere nor a completely empty domain name.
         // processLabel() sets Error.EMPTY_LABEL when labelLength==0.
         if(0==labelStart || labelStart<labelLimit) {
             processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info);
             promoteAndResetLabelErrors(info);
         }
         return dest;
     }

     /**
      * Applies the transitional mappings for the deviation characters found in dest
      * at or after mappingStart: U+00DF becomes "ss", U+03C2 becomes U+03C3,
      * and U+200C/U+200D are removed.
      * If anything was mapped, the text from labelStart to the end of dest is
      * normalized again with the UTS #46 normalizer.
      *
      * @param dest the buffer to modify in place
      * @param labelStart start of the region that is re-normalized after a mapping
      * @param mappingStart index at which the search for deviation characters begins
      * @return the new dest.length()
      */
     private int
     mapDevChars(StringBuilder dest, int labelStart, int mappingStart) {
         int currentLength=dest.length();
         boolean mappedSomething=false;
         int pos=mappingStart;
         while(pos<currentLength) {
             final char ch=dest.charAt(pos);
             if(ch==0xdf) {
                 // Sharp s turns into two letters: overwrite one, insert the other.
                 dest.setCharAt(pos, 's');
                 dest.insert(pos+1, 's');
                 pos+=2;
                 ++currentLength;
                 mappedSomething=true;
             } else if(ch==0x3c2) {
                 // Final sigma turns into nonfinal sigma.
                 dest.setCharAt(pos, '\u03c3');
                 ++pos;
                 mappedSomething=true;
             } else if(ch==0x200c || ch==0x200d) {
                 // ZWNJ and ZWJ are dropped; pos now points at the next char already.
                 dest.deleteCharAt(pos);
                 --currentLength;
                 mappedSomething=true;
             } else {
                 ++pos;
             }
         }
         if(!mappedSomething) {
             return currentLength;
         }
         // The mappings above might have produced a string that is no longer NFC.
         // Either the NFC or the UTS #46 normalizer would do; reusing the UTS #46 one
         // avoids loading a second .nrm data file.
         final String renormalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length()));
         dest.replace(labelStart, 0x7fffffff, renormalized);
         return dest.length();
     }
     // A few non-ASCII characters are equivalent to sequences that contain
     // non-LDH ASCII characters. The list can be regenerated with:
     // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
     /**
      * Tells whether c is one of the non-ASCII code points listed here
      * (U+2260, U+226E, U+226F).
      *
      * @param c the code point to test
      * @return true for U+2260, U+226E and U+226F, false for everything else
      */
     private static boolean
     isNonASCIIDisallowedSTD3Valid(int c) {
         switch(c) {
         case 0x2260:  // NOT EQUAL TO
         case 0x226E:  // NOT LESS-THAN
         case 0x226F:  // NOT GREATER-THAN
             return true;
         default:
             return false;
         }
     }


     /**
      * Puts a processed label back into dest if it lives in a separate buffer.
      * <p>
      * When label is the very same object as dest, the label was already modified
      * in place; dest is left alone and labelLength (which may differ from
      * label.length()) is simply passed through.
      * Otherwise labelLength is expected to equal label.length(), and the chars
      * dest[destLabelStart, destLabelStart+destLabelLength) are replaced with label.
      *
      * @param dest the buffer holding the whole result
      * @param destLabelStart start index of the old label in dest
      * @param destLabelLength length of the old label in dest
      * @param label the new label text, or dest itself
      * @param labelLength the new label length
      * @return labelLength
      */
     private static int
     replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength,
                  CharSequence label, int labelLength) {
         if(label==dest) {
             return labelLength;
         }
         // Remove the old chars and insert the new ones, moving characters inside the
         // StringBuilder; dest.replace() would need a String created from label.
         final int destLabelLimit=destLabelStart+destLabelLength;
         dest.delete(destLabelStart, destLabelLimit);
         dest.insert(destLabelStart, label);
         return labelLength;
     }

     /**
      * Validates one label inside dest and rewrites it there if necessary.
      * <p>
      * A label starting with "xn--" is decoded from Punycode first; if decoding or
      * the validation of the decoded text fails, the label is handed to markBadACELabel().
      * Then the label is checked for hyphen problems, dots, disallowed characters
      * and a leading combining mark; offending chars are replaced with U+FFFD.
      * If no severe error was found, the optional BiDi, CONTEXTJ and CONTEXTO checks
      * run, and for ToASCII a label with non-ASCII characters is Punycode-encoded.
      *
      * @param dest the buffer that contains the label; modified in place
      * @param labelStart start index of the label in dest
      * @param labelLength length of the label in dest
      * @param toASCII true for ToASCII processing, false for ToUnicode processing
      * @param info receives the label errors found
      * @return the new label length
      */
     private int
     processLabel(StringBuilder dest,
                  int labelStart, int labelLength,
                  boolean toASCII,
                  Info info) {
         StringBuilder fromPunycode;
         StringBuilder labelString;
         // Remember where the label sits in dest: labelStart/labelLength are
         // redirected to the decoded string if the label is in Punycode.
         int destLabelStart=labelStart;
         int destLabelLength=labelLength;
         boolean wasPunycode;
         if( labelLength>=4 &&
             dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' &&
             dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
         ) {
             // Label starts with "xn--", try to un-Punycode it.
             // In IDNA2008, labels like "xn--" (decodes to an empty string) and
             // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
             // comparing the ToUnicode input with the back-to-ToASCII output.
             // They are alternate encodings of the respective ASCII labels.
             // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
             // the round-trip verification.
             if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) {
                 addLabelError(info, Error.INVALID_ACE_LABEL);
                 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
             }
             wasPunycode=true;
             try {
                 fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
             } catch (StringPrepParseException e) {
                 addLabelError(info, Error.PUNYCODE);
                 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
             }
             // Check for NFC, and for characters that are not
             // valid or deviation characters according to the normalizer.
             // If there is something wrong, then the string will change.
             // Note that the normalizer passes through non-LDH ASCII and deviation characters.
             // Deviation characters are ok in Punycode even in transitional processing.
             // In the code further below, if we find non-LDH ASCII and we have USE_STD3_RULES
             // then we will set Error.INVALID_ACE_LABEL there too.
             boolean isValid=uts46Norm2.isNormalized(fromPunycode);
             if(!isValid) {
                 addLabelError(info, Error.INVALID_ACE_LABEL);
                 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
             }
             // From here on, work on the decoded text rather than on dest.
             labelString=fromPunycode;
             labelStart=0;
             labelLength=fromPunycode.length();
         } else {
             wasPunycode=false;
             labelString=dest;
         }
         // Validity check
         if(labelLength==0) {
             addLabelError(info, Error.EMPTY_LABEL);
             return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
         }
         // labelLength>0
         if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') {
             // label starts with "??--"
             addLabelError(info, Error.HYPHEN_3_4);
         }
         if(labelString.charAt(labelStart)=='-') {
             // label starts with "-"
             addLabelError(info, Error.LEADING_HYPHEN);
         }
         if(labelString.charAt(labelStart+labelLength-1)=='-') {
             // label ends with "-"
             addLabelError(info, Error.TRAILING_HYPHEN);
         }
         // If the label was not a Punycode label, then it was the result of
         // mapping, normalization and label segmentation.
         // If the label was in Punycode, then we mapped it again above
         // and checked its validity.
         // Now we handle the STD3 restriction to LDH characters (if set)
         // and we look for U+FFFD which indicates disallowed characters
         // in a non-Punycode label or U+FFFD itself in a Punycode label.
         // We also check for dots which can come from the input to a single-label function.
         // labelString is either dest or the decoded copy made above, so it is modified in place.
         int i=labelStart;
         int limit=labelStart+labelLength;
         // Bitwise OR of all non-ASCII chars in the label; used below as a cheap filter
         // for "has non-ASCII", "may contain ZWNJ/ZWJ" and "may need CONTEXTO checks".
         char oredChars=0;
         // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
         do {
             char c=labelString.charAt(i);
             if(c<=0x7f) {
                 if(c=='.') {
                     addLabelError(info, Error.LABEL_HAS_DOT);
                     labelString.setCharAt(i, '\ufffd');
                 } else if(disallowNonLDHDot && asciiData[c]<0) {
                     addLabelError(info, Error.DISALLOWED);
                     labelString.setCharAt(i, '\ufffd');
                 }
             } else {
                 oredChars|=c;
                 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
                     addLabelError(info, Error.DISALLOWED);
                     labelString.setCharAt(i, '\ufffd');
                 } else if(c==0xfffd) {
                     addLabelError(info, Error.DISALLOWED);
                 }
             }
             ++i;
         } while(i<limit);
         // Check for a leading combining mark after other validity checks
         // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here.
         int c;
         // Reading a whole code point here is ok because unpaired surrogates were mapped to U+FFFD.
         c=labelString.codePointAt(labelStart);
         if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
             addLabelError(info, Error.LEADING_COMBINING_MARK);
             labelString.setCharAt(labelStart, '\ufffd');
             if(c>0xffff) {
                 // Remove c's trail surrogate.
                 labelString.deleteCharAt(labelStart+1);
                 --labelLength;
                 if(labelString==dest) {
                     --destLabelLength;
                 }
             }
         }
         if(!hasCertainLabelErrors(info, severeErrors)) {
             // Do contextual checks only if we do not have U+FFFD from a severe error
             // because U+FFFD can make these checks fail.
             if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) {
                 checkLabelBiDi(labelString, labelStart, labelLength, info);
             }
             // (oredChars&0x200c)==0x200c can only be true if some char had all bits of
             // U+200C set, which is the case for both U+200C and U+200D.
             if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
                 !isLabelOkContextJ(labelString, labelStart, labelLength)
             ) {
                 addLabelError(info, Error.CONTEXTJ);
             }
             if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
                 checkLabelContextO(labelString, labelStart, labelLength, info);
             }
             if(toASCII) {
                 if(wasPunycode) {
                     // Leave a Punycode label unchanged if it has no severe errors.
                     if(destLabelLength>63) {
                         addLabelError(info, Error.LABEL_TOO_LONG);
                     }
                     return destLabelLength;
                 } else if(oredChars>=0x80) {
                     // Contains non-ASCII characters.
                     StringBuilder punycode;
                     try {
                         punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null);
                     } catch (StringPrepParseException e) {
                         throw new ICUException(e);  // unexpected
                     }
                     punycode.insert(0, "xn--");
                     if(punycode.length()>63) {
                         addLabelError(info, Error.LABEL_TOO_LONG);
                     }
                     return replaceLabel(dest, destLabelStart, destLabelLength,
                                         punycode, punycode.length());
                 } else {
                     // all-ASCII label
                     if(labelLength>63) {
                         addLabelError(info, Error.LABEL_TOO_LONG);
                     }
                 }
             }
         } else {
             // If a Punycode label has severe errors,
             // then leave it but make sure it does not look valid.
             if(wasPunycode) {
                 addLabelError(info, Error.INVALID_ACE_LABEL);
                 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
             }
         }
         return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
     }
     /**
      * Makes sure that an invalid "xn--" label left in dest does not look valid.
      * <p>
      * The chars after the "xn--" prefix are scanned: a dot is replaced with U+FFFD
      * (and reported as LABEL_HAS_DOT), and with USE_STD3_RULES any other non-LDH
      * ASCII char is replaced with U+FFFD as well. If the rest of the label
      * consists of LDH chars only, a U+FFFD is appended to the label instead.
      * Otherwise, for ToASCII, a label that is still pure ASCII and longer than
      * 63 chars is reported as LABEL_TOO_LONG.
      *
      * @param dest the buffer that contains the label; modified in place
      * @param labelStart start index of the label in dest
      * @param labelLength length of the label in dest
      * @param toASCII true for ToASCII processing, false for ToUnicode processing
      * @param info receives the label errors found
      * @return the new label length
      */
     private int
     markBadACELabel(StringBuilder dest,
                     int labelStart, int labelLength,
                     boolean toASCII, Info info) {
         final boolean std3=(options&USE_STD3_RULES)!=0;
         final int labelLimit=labelStart+labelLength;
         boolean stillASCII=true;
         boolean stillLDH=true;
         // Skip the initial "xn--".
         int index=labelStart+4;
         while(index<labelLimit) {
             final char ch=dest.charAt(index);
             if(ch>0x7f) {
                 stillASCII=false;
                 stillLDH=false;
             } else if(ch=='.') {
                 addLabelError(info, Error.LABEL_HAS_DOT);
                 dest.setCharAt(index, '\ufffd');
                 stillASCII=false;
                 stillLDH=false;
             } else if(asciiData[ch]<0) {
                 stillLDH=false;
                 if(std3) {
                     dest.setCharAt(index, '\ufffd');
                     stillASCII=false;
                 }
             }
             ++index;
         }
         if(stillLDH) {
             // Nothing in the label gives it away yet, so append a U+FFFD.
             dest.insert(labelLimit, '\ufffd');
             return labelLength+1;
         }
         if(toASCII && stillASCII && labelLength>63) {
             addLabelError(info, Error.LABEL_TOO_LONG);
         }
         return labelLength;
     }

     // Masks of BiDi classes used by checkLabelBiDi(). Each constant is named after the
     // classes it combines. U_MASK() is defined outside this section; presumably it
     // returns a single-bit mask for the given class value -- TODO confirm.

     // L
     private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
     // R, AL
     private static final int R_AL_MASK=
         U_MASK(UCharacterDirection.RIGHT_TO_LEFT)|
         U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
     // L, R, AL: the classes allowed for the first character of a label
     private static final int L_R_AL_MASK=L_MASK|R_AL_MASK;

     // R, AL, AN: the classes that make a label an RTL label
     private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER);

     // EN, AN
     private static final int EN_AN_MASK=
         U_MASK(UCharacterDirection.EUROPEAN_NUMBER)|
         U_MASK(UCharacterDirection.ARABIC_NUMBER);
     // R, AL, EN, AN: the classes allowed at the end of an RTL label
     private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
     // L, EN: the classes allowed at the end of an LTR label
     private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER);

     // ES, CS, ET, ON, BN, NSM
     private static final int ES_CS_ET_ON_BN_NSM_MASK=
         U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)|
         U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)|
         U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)|
         U_MASK(UCharacterDirection.OTHER_NEUTRAL)|
         U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)|
         U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
     // All classes allowed anywhere in an LTR label
     private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
     // All classes allowed anywhere in an RTL label
     private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;

     // We scan the whole label and check both for whether it contains RTL characters
     // and whether it passes the BiDi Rule.
     // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
     // that a domain name is a BiDi domain name (has an RTL label) only after
     // processing several earlier labels.
     /**
      * Applies the six conditions of the IDNA2008 BiDi rule to one label.
      * Violations are recorded with setNotOkBiDi(info); a label that contains an
      * R, AL or AN character marks the name as a BiDi domain name with setBiDi(info).
      *
      * @param label the text containing the label
      * @param labelStart index of the first char of the label; the label must not be empty
      * @param labelLength number of chars in the label
      * @param info receives the BiDi state
      */
     private void
     checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
         // IDNA2008 BiDi rule
         // Get the directionality of the first character.
         int c;
         int i=labelStart;
         c=Character.codePointAt(label, i);
         i+=Character.charCount(c);
         int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c));
         // 1. The first character must be a character with BIDI property L, R
         // or AL.  If it has the R or AL property, it is an RTL label; if it
         // has the L property, it is an LTR label.
         if((firstMask&~L_R_AL_MASK)!=0) {
             setNotOkBiDi(info);
         }
         // Get the directionality of the last non-NSM character.
         // This scans backward from the end; labelLimit shrinks past trailing NSMs
         // and past the character that is finally used.
         int lastMask;
         int labelLimit=labelStart+labelLength;
         for(;;) {
             if(i>=labelLimit) {
                 // Only the first character is left.
                 lastMask=firstMask;
                 break;
             }
             c=Character.codePointBefore(label, labelLimit);
             labelLimit-=Character.charCount(c);
             int dir=UBiDiProps.INSTANCE.getClass(c);
             if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) {
                 lastMask=U_MASK(dir);
                 break;
             }
         }
         // 3. In an RTL label, the end of the label must be a character with
         // BIDI property R, AL, EN or AN, followed by zero or more
         // characters with BIDI property NSM.
         // 6. In an LTR label, the end of the label must be a character with
         // BIDI property L or EN, followed by zero or more characters with
         // BIDI property NSM.
         if( (firstMask&L_MASK)!=0 ?
                 (lastMask&~L_EN_MASK)!=0 :
                 (lastMask&~R_AL_EN_AN_MASK)!=0
         ) {
             setNotOkBiDi(info);
         }
         // Add the directionalities of the intervening characters.
         // The trailing NSMs skipped above are not included; NSM is allowed in
         // both LTR and RTL labels.
         int mask=firstMask|lastMask;
         while(i<labelLimit) {
             c=Character.codePointAt(label, i);
             i+=Character.charCount(c);
             mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c));
         }
         if((firstMask&L_MASK)!=0) {
             // 5. In an LTR label, only characters with the BIDI properties L, EN,
             // ES, CS, ET, ON, BN and NSM are allowed.
             if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
                 setNotOkBiDi(info);
             }
         } else {
             // 2. In an RTL label, only characters with the BIDI properties R, AL,
             // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
             if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
                 setNotOkBiDi(info);
             }
             // 4. In an RTL label, if an EN is present, no AN may be present, and
             // vice versa.
             if((mask&EN_AN_MASK)==EN_AN_MASK) {
                 setNotOkBiDi(info);
             }
         }
         // An RTL label is a label that contains at least one character of type
         // R, AL or AN. [...]
         // A "BIDI domain name" is a domain name that contains at least one RTL
         // label. [...]
         // The following rule, consisting of six conditions, applies to labels
         // in BIDI domain names.
         if((mask&R_AL_AN_MASK)!=0) {
             setBiDi(info);
         }
     }

     // BiDi check for the ASCII prefix of a BiDi domain name.
     // Labels in the ASCII prefix are always LTR labels.

     // Parts of the IDNA2008 BiDi rule that matter for ASCII labels:
     // 1. The first character must be a character with BIDI property L [...]
     // 5. In an LTR label, only characters with the BIDI properties L, EN,
     // ES, CS, ET, ON, BN and NSM are allowed.
     // 6. In an LTR label, the end of the label must be a character with
     // BIDI property L or EN [...]

     /**
      * Checks the labels in the first length chars of s against the ASCII-relevant
      * parts of the BiDi rule. Meant for the mapped ASCII prefix, which
      * cannot contain uppercase A-Z and whose last char s[length-1] must be a dot.
      *
      * @param s the text to check
      * @param length number of chars of s to look at
      * @return false if a label does not start with a-z, does not end with a-z or 0-9,
      *         or has one of the controls U+0009..U+000D or U+001C..U+0020 after its
      *         first character; true otherwise
      */
     private static boolean
     isASCIIOkBiDi(CharSequence s, int length) {
         int start=0;  // start of the current label
         int pos=0;
         while(pos<length) {
             final char ch=s.charAt(pos);
             if(ch=='.') {
                 // End of a label: look at its last character, unless the label is empty.
                 if(pos>start) {
                     final char last=s.charAt(pos-1);
                     final boolean isLetter='a'<=last && last<='z';
                     final boolean isDigit='0'<=last && last<='9';
                     if(!isLetter && !isDigit) {
                         // The label does not end with an L or EN.
                         return false;
                     }
                 }
                 start=pos+1;
             } else if(pos==start) {
                 if(ch<'a' || 'z'<ch) {
                     // The label does not start with an L.
                     return false;
                 }
             } else if(ch<=0x20) {
                 if(ch>=0x1c || (ch>=9 && ch<=0xd)) {
                     // A B, S or WS character after the first character of the label.
                     return false;
                 }
             }
             ++pos;
         }
         return true;
     }

     /**
      * Checks the IDNA2008 CONTEXTJ rules for every ZERO WIDTH NON-JOINER (U+200C)
      * and ZERO WIDTH JOINER (U+200D) in one label.
      * <p>
      * Only the chars label[labelStart, labelStart+labelLength) are examined;
      * the backward scan for the ZWNJ precontext stops at labelStart and never
      * looks at text in front of the label.
      *
      * @param label the text containing the label
      * @param labelStart index of the first char of the label
      * @param labelLength number of chars in the label
      * @return false if some ZWNJ or ZWJ in the label does not satisfy its rule,
      *         true otherwise (including when the label contains neither)
      */
     private boolean
     isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
         // [IDNA2008-Tables]
         // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
         int labelLimit=labelStart+labelLength;
         for(int i=labelStart; i<labelLimit; ++i) {
             if(label.charAt(i)==0x200c) {
                 // Appendix A.1. ZERO WIDTH NON-JOINER
                 // Rule Set:
                 //  False;
                 //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
                 //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
                 //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
                 if(i==labelStart) {
                     // There is no character before the ZWNJ.
                     return false;
                 }
                 int c;
                 int j=i;
                 c=Character.codePointBefore(label, j);
                 j-=Character.charCount(c);
                 // Combining class 9 is Virama.
                 if(uts46Norm2.getCombiningClass(c)==9) {
                     continue;
                 }
                 // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
                 for(;;) {
                     /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
                     if(type==UCharacter.JoiningType.TRANSPARENT) {
                         // Stop at the start of this label, not at the start of the
                         // whole text: characters before labelStart belong to
                         // another label (or to a separator) and are not context.
                         if(j<=labelStart) {
                             return false;
                         }
                         c=Character.codePointBefore(label, j);
                         j-=Character.charCount(c);
                     } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
                         break;  // precontext fulfilled
                     } else {
                         return false;
                     }
                 }
                 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
                 for(j=i+1;;) {
                     if(j==labelLimit) {
                         return false;
                     }
                     c=Character.codePointAt(label, j);
                     j+=Character.charCount(c);
                     /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
                     if(type==UCharacter.JoiningType.TRANSPARENT) {
                         // just skip this character
                     } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
                         break;  // postcontext fulfilled
                     } else {
                         return false;
                     }
                 }
             } else if(label.charAt(i)==0x200d) {
                 // Appendix A.2. ZERO WIDTH JOINER (U+200D)
                 // Rule Set:
                 //  False;
                 //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
                 if(i==labelStart) {
                     // There is no character before the ZWJ.
                     return false;
                 }
                 int c=Character.codePointBefore(label, i);
                 if(uts46Norm2.getCombiningClass(c)!=9) {
                     return false;
                 }
             }
         }
         return true;
     }

     /**
      * Checks the CONTEXTO rules (Appendix A.3 through A.9) for one label.
      * Nothing is returned; every violation is recorded through
      * {@code addLabelError(info, ...)}.
      *
      * @param label the text that contains the label
      * @param labelStart index in {@code label} of the first char of the label
      * @param labelLength number of chars in the label
      * @param info receives CONTEXTO_PUNCTUATION and CONTEXTO_DIGITS errors
      */
     private void
     checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
         final int lastIndex=labelStart+labelLength-1;  // index of the final char, inclusive
         // Arabic digit range seen most recently:
         // negative for U+0660..U+0669, positive for U+06F0..U+06F9, 0 for none yet.
         int digitRange=0;
         for(int pos=labelStart; pos<=lastIndex; ++pos) {
             final int unit=label.charAt(pos);
             if(unit<0xb7) {
                 continue;  // fast path: no rule applies below U+00B7
             }
             switch(unit) {
             case 0xb7: {
                 // Appendix A.3. MIDDLE DOT (U+00B7)
                 // Rule Set:
                 //  False;
                 //  If Before(cp) .eq.  U+006C And
                 //     After(cp) .eq.  U+006C Then True;
                 final boolean betweenLs=
                     labelStart<pos && label.charAt(pos-1)=='l' &&
                     pos<lastIndex && label.charAt(pos+1)=='l';
                 if(!betweenLs) {
                     addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                 }
                 break;
             }
             case 0x375: {
                 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                 // Rule Set:
                 //  False;
                 //  If Script(After(cp)) .eq.  Greek Then True;
                 boolean greekFollows=false;
                 if(pos<lastIndex) {
                     final int next=Character.codePointAt(label, pos+1);
                     greekFollows=UScript.getScript(next)==UScript.GREEK;
                 }
                 if(!greekFollows) {
                     addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                 }
                 break;
             }
             case 0x5f3:
             case 0x5f4: {
                 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                 // Rule Set (same for both):
                 //  False;
                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
                 boolean hebrewPrecedes=false;
                 if(labelStart<pos) {
                     final int previous=Character.codePointBefore(label, pos);
                     hebrewPrecedes=UScript.getScript(previous)==UScript.HEBREW;
                 }
                 if(!hebrewPrecedes) {
                     addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                 }
                 break;
             }
             case 0x30fb: {
                 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
                 // Rule Set:
                 //  False;
                 //  For All Characters:
                 //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
                 //  End For;
                 // Scan the whole label, by code point, for one qualifying character.
                 boolean foundKanaOrHan=false;
                 int scan=labelStart;
                 while(scan<=lastIndex) {
                     final int cp=Character.codePointAt(label, scan);
                     final int script=UScript.getScript(cp);
                     if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
                         foundKanaOrHan=true;
                         break;
                     }
                     scan+=Character.charCount(cp);
                 }
                 if(!foundKanaOrHan) {
                     addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                 }
                 break;
             }
             default:
                 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                 // Rule Set:
                 //  True;
                 //  For All Characters:
                 //    If cp .in. 06F0..06F9 Then False;
                 //  End For;
                 //
                 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                 // Rule Set:
                 //  True;
                 //  For All Characters:
                 //    If cp .in. 0660..0669 Then False;
                 //  End For;
                 if(0x660<=unit && unit<=0x669) {
                     if(digitRange>0) {
                         addLabelError(info, Error.CONTEXTO_DIGITS);
                     }
                     digitRange=-1;
                 } else if(0x6f0<=unit && unit<=0x6f9) {
                     if(digitRange<0) {
                         addLabelError(info, Error.CONTEXTO_DIGITS);
                     }
                     digitRange=1;
                 }
                 break;
             }
         }
     }

     // TODO: make public(?) -- in C, these are public in uchar.h
     /**
      * Returns the result of shifting 1 left by {@code x},
      * that is, an int with the single bit for {@code x} set.
      */
     private static int U_MASK(int x) {
         final int singleBit=1<<x;
         return singleBit;
     }
     /**
      * Returns 1 shifted left by the value of {@code UCharacter.getType(c)},
      * for testing the type of {@code c} against a set of type bits.
      */
     private static int U_GET_GC_MASK(int c) {
         final int type=UCharacter.getType(c);
         return 1<<type;
     }
     /**
      * Combined mask of the three mark categories: NON_SPACING_MARK,
      * ENCLOSING_MARK and COMBINING_SPACING_MARK.
      * Declared final because it is a constant and must never be reassigned.
      */
     private static final int U_GC_M_MASK=
         U_MASK(UCharacterCategory.NON_SPACING_MARK)|
         U_MASK(UCharacterCategory.ENCLOSING_MARK)|
         U_MASK(UCharacterCategory.COMBINING_SPACING_MARK);
 }