main/classes/core/src/com/ibm/icu/text/CharsetRecog_Unicode.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 1996-2013, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
  */

 package com.ibm.icu.text;

 /**
  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
  * BOM will be used if it is present.
  */
 abstract class CharsetRecog_Unicode extends CharsetRecognizer {

     /* (non-Javadoc)
      * Returns the name of the charset that this recognizer reports. The
      * concrete recognizers nested in this class return "UTF-16BE",
      * "UTF-16LE", "UTF-32BE" and "UTF-32LE".
      * @see com.ibm.icu.text.CharsetRecognizer#getName()
      */
     abstract String getName();

     /* (non-Javadoc)
      * Examines the raw input held by the detector and reports how well it
      * fits this recognizer's charset. The implementations in this file
      * return a CharsetMatch carrying a confidence between 1 and 100, or
      * null when the computed confidence is zero.
      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
     abstract CharsetMatch match(CharsetDetector det);

     /**
      * Combines two bytes into a single unsigned 16-bit code unit.
      *
      * @param hi byte that supplies bits 15..8 of the result
      * @param lo byte that supplies bits 7..0 of the result
      * @return the combined value, always in the range 0..0xFFFF
      */
     static int codeUnit16FromBytes(byte hi, byte lo) {
         // Mask first so that negative bytes do not sign-extend into the upper bits.
         final int upperBits = (hi & 0xff) << 8;
         final int lowerBits = lo & 0xff;
         return upperBits | lowerBits;
     }

     // Confidence heuristic applied to each sampled UTF-16 code unit. Deliberately crude.
     //   A NUL code unit lowers the confidence by 10: NULs are rare in real text but show
     //     up constantly when the data is actually UTF-32.
     //   A line feed (0x0a) or any code unit in 0x20..0xff raises it by 10: a zero high
     //     byte is a good hint for UTF-16, even though it could also belong to a UTF-32 code.
     //   Every other code unit leaves the confidence unchanged.
     // The returned value is always clamped to the range 0..100.
     static int adjustConfidence(int codeUnit, int confidence) {
         final boolean isNul = (codeUnit == 0);
         final boolean isLineFeedOrEightBit =
                 codeUnit == 0x0a || (0x20 <= codeUnit && codeUnit <= 0xff);

         int adjusted = confidence;
         if (isNul) {
             adjusted = confidence - 10;
         } else if (isLineFeedOrEightBit) {
             adjusted = confidence + 10;
         }
         return Math.max(0, Math.min(100, adjusted));
     }

     /**
      * Recognizer for big-endian UTF-16. A leading byte order mark (FE FF)
      * yields full confidence; otherwise up to the first 30 bytes are sampled
      * as 16-bit code units and scored with adjustConfidence().
      */
     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
     {
         String getName()
         {
             return "UTF-16BE";
         }

         /**
          * Scores the detector's raw input as big-endian UTF-16.
          *
          * @param det detector holding the raw input bytes and their length
          * @return a match with confidence 1..100, or null when the confidence is 0
          */
         CharsetMatch match(CharsetDetector det)
         {
             byte[] input = det.fRawInput;
             int confidence = 10;

             // Bound the sample by det.fRawLength, the same length the UTF-32 recognizer
             // in this file uses, rather than by the capacity of the raw buffer alone.
             // NOTE(review): presumably the buffer can be larger than the data actually
             // supplied (e.g. stream input), in which case its trailing zero bytes would
             // be scored as NULs -- confirm against CharsetDetector.
             int bytesToCheck = Math.min(Math.min(det.fRawLength, input.length), 30);
             for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
                 int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
                 if (charIndex == 0 && codeUnit == 0xFEFF) {
                     // Byte order mark in this byte order: no further sampling needed.
                     confidence = 100;
                     break;
                 }
                 confidence = adjustConfidence(codeUnit, confidence);
                 if (confidence == 0 || confidence == 100) {
                     // The score has saturated at one end of the range; stop sampling.
                     break;
                 }
             }
             // Fewer than two code units and no BOM: too little evidence to report a match.
             if (bytesToCheck < 4 && confidence < 100) {
                 confidence = 0;
             }
             if (confidence > 0) {
                 return new CharsetMatch(det, this, confidence);
             }
             return null;
         }
     }

     /**
      * Recognizer for little-endian UTF-16. A leading byte order mark (FF FE)
      * yields full confidence; otherwise up to the first 30 bytes are sampled
      * as 16-bit code units and scored with adjustConfidence().
      */
     static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
     {
         String getName()
         {
             return "UTF-16LE";
         }

         /**
          * Scores the detector's raw input as little-endian UTF-16.
          *
          * @param det detector holding the raw input bytes and their length
          * @return a match with confidence 1..100, or null when the confidence is 0
          */
         CharsetMatch match(CharsetDetector det)
         {
             byte[] input = det.fRawInput;
             int confidence = 10;

             // Bound the sample by det.fRawLength, the same length the UTF-32 recognizer
             // in this file uses, rather than by the capacity of the raw buffer alone.
             // NOTE(review): presumably the buffer can be larger than the data actually
             // supplied (e.g. stream input), in which case its trailing zero bytes would
             // be scored as NULs -- confirm against CharsetDetector.
             int bytesToCheck = Math.min(Math.min(det.fRawLength, input.length), 30);
             for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
                 // Little-endian: the second byte of each pair is the high byte.
                 int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
                 if (charIndex == 0 && codeUnit == 0xFEFF) {
                     // Byte order mark in this byte order: no further sampling needed.
                     confidence = 100;
                     break;
                 }
                 confidence = adjustConfidence(codeUnit, confidence);
                 if (confidence == 0 || confidence == 100) {
                     // The score has saturated at one end of the range; stop sampling.
                     break;
                 }
             }
             // Fewer than two code units and no BOM: too little evidence to report a match.
             if (bytesToCheck < 4 && confidence < 100) {
                 confidence = 0;
             }
             if (confidence > 0) {
                 return new CharsetMatch(det, this, confidence);
             }
             return null;
         }
     }

     /**
      * Matching logic shared by the two UTF-32 recognizers. Subclasses supply
      * the byte order through getChar() and the charset name through getName().
      */
     static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
     {
         /**
          * Assembles the four bytes starting at index into one 32-bit value,
          * using the subclass's byte order.
          */
         abstract int getChar(byte[] input, int index);

         abstract String getName();

         /**
          * Scores the detector's raw input as UTF-32 by counting valid and
          * invalid 32-bit units and checking for a leading byte order mark.
          *
          * @param det detector holding the raw input bytes and their length
          * @return a match with confidence 25, 80 or 100, or null when the
          *         input is shorter than four bytes or the confidence is 0
          */
         CharsetMatch match(CharsetDetector det)
         {
             byte[] input   = det.fRawInput;
             int limit      = (det.fRawLength / 4) * 4;   // whole 4-byte units only
             int numValid   = 0;
             int numInvalid = 0;
             boolean hasBOM = false;
             int confidence = 0;

             if (limit==0) {
                 return null;
             }
             if (getChar(input, 0) == 0x0000FEFF) {
                 hasBOM = true;
             }

             for(int i = 0; i < limit; i += 4) {
                 int ch = getChar(input, i);

                 // Invalid: negative (top bit set), beyond the last code point U+10FFFF,
                 // or a surrogate. U+10FFFF itself is inside the code space, so the
                 // upper bound is exclusive only above it.
                 if (ch < 0 || ch > 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
                     numInvalid += 1;
                 } else {
                     numValid += 1;
                 }
             }


             // Cook up some sort of confidence score, based on presence of a BOM
             //    and the existence of valid and/or invalid multi-byte sequences.
             if (hasBOM && numInvalid==0) {
                 confidence = 100;
             } else if (hasBOM && numValid > numInvalid*10) {
                 confidence = 80;
             } else if (numValid > 3 && numInvalid == 0) {
                 confidence = 100;
             } else if (numValid > 0 && numInvalid == 0) {
                 confidence = 80;
             } else if (numValid > numInvalid*10) {
                 // Probably corrupt UTF-32 data.  Valid sequences aren't likely by chance.
                 confidence = 25;
             }

             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }

     /**
      * UTF-32 recognizer for big-endian byte order.
      */
     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
     {
         /**
          * Reads four bytes starting at index, most significant byte first.
          */
         int getChar(byte[] input, int index)
         {
             int value = 0;
             // Shift in one byte at a time, walking forward through the four bytes.
             for (int offset = 0; offset < 4; offset++) {
                 value = (value << 8) | (input[index + offset] & 0xFF);
             }
             return value;
         }

         String getName()
         {
             final String charsetName = "UTF-32BE";
             return charsetName;
         }
     }


     /**
      * UTF-32 recognizer for little-endian byte order.
      */
     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
     {
         /**
          * Reads four bytes starting at index, least significant byte first.
          */
         int getChar(byte[] input, int index)
         {
             int value = 0;
             // Shift in one byte at a time, walking backward from the fourth byte,
             // which holds the most significant bits.
             for (int offset = 3; offset >= 0; offset--) {
                 value = (value << 8) | (input[index + offset] & 0xFF);
             }
             return value;
         }

         String getName()
         {
             final String charsetName = "UTF-32LE";
             return charsetName;
         }
     }
 }
	/*
	*******************************************************************************
	* Copyright (C) 1996-2013, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	*/

	package com.ibm.icu.text;

	/**
	* This class matches UTF-16 and UTF-32, both big- and little-endian. The
	* BOM will be used if it is present.
	*/
	abstract class CharsetRecog_Unicode extends CharsetRecognizer {

	/* (non-Javadoc)
	* @see com.ibm.icu.text.CharsetRecognizer#getName()
	*/
	abstract String getName();

	/* (non-Javadoc)
	* @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
	*/
	abstract CharsetMatch match(CharsetDetector det);

	static int codeUnit16FromBytes(byte hi, byte lo) {
	return ((hi & 0xff) << 8) \| (lo & 0xff);
	}

	// UTF-16 confidence calculation. Very simple minded, but better than nothing.
	// Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
	// and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
	// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
	// NULs should be rare in actual text.
	static int adjustConfidence(int codeUnit, int confidence) {
	if (codeUnit == 0) {
	confidence -= 10;
	} else if ((codeUnit >= 0x20 && codeUnit <= 0xff) \|\| codeUnit == 0x0a) {
	confidence += 10;
	}
	if (confidence < 0) {
	confidence = 0;
	} else if (confidence > 100) {
	confidence = 100;
	}
	return confidence;
	}

	static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
	{
	String getName()
	{
	return "UTF-16BE";
	}

	CharsetMatch match(CharsetDetector det)
	{
	byte[] input = det.fRawInput;
	int confidence = 10;

	int bytesToCheck = Math.min(input.length, 30);
	for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
	int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
	if (charIndex == 0 && codeUnit == 0xFEFF) {
	confidence = 100;
	break;
	}
	confidence = adjustConfidence(codeUnit, confidence);
	if (confidence == 0 \|\| confidence == 100) {
	break;
	}
	}
	if (bytesToCheck < 4 && confidence < 100) {
	confidence = 0;
	}
	if (confidence > 0) {
	return new CharsetMatch(det, this, confidence);
	}
	return null;
	}
	}

	static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
	{
	String getName()
	{
	return "UTF-16LE";
	}

	CharsetMatch match(CharsetDetector det)
	{
	byte[] input = det.fRawInput;
	int confidence = 10;

	int bytesToCheck = Math.min(input.length, 30);
	for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
	int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
	if (charIndex == 0 && codeUnit == 0xFEFF) {
	confidence = 100;
	break;
	}
	confidence = adjustConfidence(codeUnit, confidence);
	if (confidence == 0 \|\| confidence == 100) {
	break;
	}
	}
	if (bytesToCheck < 4 && confidence < 100) {
	confidence = 0;
	}
	if (confidence > 0) {
	return new CharsetMatch(det, this, confidence);
	}
	return null;
	}
	}

	static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
	{
	abstract int getChar(byte[] input, int index);

	abstract String getName();

	CharsetMatch match(CharsetDetector det)
	{
	byte[] input = det.fRawInput;
	int limit = (det.fRawLength / 4) * 4;
	int numValid = 0;
	int numInvalid = 0;
	boolean hasBOM = false;
	int confidence = 0;

	if (limit==0) {
	return null;
	}
	if (getChar(input, 0) == 0x0000FEFF) {
	hasBOM = true;
	}

	for(int i = 0; i < limit; i += 4) {
	int ch = getChar(input, i);

	if (ch < 0 \|\| ch >= 0x10FFFF \|\| (ch >= 0xD800 && ch <= 0xDFFF)) {
	numInvalid += 1;
	} else {
	numValid += 1;
	}
	}


	// Cook up some sort of confidence score, based on presence of a BOM
	// and the existence of valid and/or invalid multi-byte sequences.
	if (hasBOM && numInvalid==0) {
	confidence = 100;
	} else if (hasBOM && numValid > numInvalid*10) {
	confidence = 80;
	} else if (numValid > 3 && numInvalid == 0) {
	confidence = 100;
	} else if (numValid > 0 && numInvalid == 0) {
	confidence = 80;
	} else if (numValid > numInvalid*10) {
	// Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
	confidence = 25;
	}

	return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
	}
	}

	static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
	{
	int getChar(byte[] input, int index)
	{
	return (input[index + 0] & 0xFF) << 24 \| (input[index + 1] & 0xFF) << 16 \|
	(input[index + 2] & 0xFF) << 8 \| (input[index + 3] & 0xFF);
	}

	String getName()
	{
	return "UTF-32BE";
	}
	}


	static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
	{
	int getChar(byte[] input, int index)
	{
	return (input[index + 3] & 0xFF) << 24 \| (input[index + 2] & 0xFF) << 16 \|
	(input[index + 1] & 0xFF) << 8 \| (input[index + 0] & 0xFF);
	}

	String getName()
	{
	return "UTF-32LE";
	}
	}
	}