main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java - external/github.com/unicode-org/icu - Git at Google

 /**
  *******************************************************************************
  * Copyright (C) 2005-2011, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
 package com.ibm.icu.dev.test.charsetdet;

 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;

 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;


 /**
  * @author andy
  */
 public class TestCharsetDetector extends TestFmwk
 {

     /**
      * Creates the test instance. The constructor body adds nothing beyond
      * the superclass construction.
      */
     public TestCharsetDetector()
     {
         super();
     }

     /**
      * Command-line entry point: creates a test instance and calls
      * {@code run(args)} on it. Any exception that escapes is printed to
      * standard error instead of being propagated.
      *
      * @param args arguments handed straight to {@code run}
      */
     public static void main(String[] args) {
         try {
             new TestCharsetDetector().run(args);
         } catch (Exception e) {
             e.printStackTrace();
         }
     }

     /**
      * Reports a test error when the given condition is false. The message
      * names the source file and line of the code that called this method.
      *
      * @param exp the condition that is expected to hold
      */
     private void CheckAssert(boolean exp) {
         if (exp) {
             return;
         }

         // Frame 0 of the trace is this method; frame 1 is the caller whose
         // assertion failed.
         StackTraceElement failPoint = new Exception().getStackTrace()[1];
         errln("Test failure in file " + failPoint.getFileName() +
               " at line " + failPoint.getLineNumber());
     }

     /**
      * Drains a reader into a string, reading up to 1024 characters at a time.
      *
      * @param reader the source of characters
      * @return everything read from {@code reader}, or {@code null} (after
      *         reporting a test error) if reading throws
      */
     private String stringFromReader(Reader reader)
     {
         final int chunkSize = 1024;
         char[] chunk = new char[chunkSize];
         StringBuffer collected = new StringBuffer();

         try {
             // read() returns a negative count once the reader is exhausted.
             for (int count = reader.read(chunk, 0, chunkSize);
                  count >= 0;
                  count = reader.read(chunk, 0, chunkSize)) {
                 collected.append(chunk, 0, count);
             }
         } catch (Exception e) {
             errln("stringFromReader() failed: " + e.toString());
             return null;
         }

         return collected.toString();
     }

     /**
      * Checks that a detector can be constructed and that the list of
      * detectable charsets is non-empty and contains no empty names.
      */
     public void TestConstruction() {
         CharsetDetector det = new CharsetDetector();
         if (det == null) {
             errln("Could not construct a charset detector");
         }

         String[] charsetNames = CharsetDetector.getAllDetectableCharsets();
         CheckAssert(charsetNames.length != 0);

         for (int idx = 0; idx < charsetNames.length; idx++) {
             CheckAssert(!charsetNames[idx].equals(""));
         }
     }

     /**
      * Exercises the input filter on French text buried in English-looking
      * markup: with the filter enabled the language must be reported as
      * "fr", and with it disabled as "en".
      *
      * @throws Exception if encoding the sample text fails
      */
     public void TestInputFilter() throws Exception
     {
         String markedUp = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
         byte[] encoded = markedUp.getBytes("ISO-8859-1");
         CharsetDetector det = new CharsetDetector();

         // Filter on: the tags are expected to be ignored.
         det.enableInputFilter(true);
         if (!det.inputFilterEnabled()) {
             errln("input filter should be enabled");
         }

         det.setText(encoded);
         CharsetMatch filtered = det.detect();
         if (!filtered.getLanguage().equals("fr")) {
             errln("input filter did not strip markup!");
         }

         // Filter off: the tag names are expected to dominate.
         det.enableInputFilter(false);
         det.setText(encoded);
         CharsetMatch unfiltered = det.detect();
         if (!unfiltered.getLanguage().equals("en")) {
             errln("unfiltered input did not detect as English!");
         }
     }

     /**
      * Round-trips a string through UTF-8 bytes and checks that both
      * {@code getString} and {@code getReader} on the detector give back the
      * original text when UTF-8 is passed as the declared encoding.
      *
      * @throws Exception if encoding or decoding the sample text fails
      */
     public void TestUTF8() throws Exception {
         String original = "This is a string with some non-ascii characters that will " +
                           "be converted to UTF-8, then shoved through the detection process.  " +
                           "\u0391\u0392\u0393\u0394\u0395" +
                           "Sure would be nice if our source could contain Unicode directly!";
         byte[] utf8 = original.getBytes("UTF-8");
         CharsetDetector det = new CharsetDetector();

         String viaGetString = det.getString(utf8, "UTF-8");
         CheckAssert(original.equals(viaGetString));

         Reader viaGetReader = det.getReader(new ByteArrayInputStream(utf8), "UTF-8");
         CheckAssert(original.equals(stringFromReader(viaGetReader)));

         det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
     }

     /**
      * Encodes an Arabic sample as big-endian and little-endian UTF-16 and
      * checks that the detector names the matching charset for each. Also
      * reads the confidence and match type of the little-endian match
      * (Jitterbug 4451, for coverage).
      *
      * @throws Exception if encoding the sample text fails
      */
     public void TestUTF16() throws Exception
     {
         // The first character used to lack the backslash of its Unicode
         // escape, so the sample began with the ASCII text "u0623" instead of
         // the intended Arabic letter.
         String source =
                 "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
                 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";

         byte[] beBytes = source.getBytes("UnicodeBig");
         byte[] leBytes = source.getBytes("UnicodeLittle");
         CharsetDetector det = new CharsetDetector();
         CharsetMatch m;

         det.setText(beBytes);
         m = det.detect();

         if (! m.getName().equals("UTF-16BE")) {
             errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
         }

         det.setText(leBytes);
         m = det.detect();

         if (! m.getName().equals("UTF-16LE")) {
             errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
         }

         // Jitterbug 4451, for coverage. These checks apply to the
         // little-endian match obtained just above.
         int confidence = m.getConfidence();
         if (confidence != 100) {
             errln("Did not get the expected confidence level " + confidence);
         }
         int matchType = m.getMatchType();
         if (matchType != 0) {
             errln("Did not get the expected matchType level " + matchType);
         }
     }

     /**
      * Checks that English text containing C1-range bytes (curly quotes
      * encoded in windows-1252) is detected as windows-1252, while plain
      * English text without such bytes is detected as ISO-8859-1.
      *
      * @throws Exception if encoding the sample text fails
      */
     public void TestC1Bytes() throws Exception
     {
         String sISO =
             "This is a small sample of some English text. Just enough to be sure that it detects correctly.";

         String sWindows =
             "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

         byte[] bISO     = sISO.getBytes("ISO-8859-1");
         byte[] bWindows = sWindows.getBytes("windows-1252");

         CharsetDetector det = new CharsetDetector();
         CharsetMatch m;

         det.setText(bWindows);
         m = det.detect();

         // Compare by content: reference comparison of strings only works
         // when both sides happen to be the same interned instance.
         if (!"windows-1252".equals(m.getName())) {
             errln("Text with C1 bytes not correctly detected as windows-1252.");
             return;
         }

         det.setText(bISO);
         m = det.detect();

         if (!"ISO-8859-1".equals(m.getName())) {
             errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
         }
     }

     /**
      * Runs detection over very short byte strings (zero to four bytes) to
      * make sure it completes, logging the detected name for each.
      */
     public void TestShortInput() {
         // Test that detection with very short byte strings does not crash and burn.
         // The shortest input that should produce positive detection result is two bytes,
         //   a UTF-16 BOM.
         // TODO:  Detector confidence levels needs to be refined for very short input.
         //        Too high now, for some charsets that happen to be compatible with a few bytes of input.
         byte[][] shortBytes = {
             {},
             {(byte)0x0a},
             {(byte)'A', (byte)'B'},
             {(byte)'A', (byte)'B', (byte)'C'},
             {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
         };

         CharsetDetector det = new CharsetDetector();
         for (int i = 0; i < shortBytes.length; i++) {
             det.setText(shortBytes[i]);
             CharsetMatch found = det.detect();
             logln("i=" + i + " -> " + found.getName());
         }
     }

     /**
      * Feeds the detector inputs that end in partial or complete ISO-2022
      * escape sequences, or in a lone high byte, with ISO-2022-JP set as the
      * declared encoding, and compares each detected name with the expected
      * one. A {@code null} expectation means no match should be returned.
      * The test stops at the first wrong name.
      */
     public void TestBufferOverflow()
     {
         byte testStrings[][] = {
             /* A partial ISO-2022 shift state at the end */
             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b},
             /* A partial ISO-2022 shift state at the end */
             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24},
             /* A partial ISO-2022 shift state at the end */
             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28},
             /* A complete ISO-2022 shift state at the end with a bad one at the start */
             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44},
             /* A complete ISO-2022 shift state at the end */
             {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44},
             /* Could be a single byte shift-jis at the end */
             {(byte) 0xa1},
             /* Could be a single byte shift-jis at the end */
             {(byte) 0x74, (byte) 0x68, (byte) 0xa1},
             /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
             {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1}
         };

         // Expected charset name for each entry above, in the same order.
         String testResults[] = {
             "windows-1252",
             "windows-1252",
             "windows-1252",
             "windows-1252",
             "ISO-2022-JP",
             null,
             null,
             "ISO-8859-1"
         };

         CharsetDetector det = new CharsetDetector();
         det.setDeclaredEncoding("ISO-2022-JP");

         for (int idx = 0; idx < testStrings.length; idx++) {
             String expected = testResults[idx];

             det.setText(testStrings[idx]);
             CharsetMatch match = det.detect();

             if (match != null) {
                 boolean sameName = expected != null && expected.equals(match.getName());
                 if (!sameName) {
                     errln("Unexpectedly got " + match.getName() + " instead of " + expected +
                           " at index " + idx + " with confidence " + match.getConfidence());
                     return;
                 }
             } else if (expected != null) {
                 errln("Unexpectedly got no results at index " + idx);
             } else {
                 logln("Got no result as expected at index " + idx);
             }
         }
     }

     /**
      * Data-driven detection test. Loads {@code CharsetDetectionTests.xml}
      * from the class's resources, and for every {@code test-case} element
      * runs {@code checkEncoding} once per space-separated entry of its
      * {@code encodings} attribute, using the concatenated values of the
      * element's child nodes as the test text.
      *
      * A missing data file, or any exception while reading or parsing it, is
      * reported as a test error.
      */
     public void TestDetection()
     {
         try {
             //
             //  Open and read the test data file.
             //
             InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
             if (is == null) {
                 errln("Could not open test data file CharsetDetectionTests.xml");
                 return;
             }

             try {
                 // Set up an xml parser.
                 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

                 factory.setIgnoringComments(true);

                 DocumentBuilder builder = factory.newDocumentBuilder();

                 // Parse the xml content from the test case file.
                 Document doc = builder.parse(is, null);
                 Element root = doc.getDocumentElement();

                 NodeList testCases = root.getElementsByTagName("test-case");

                 // Process each test case
                 for (int n = 0; n < testCases.getLength(); n += 1) {
                     Node testCase = testCases.item(n);
                     NamedNodeMap attrs = testCase.getAttributes();
                     NodeList testData  = testCase.getChildNodes();
                     StringBuffer testText = new StringBuffer();
                     String id = attrs.getNamedItem("id").getNodeValue();
                     String encodings = attrs.getNamedItem("encodings").getNodeValue();

                     // Collect the test case text.
                     for (int t = 0; t < testData.getLength(); t += 1) {
                         Node textNode = testData.item(t);

                         testText.append(textNode.getNodeValue());
                     }

                     // Process test text with each encoding / language pair.
                     String testString = testText.toString();
                     String[] encodingList = encodings.split(" ");
                     for (int i = 0; i < encodingList.length; i += 1) {
                         checkEncoding(testString, encodingList[i], id);
                     }
                 }
             } finally {
                 // Release the resource stream whether or not parsing
                 // succeeded; a failure to close is reported by the outer
                 // catch like any other error.
                 is.close();
             }

         } catch (Exception e) {
             errln("exception while processing test cases: " + e.toString());
         }
     }

     /**
      * Runs detection on a detector that already has its text set and
      * verifies the result: the charset name, the language, and (except for
      * UTF-32 encodings) that both {@code getString()} and
      * {@code getReader()} reproduce the original text.
      *
      * @param det        detector whose input has already been set
      * @param testString the text that was encoded to produce the input
      * @param encoding   the expected charset name
      * @param language   the expected language, or {@code null} if the match
      *                   is expected to report no language
      * @param id         test-case identifier used in error messages
      * @throws Exception if decoding the match fails
      */
     private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
     {
         CharsetMatch m = det.detect();

         // detect() may return no match at all; report that as a failure
         // instead of dereferencing null.
         if (m == null) {
             errln(id + ": encoding detection failure - expected " + encoding + ", got no match");
             return;
         }

         if (! m.getName().equals(encoding)) {
             errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
             return;
         }

         // Null-safe comparison: either side may legitimately be null, and a
         // mismatch in nullness counts as a failure.
         String charsetMatchLanguage = m.getLanguage();
         boolean languageMatches = (language == null)
                 ? charsetMatchLanguage == null
                 : language.equals(charsetMatchLanguage);
         if (!languageMatches) {
             errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + charsetMatchLanguage);
         }

         // The round-trip checks below are skipped for UTF-32 encodings.
         if (encoding.startsWith("UTF-32")) {
             return;
         }

         String decoded = m.getString();

         if (! testString.equals(decoded)) {
             errln(id + ", " + encoding + ": getString() didn't return the original string!");
         }

         decoded = stringFromReader(m.getReader());

         if (! testString.equals(decoded)) {
             errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
         }
     }

     /**
      * Encodes the test text in the requested charset and checks detection
      * on the result twice: once with the bytes set directly and once with
      * them supplied through an input stream.
      *
      * Encodings the runtime cannot convert to are skipped silently. Any
      * other exception is reported as a test error and its stack trace is
      * printed.
      *
      * @param testString the text to encode and detect
      * @param encoding   an encoding name, optionally followed by
      *                   {@code /language}
      * @param id         test-case identifier used in error messages
      */
     private void checkEncoding(String testString, String encoding, String id)
     {
         String[] parts = encoding.split("/");
         String enc = parts[0];
         String lang = (parts.length > 1) ? parts[1] : null;

         try {
             CharsetDetector det = new CharsetDetector();
             byte[] bytes = null;
             String from = enc;

             // getBytes() never yields null, so the loop ends as soon as one
             // conversion succeeds.
             while (bytes == null) {
                 try {
                     bytes = testString.getBytes(from);
                 } catch (UnsupportedOperationException uoe) {
                     // In some runtimes, the ISO-2022-CN converter
                     // only converts *to* Unicode - we have to use
                     // x-ISO-2022-CN-GB to convert *from* Unicode.
                     if (!from.equals("ISO-2022-CN")) {
                         // Ignore any other converters that can't
                         // convert from Unicode.
                         return;
                     }
                     from = "x-ISO-2022-CN-GB";
                 } catch (UnsupportedEncodingException uee) {
                     // Ignore any encodings that this runtime
                     // doesn't support.
                     return;
                 }
             }

             det.setText(bytes);
             checkMatch(det, testString, enc, lang, id);

             det.setText(new ByteArrayInputStream(bytes));
             checkMatch(det, testString, enc, lang, id);
         } catch (Exception e) {
             errln(id + ": " + e.toString() + "enc=" + enc);
             e.printStackTrace();
         }
     }

     /**
      * Encodes a block of Japanese punctuation, kana and kanji as EUC-JP and
      * checks that the detector reports charset "EUC-JP" and language "ja".
      *
      * @throws Exception if the runtime cannot encode to EUC-JP
      */
     public void TestJapanese() throws Exception {
         String s = "\u3000\u3001\u3002\u3003\u3005\u3006\u3007\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011\u3012\u3013\u3014" +
         "\u3015\u301C\u3041\u3042\u3043\u3044\u3045\u3046\u3047\u3048\u3049\u304A\u304B\u304C\u304D\u304E\u304F\u3050\u3051\u3052" +
         "\u3053\u3054\u3055\u3056\u3057\u3058\u3059\u305A\u305B\u305C\u305D\u305E\u305F\u3060\u3061\u3062\u3063\u3064\u3065\u3066" +
         "\u3067\u3068\u3069\u306A\u306B\u306C\u306D\u306E\u306F\u3070\u3071\u3072\u3073\u3074\u3075\u3076\u3077\u3078\u3079\u307A" +
         "\u307B\u307C\u307D\u307E\u307F\u3080\u3081\u3082\u3083\u3084\u3085\u3086\u3087\u3088\u3089\u308A\u308B\u308C\u308D\u308E" +
         "\u308F\u3090\u3091\u3092\u3093\u309B\u309C\u309D\u309E\u30A1\u30A2\u30A3\u30A4\u30A5\u30A6\u30A7\u30A8\u30A9\u30AA\u30AB" +
         "\u30AC\u30AD\u30AE\u30AF\u30B0\u30B1\u30B2\u30B3\u30B4\u30B5\u30B6\u30B7\u30B8\u30B9\u30BA\u30BB\u30BC\u30BD\u30BE\u30BF" +
         "\u30C0\u30C1\u30C2\u30C3\u30C4\u30C5\u30C6\u30C7\u30C8\u30C9\u30CA\u30CB\u30CC\u30CD\u30CE\u30CF\u30D0\u30D1\u30D2\u30D3" +
         "\u30D4\u30D5\u30D6\u30D7\u30D8\u30D9\u30DA\u30DB\u30DC\u30DD\u30DE\u30DF\u30E0\u30E1\u30E2\u30E3\u30E4\u30E5\u30E6\u30E7" +
         "\u30E8\u30E9\u30EA\u30EB\u30EC\u30ED\u30EE\u30EF\u30F0\u30F1\u30F2\u30F3\u30F4\u30F5\u30F6\u30FB\u30FC\u30FD\u30FE\u4E00" +
         "\u4E01\u4E02\u4E03\u4E04\u4E05\u4E07\u4E08\u4E09\u4E0A\u4E0B\u4E0C\u4E0D\u4E0E\u4E10\u4E11\u4E12\u4E14\u4E15\u4E16\u4E17" +
         "\u4E18\u4E19\u4E1E\u4E1F\u4E21\u4E23\u4E24\u4E26\u4E28\u4E2A\u4E2B\u4E2D\u4E2E\u4E2F\u4E30\u4E31\u4E32\u4E35\u4E36\u4E38" +
         "\u4E39\u4E3B\u4E3C\u4E3F\u4E40\u4E41\u4E42\u4E43\u4E44\u4E45\u4E47\u4E4B\u4E4D\u4E4E\u4E4F\u4E51\u4E55\u4E56\u4E57\u4E58" +
         "\u4E59\u4E5A\u4E5C\u4E5D\u4E5E\u4E5F\u4E62\u4E63\u4E68\u4E69\u4E71\u4E73\u4E74\u4E75\u4E79\u4E7E\u4E7F\u4E80\u4E82\u4E85" +
         "\u4E86\u4E88\u4E89\u4E8A\u4E8B\u4E8C";

         CharsetDetector det = new CharsetDetector();
         byte[] eucJpBytes = s.getBytes("EUC-JP");

         det.setText(eucJpBytes);
         CharsetMatch detected = det.detect();

         String detectedName = detected.getName();
         CheckAssert(detectedName.equals("EUC-JP"));

         // Tests "public String getLanguage()"
         CheckAssert(detected.getLanguage().equals("ja"));
     }

     /**
      * Checks detection of Arabic text in three forms: encoded as
      * windows-1256, and as two hard-coded IBM420 byte sequences expected to
      * be detected as "IBM420_rtl" and "IBM420_ltr" respectively. The
      * language is checked for the first two forms only.
      *
      * @throws Exception if the runtime cannot encode to windows-1256
      */
     public void TestArabic() throws Exception {
         String  s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
         "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
         "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644"  +
         "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
         "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
         "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
         "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
         "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
         "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
         "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
         "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
         "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
         "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";

         CharsetDetector det = new CharsetDetector();

         // --- windows-1256, produced by the runtime's own converter ---
         byte[] win1256Bytes = s.getBytes("windows-1256");
         det.setText(win1256Bytes);
         CharsetMatch win1256Match = det.detect();
         String win1256Name = win1256Match.getName();
         CheckAssert(win1256Name.equals("windows-1256"));

         // Tests "public String getLanguage()"
         CheckAssert(win1256Match.getLanguage().endsWith("ar"));

         // --- IBM420, right-to-left ---
         // We cannot rely on IBM420 converter in Sun Java, so the bytes that
         // s.getBytes("IBM420") would produce are spelled out here.
         byte[] ibm420RtlBytes = new byte[] {
                 (byte)0xCF, (byte)0x8D, (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0xBD, (byte)0xAB,
                 (byte)0x74, (byte)0x63, (byte)0x40, (byte)0x58, (byte)0x75, (byte)0x56, (byte)0xBB, (byte)0x67,
                 (byte)0x40, (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x9A,
                 (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x62, (byte)0x40, (byte)0xAB, (byte)0xDC, (byte)0x40,
                 (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77, (byte)0x62, (byte)0x40, (byte)0x56, (byte)0xB1,
                 (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x56, (byte)0xB1,
                 (byte)0xCF, (byte)0x8F, (byte)0xBD, (byte)0xDC, (byte)0x6B, (byte)0x40, (byte)0xBB, (byte)0x9A,
                 (byte)0x40, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x63, (byte)0xBF,
                 (byte)0x56, (byte)0x40, (byte)0x73, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x56, (byte)0x40,
                 (byte)0xB1, (byte)0xB1, (byte)0x56, (byte)0x69, (byte)0x63, (byte)0xDC, (byte)0x56, (byte)0x67,
                 (byte)0x56, (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x63, (byte)0x9E,
                 (byte)0xDC, (byte)0x75, (byte)0x62, (byte)0x40, (byte)0xB1, (byte)0xB1, (byte)0xBB, (byte)0x67,
                 (byte)0x63, (byte)0xBB, (byte)0x9A, (byte)0x40, (byte)0xCF, (byte)0xB1, (byte)0xB1, (byte)0x73,
                 (byte)0xCF, (byte)0xB1, (byte)0x62, (byte)0x4B, (byte)0x40, (byte)0x63, (byte)0xCF, (byte)0x77,
                 (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0x63, (byte)0x8F, (byte)0xCF, (byte)0x75,
                 (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77,
                 (byte)0x62, (byte)0x40, (byte)0x58, (byte)0xBF, (byte)0x73, (byte)0xAB, (byte)0x40, (byte)0x8D,
                 (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x80, (byte)0x58, (byte)0xAF, (byte)0x62,
                 (byte)0x40, (byte)0x49, (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0xB1, (byte)0x77,
                 (byte)0xAF, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x73, (byte)0xCF, (byte)0xB1, (byte)0x62,
                 (byte)0x40, (byte)0x56, (byte)0x77, (byte)0x75, (byte)0x56, (byte)0x55, (byte)0xDC, (byte)0xB1,
                 (byte)0x40, (byte)0x58, (byte)0xCF, (byte)0x67, (byte)0xBF, (byte)0x40, (byte)0x56, (byte)0xB1,
                 (byte)0xBB, (byte)0x71, (byte)0x56, (byte)0x8F, (byte)0x75, (byte)0x40, (byte)0x56, (byte)0xB1,
                 (byte)0x56, (byte)0xAD, (byte)0x63, (byte)0x8B, (byte)0x56, (byte)0x73, (byte)0xDC, (byte)0x62,
                 (byte)0x40, (byte)0xCF, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x67, (byte)0x63, (byte)0xBB,
                 (byte)0x56, (byte)0x9A, (byte)0xDC, (byte)0x62, (byte)0x4B,
         };
         det.setText(ibm420RtlBytes);
         CharsetMatch rtlMatch = det.detect();
         String rtlName = rtlMatch.getName();
         CheckAssert(rtlName.equals("IBM420_rtl"));

         // Tests "public String getLanguage()"
         CheckAssert(rtlMatch.getLanguage().endsWith("ar"));

         // --- IBM420, left-to-right ---
         // We cannot rely on IBM420 converter in Sun Java, so the bytes that
         // encoding the reversed string as IBM420 would produce are spelled
         // out here.
         byte[] ibm420LtrBytes = new byte[] {
                 (byte)0x4B, (byte)0x62, (byte)0xDC, (byte)0x9A, (byte)0x56, (byte)0xBB, (byte)0x63, (byte)0x67,
                 (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0xCF, (byte)0x40, (byte)0x62, (byte)0xDC, (byte)0x73,
                 (byte)0x56, (byte)0x8B, (byte)0x63, (byte)0xAD, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x40,
                 (byte)0x75, (byte)0x8F, (byte)0x56, (byte)0x71, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40,
                 (byte)0xBF, (byte)0x67, (byte)0xCF, (byte)0x58, (byte)0x40, (byte)0xB1, (byte)0xDC, (byte)0x55,
                 (byte)0x56, (byte)0x75, (byte)0x77, (byte)0x56, (byte)0x40, (byte)0x62, (byte)0xB1, (byte)0xCF,
                 (byte)0x73, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xAF, (byte)0x77, (byte)0xB1, (byte)0x40,
                 (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x49, (byte)0x40, (byte)0x62, (byte)0xAF, (byte)0x58,
                 (byte)0x80, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x8D, (byte)0x40, (byte)0xAB,
                 (byte)0x73, (byte)0xBF, (byte)0x58, (byte)0x40, (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52,
                 (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x75, (byte)0xCF, (byte)0x8F,
                 (byte)0x63, (byte)0xCF, (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x77, (byte)0xCF, (byte)0x63,
                 (byte)0x40, (byte)0x4B, (byte)0x62, (byte)0xB1, (byte)0xCF, (byte)0x73, (byte)0xB1, (byte)0xB1,
                 (byte)0xCF, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x63, (byte)0x67, (byte)0xBB, (byte)0xB1,
                 (byte)0xB1, (byte)0x40, (byte)0x62, (byte)0x75, (byte)0xDC, (byte)0x9E, (byte)0x63, (byte)0xBB,
                 (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x56, (byte)0x67, (byte)0x56, (byte)0xDC,
                 (byte)0x63, (byte)0x69, (byte)0x56, (byte)0xB1, (byte)0xB1, (byte)0x40, (byte)0x56, (byte)0xBB,
                 (byte)0x55, (byte)0x56, (byte)0x73, (byte)0x40, (byte)0x56, (byte)0xBF, (byte)0x63, (byte)0xBB,
                 (byte)0x55, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x40,
                 (byte)0x6B, (byte)0xDC, (byte)0xBD, (byte)0x8F, (byte)0xCF, (byte)0xB1, (byte)0x56, (byte)0x40,
                 (byte)0xBD, (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0xB1, (byte)0x56, (byte)0x40,
                 (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52, (byte)0xBB, (byte)0x40, (byte)0xDC, (byte)0xAB,
                 (byte)0x40, (byte)0x62, (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x9A, (byte)0x40, (byte)0xBD,
                 (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0x40, (byte)0x67, (byte)0xBB, (byte)0x56,
                 (byte)0x75, (byte)0x58, (byte)0x40, (byte)0x63, (byte)0x74, (byte)0xAB, (byte)0xBD, (byte)0xCF,
                 (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x8D, (byte)0xCF,
         };
         det.setText(ibm420LtrBytes);
         CharsetMatch ltrMatch = det.detect();
         String ltrName = ltrMatch.getName();
         CheckAssert(ltrName.equals("IBM420_ltr"));
     }

     /**
      * Checks detection of Hebrew text in four forms: ISO-8859-8 in the
      * original and reversed character order, and IBM424 in the original
      * and reversed order. Every form must report language "he"; for the
      * two IBM424 forms the match must also decode without throwing.
      *
      * @throws Exception if the runtime cannot encode to a required charset
      */
     public void TestHebrew() throws Exception {
         String  s =  "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
             "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
             "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
             "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
             "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
             "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
             "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
             "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
             "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
             "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
             "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
             "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
             " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
             " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
             " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
             "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";

         // ISO-8859-8, original character order.
         CharsetMatch isoMatch = _test1255(s);
         String isoName = isoMatch.getName();
         CheckAssert(isoName.equals("ISO-8859-8"));
         CheckAssert(isoMatch.getLanguage().equals("he"));

         // ISO-8859-8, reversed character order.
         CharsetMatch isoReversedMatch = _test1255_reverse(s);
         String isoReversedName = isoReversedMatch.getName();
         CheckAssert(isoReversedName.equals("ISO-8859-8"));
         CheckAssert(isoReversedMatch.getLanguage().equals("he"));

         // IBM424, original character order.
         CharsetMatch rtlMatch = _testIBM424_he_rtl(s);
         String rtlName = rtlMatch.getName();
         CheckAssert(rtlName.equals("IBM424_rtl"));
         CheckAssert(rtlMatch.getLanguage().equals("he"));
         try {
             rtlMatch.getString();
         } catch (Exception ex) {
             errln("Error getting string for charsetMatch: " + rtlName);
         }

         // IBM424, reversed character order.
         CharsetMatch ltrMatch = _testIBM424_he_ltr(s);
         String ltrName = ltrMatch.getName();
         CheckAssert(ltrName.equals("IBM424_ltr"));
         CheckAssert(ltrMatch.getLanguage().equals("he"));
         try {
             ltrMatch.getString();
         } catch (Exception ex) {
             errln("Error getting string for charsetMatch: " + ltrName);
         }
     }

     /**
      * Encodes the string as ISO-8859-8 and returns the detector's best match
      * for the resulting bytes.
      *
      * @param s text to encode
      * @return the result of {@code detect()} on the encoded bytes
      * @throws Exception if the runtime cannot encode to ISO-8859-8
      */
     private CharsetMatch _test1255(String s) throws Exception {
         byte[] encoded = s.getBytes("ISO-8859-8");
         CharsetDetector det = new CharsetDetector();
         det.setText(encoded);
         return det.detect();
     }

     /**
      * Reverses the string, encodes it as ISO-8859-8 and returns the
      * detector's best match for the resulting bytes.
      *
      * @param s text to reverse and encode
      * @return the result of {@code detect()} on the encoded bytes
      * @throws Exception if the runtime cannot encode to ISO-8859-8
      */
     private CharsetMatch _test1255_reverse(String s) throws Exception {
         String reversed = new StringBuffer(s).reverse().toString();
         byte[] encoded = reversed.getBytes("ISO-8859-8");

         CharsetDetector det = new CharsetDetector();
         det.setText(encoded);
         return det.detect();
     }

     /**
      * Encodes the string as IBM424 and returns the detector's best match for
      * the resulting bytes.
      *
      * @param s text to encode
      * @return the result of {@code detect()} on the encoded bytes
      * @throws Exception if the runtime cannot encode to IBM424
      */
     private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
         byte[] encoded = s.getBytes("IBM424");
         CharsetDetector det = new CharsetDetector();
         det.setText(encoded);
         return det.detect();
     }

     /**
      * Reverses the string, encodes it as IBM424 and returns the detector's
      * best match for the resulting bytes. Reversing the input is how the
      * left-to-right form of the text is produced.
      *
      * @param s text to reverse and encode
      * @return the result of {@code detect()} on the encoded bytes
      * @throws Exception if the runtime cannot encode to IBM424
      */
     private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
         String reversed = new StringBuffer(s).reverse().toString();
         byte[] encoded = reversed.getBytes("IBM424");

         CharsetDetector det = new CharsetDetector();
         det.setText(encoded);
         return det.detect();
     }

     /*
      * Test the method int match(CharsetDetector det) in CharsetRecog_UTF_16_LE
      *
      * Hands the detector four bytes, each equal to Byte.MIN_VALUE (0x80),
      * and reports an error if setText() throws.
      *
      * NOTE(review): only setText() is called here, never detect(), and none
      * of the bytes is 0x00 - confirm that this actually reaches the
      * "input.length>=4 && input[2] == 0x00 && input[3] == 0x00" branch of
      * match(CharsetDetector) that the test is meant to cover.
      */
     public void TestCharsetRecog_UTF_16_LE_Match() {
         byte[] in = {
             Byte.MIN_VALUE,
             Byte.MIN_VALUE,
             Byte.MIN_VALUE,
             Byte.MIN_VALUE
         };
         CharsetDetector cd = new CharsetDetector();

         try {
             cd.setText(in);
         } catch (Exception e) {
             errln("CharsetRecog_UTF_16_LE.match(CharsetDetector) was not suppose to return an exception.");
         }
     }
 }