| //##header |
| /** |
| ******************************************************************************* |
| * Copyright (C) 2005-2006, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.dev.test.charsetdet; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.io.UnsupportedEncodingException; |
| |
| import com.ibm.icu.dev.test.TestFmwk; |
| import com.ibm.icu.text.CharsetDetector; |
| import com.ibm.icu.text.CharsetMatch; |
| |
| //#ifdef FOUNDATION |
| //##import com.ibm.icu.impl.Utility; |
| //#endif |
| |
| import javax.xml.parsers.*; |
| import org.w3c.dom.*; |
| |
| |
| /** |
| * @author andy |
| */ |
| public class TestCharsetDetector extends TestFmwk |
| { |
| |
/**
 * Creates the test object. No state is set up here; each test method
 * builds the detector instances it needs.
 */
public TestCharsetDetector()
{
    super();
}
| |
| public static void main(String[] args) { |
| try |
| { |
| TestCharsetDetector test = new TestCharsetDetector(); |
| test.run(args); |
| } |
| catch (Exception e) |
| { |
| e.printStackTrace(); |
| } |
| } |
| |
| private void CheckAssert(boolean exp) { |
| if (exp == false) { |
| String msg; |
| try { |
| throw new Exception(); |
| } |
| catch (Exception e) { |
| //#ifndef FOUNDATION |
| StackTraceElement failPoint = e.getStackTrace()[1]; |
| msg = "Test failure in file " + failPoint.getFileName() + |
| " at line " + failPoint.getLineNumber(); |
| //#else |
| //## msg = "Test failure " + e.getMessage() ; |
| //#endif |
| } |
| errln(msg); |
| } |
| |
| } |
| |
| private String stringFromReader(Reader reader) |
| { |
| StringBuffer sb = new StringBuffer(); |
| char[] buffer = new char[1024]; |
| int bytesRead = 0; |
| |
| try { |
| while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) { |
| sb.append(buffer, 0, bytesRead); |
| } |
| |
| return sb.toString(); |
| } catch (Exception e) { |
| errln("stringFromReader() failed: " + e.toString()); |
| return null; |
| } |
| } |
| |
| private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception |
| { |
| CharsetMatch m = det.detect(); |
| String decoded; |
| |
| if (! m.getName().equals(encoding)) { |
| errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName()); |
| return; |
| } |
| |
| String charsetMatchLanguage = m.getLanguage(); |
| if ((language != null && !charsetMatchLanguage.equals(language)) |
| || (language == null && charsetMatchLanguage != null) |
| || (language != null && charsetMatchLanguage == null)) |
| { |
| errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage()); |
| } |
| |
| if (encoding.startsWith("UTF-32")) { |
| return; |
| } |
| |
| decoded = m.getString(); |
| |
| if (! testString.equals(decoded)) { |
| errln(id + ", " + encoding + ": getString() didn't return the original string!"); |
| } |
| |
| decoded = stringFromReader(m.getReader()); |
| |
| if (! testString.equals(decoded)) { |
| errln(id + ", " + encoding + ": getReader() didn't yield the original string!"); |
| } |
| } |
| |
| private void checkEncoding(String testString, String encoding, String id) |
| { |
| String enc = null, lang = null; |
| //#ifndef FOUNDATION |
| String[] split = encoding.split("/"); |
| //#else |
| //## String[] split = Utility.split(encoding,'/'); |
| //#endif |
| |
| enc = split[0]; |
| |
| if (split.length > 1) { |
| lang = split[1]; |
| } |
| |
| try { |
| CharsetDetector det = new CharsetDetector(); |
| byte[] bytes; |
| |
| //if (enc.startsWith("UTF-32")) { |
| // UTF32 utf32 = UTF32.getInstance(enc); |
| |
| // bytes = utf32.toBytes(testString); |
| //} else { |
| String from = enc; |
| |
| while (true) { |
| try { |
| bytes = testString.getBytes(from); |
| } catch (UnsupportedOperationException uoe) { |
| // In some runtimes, the ISO-2022-CN converter |
| // only converts *to* Unicode - we have to use |
| // x-ISO-2022-CN-GB to convert *from* Unicode. |
| if (from.equals("ISO-2022-CN")) { |
| from = "x-ISO-2022-CN-GB"; |
| continue; |
| } |
| |
| // Ignore any other converters that can't |
| // convert from Unicode. |
| return; |
| } catch (UnsupportedEncodingException uee) { |
| // Ignore any encodings that this runtime |
| // doesn't support. |
| return; |
| } |
| |
| break; |
| } |
| //} |
| |
| det.setText(bytes); |
| checkMatch(det, testString, enc, lang, id); |
| |
| det.setText(new ByteArrayInputStream(bytes)); |
| checkMatch(det, testString, enc, lang, id); |
| } catch (Exception e) { |
| errln(id + ": " + e.toString() + "enc=" + enc); |
| e.printStackTrace(); |
| } |
| } |
| |
| public void TestConstruction() { |
| int i; |
| CharsetDetector det = new CharsetDetector(); |
| if(det==null){ |
| errln("Could not construct a charset detector"); |
| } |
| String [] charsetNames = CharsetDetector.getAllDetectableCharsets(); |
| CheckAssert(charsetNames.length != 0); |
| for (i=0; i<charsetNames.length; i++) { |
| CheckAssert(charsetNames[i].equals("") == false); |
| // System.out.println("\"" + charsetNames[i] + "\""); |
| } |
| } |
| |
| public void TestInputFilter() throws Exception |
| { |
| String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>"; |
| byte[] bytes = s.getBytes("ISO-8859-1"); |
| CharsetDetector det = new CharsetDetector(); |
| CharsetMatch m; |
| |
| det.enableInputFilter(true); |
| if (!det.inputFilterEnabled()){ |
| errln("input filter should be enabled"); |
| } |
| |
| det.setText(bytes); |
| m = det.detect(); |
| |
| if (! m.getLanguage().equals("fr")) { |
| errln("input filter did not strip markup!"); |
| } |
| |
| det.enableInputFilter(false); |
| det.setText(bytes); |
| m = det.detect(); |
| |
| if (! m.getLanguage().equals("en")) { |
| errln("unfiltered input did not detect as English!"); |
| } |
| } |
| |
| public void TestUTF8() throws Exception { |
| |
| String s = "This is a string with some non-ascii characters that will " + |
| "be converted to UTF-8, then shoved through the detection process. " + |
| "\u0391\u0392\u0393\u0394\u0395" + |
| "Sure would be nice if our source could contain Unicode directly!"; |
| byte [] bytes = s.getBytes("UTF-8"); |
| CharsetDetector det = new CharsetDetector(); |
| String retrievedS; |
| Reader reader; |
| |
| retrievedS = det.getString(bytes, "UTF-8"); |
| CheckAssert(s.equals(retrievedS)); |
| |
| reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8"); |
| CheckAssert(s.equals(stringFromReader(reader))); |
| det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage |
| } |
| |
| public void TestUTF16() throws Exception |
| { |
| String source = |
| "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " + |
| "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a"; |
| |
| byte[] beBytes = source.getBytes("UnicodeBig"); |
| byte[] leBytes = source.getBytes("UnicodeLittle"); |
| CharsetDetector det = new CharsetDetector(); |
| CharsetMatch m; |
| |
| det.setText(beBytes); |
| m = det.detect(); |
| |
| if (! m.getName().equals("UTF-16BE")) { |
| errln("Encoding detection failure: expected UTF-16BE, got " + m.getName()); |
| } |
| |
| det.setText(leBytes); |
| m = det.detect(); |
| |
| if (! m.getName().equals("UTF-16LE")) { |
| errln("Encoding detection failure: expected UTF-16LE, got " + m.getName()); |
| } |
| |
| // Jitterbug 4451, for coverage |
| int confidence = m.getConfidence(); |
| if(confidence != 100){ |
| errln("Did not get the expected confidence level " + confidence); |
| } |
| int matchType = m.getMatchType(); |
| if(matchType != 0){ |
| errln("Did not get the expected matchType level " + matchType); |
| } |
| } |
| |
| public void TestC1Bytes() throws Exception |
| { |
| String sISO = |
| "This is a small sample of some English text. Just enough to be sure that it detects correctly."; |
| |
| String sWindows = |
| "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes."; |
| |
| byte[] bISO = sISO.getBytes("ISO-8859-1"); |
| byte[] bWindows = sWindows.getBytes("windows-1252"); |
| |
| CharsetDetector det = new CharsetDetector(); |
| CharsetMatch m; |
| |
| det.setText(bWindows); |
| m = det.detect(); |
| |
| if (m.getName() != "windows-1252") { |
| errln("Text with C1 bytes not correctly detected as windows-1252."); |
| return; |
| } |
| |
| det.setText(bISO); |
| m = det.detect(); |
| |
| if (m.getName() != "ISO-8859-1") { |
| errln("Text without C1 bytes not correctly detected as ISO-8859-1."); |
| } |
| } |
| |
| public void TestDetection() |
| { |
| // |
| // Open and read the test data file. |
| // |
| //InputStreamReader isr = null; |
| |
| try { |
| InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml"); |
| if (is == null) { |
| errln("Could not open test data file CharsetDetectionTests.xml"); |
| return; |
| } |
| |
| //isr = new InputStreamReader(is, "UTF-8"); |
| |
| // Set up an xml parser. |
| DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
| |
| factory.setIgnoringComments(true); |
| |
| DocumentBuilder builder = factory.newDocumentBuilder(); |
| |
| // Parse the xml content from the test case file. |
| Document doc = builder.parse(is, null); |
| Element root = doc.getDocumentElement(); |
| |
| NodeList testCases = root.getElementsByTagName("test-case"); |
| |
| // Process each test case |
| for (int n = 0; n < testCases.getLength(); n += 1) { |
| Node testCase = testCases.item(n); |
| NamedNodeMap attrs = testCase.getAttributes(); |
| NodeList testData = testCase.getChildNodes(); |
| StringBuffer testText = new StringBuffer(); |
| String id = attrs.getNamedItem("id").getNodeValue(); |
| String encodings = attrs.getNamedItem("encodings").getNodeValue(); |
| |
| // Collect the test case text. |
| for (int t = 0; t < testData.getLength(); t += 1) { |
| Node textNode = testData.item(t); |
| |
| testText.append(textNode.getNodeValue()); |
| } |
| |
| // Process test text with each encoding / language pair. |
| String testString = testText.toString(); |
| //#ifndef FOUNDATION |
| String[] encodingList = encodings.split(" "); |
| //#else |
| //## String[] encodingList = Utility.split(encodings, ' '); |
| //#endif |
| |
| for (int e = 0; e < encodingList.length; e += 1) { |
| checkEncoding(testString, encodingList[e], id); |
| } |
| } |
| |
| } catch (Exception e) { |
| errln("exception while processing test cases: " + e.toString()); |
| } |
| } |
| } |