This commit was manufactured by cvs2svn to create tag
'release-3-4-1-d03'.
X-SVN-Rev: 18631
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..4d99a35
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,58 @@
+* text=auto !eol
+
+*.c text !eol
+*.cc text !eol
+*.classpath text !eol
+*.cpp text !eol
+*.css text !eol
+*.dsp text !eol
+*.dsw text !eol
+*.filters text !eol
+*.h text !eol
+*.htm text !eol
+*.html text !eol
+*.in text !eol
+*.java text !eol
+*.launch text !eol
+*.mak text !eol
+*.md text !eol
+*.MF text !eol
+*.mk text !eol
+*.pl text !eol
+*.pm text !eol
+*.project text !eol
+*.properties text !eol
+*.py text !eol
+*.rc text !eol
+*.sh text eol=lf
+*.sln text !eol
+*.stub text !eol
+*.txt text !eol
+*.ucm text !eol
+*.vcproj text !eol
+*.vcxproj text !eol
+*.xml text !eol
+*.xsl text !eol
+*.xslt text !eol
+Makefile text !eol
+configure text !eol
+LICENSE text !eol
+README text !eol
+
+*.bin -text
+*.brk -text
+*.cnv -text
+*.icu -text
+*.res -text
+*.nrm -text
+*.spp -text
+*.tri2 -text
+
+# The following file types are stored in Git-LFS.
+*.jar filter=lfs diff=lfs merge=lfs -text
+*.dat filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..229f478
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/.classpath
+/.externalToolBuilders
+/.project
+/classes
+/doc
diff --git a/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
new file mode 100644
index 0000000..20af4f3
--- /dev/null
+++ b/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -0,0 +1,320 @@
+/**
+ *******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.dev.test.charsetdet;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.impl.UTF32;
+import com.ibm.icu.text.*;
+import com.ibm.icu.util.VersionInfo;
+
+import javax.xml.parsers.*;
+import org.w3c.dom.*;
+
+
+/**
+ * @author andy
+ *
+ * Tests for CharsetDetector: construction, the markup input filter, UTF-8 and
+ * UTF-16 handling, and the data-driven cases in CharsetDetectionTests.xml.
+ */
+public class TestCharsetDetector extends TestFmwk
+{
+
+ /**
+ * Default constructor; it performs no initialization of its own.
+ */
+ public TestCharsetDetector()
+ {
+ }
+
+ public static void main(String[] args) {
+ try
+ {
+ TestCharsetDetector test = new TestCharsetDetector();
+ test.run(args);
+ }
+ catch (Exception e)
+ {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+  * Reports a test failure, naming the caller's source file and line,
+  * when the asserted condition does not hold.
+  *
+  * @param exp the condition that is expected to be true
+  */
+ private void CheckAssert(boolean exp) {
+     if (exp) {
+         return;
+     }
+     // Frame 0 is CheckAssert itself; frame 1 is the method that called it.
+     StackTraceElement caller = new Exception().getStackTrace()[1];
+     errln("Test failure in file " + caller.getFileName() +
+           " at line " + caller.getLineNumber());
+ }
+
+ /**
+  * Drains a Reader into a String; logs an error and returns null if reading fails.
+  */
+ private String stringFromReader(Reader reader)
+ {
+     StringBuffer text = new StringBuffer();
+     char[] chunk = new char[1024];
+     try {
+         for (int n = reader.read(chunk, 0, 1024); n >= 0; n = reader.read(chunk, 0, 1024)) {
+             text.append(chunk, 0, n);
+         }
+         return text.toString();
+     } catch (Exception e) {
+         errln("stringFromReader() failed: " + e.toString());
+         return null;
+     }
+ }
+
+ /**
+  * Detects on an already-loaded detector and checks the best match: charset name,
+  * optional language, and round-tripping through getString() and getReader().
+  */
+ private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
+ {
+     CharsetMatch best = det.detect();
+     // A wrong charset makes every later check meaningless, so stop here.
+     if (! best.getName().equals(encoding)) {
+         errln(id + ": encoding detection failure - expected " + encoding + ", got " + best.getName());
+         return;
+     }
+
+     if (language != null && ! best.getLanguage().equals(language)) {
+         errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + best.getLanguage());
+     }
+
+     // Decoding of UTF-32 matches is not verified here.
+     if (encoding.startsWith("UTF-32")) {
+         return;
+     }
+
+     String prefix = id + ", " + encoding;
+     if (! testString.equals(best.getString())) {
+         errln(prefix + ": getString() didn't return the original string!");
+     }
+     if (! testString.equals(stringFromReader(best.getReader()))) {
+         errln(prefix + ": getReader() didn't yield the original string!");
+     }
+ }
+
+ /**
+  * Encodes the test text in one charset and checks that the detector recognizes
+  * it, supplying the bytes first as an array and then as a stream.
+  * Problems are reported through errln() rather than thrown.
+  *
+  * @param testString the text to encode and detect
+  * @param encoding   a charset name, optionally followed by "/" and a language
+  * @param id         the test case id used to label failure messages
+  */
+ private void checkEncoding(String testString, String encoding, String id)
+ {
+     String[] parts = encoding.split("/");
+     String charset = parts[0];
+     String expectedLanguage = (parts.length > 1) ? parts[1] : null;
+     String encoderName = charset;
+
+     if (charset.equals("ISO-2022-CN")) {
+         // Don't test ISO-2022-CN on older runtimes.
+         if (! have_ISO_2022_CN) {
+             return;
+         }
+
+         // ISO-2022-CN only works for converting *to* Unicode,
+         // we need to use x-ISO-2022-CN-GB to convert *from* unicode...
+         encoderName = "x-ISO-2022-CN-GB";
+     }
+
+     try {
+         CharsetDetector det = new CharsetDetector();
+         byte[] encoded;
+         if (encoderName.startsWith("UTF-32")) {
+             encoded = UTF32.getInstance(encoderName).toBytes(testString);
+         } else {
+             encoded = testString.getBytes(encoderName);
+         }
+
+         // First pass: raw byte array input.
+         det.setText(encoded);
+         checkMatch(det, testString, charset, expectedLanguage, id);
+
+         // Second pass: the same bytes supplied through an InputStream.
+         det.setText(new ByteArrayInputStream(encoded));
+         checkMatch(det, testString, charset, expectedLanguage, id);
+     } catch (Exception e) {
+         errln(id + ": " + e.toString());
+     }
+ }
+
+ /** Checks that a detector can be created and that every advertised charset name is non-empty. */
+ public void TestConstruction() {
+     // Constructed only to exercise the constructor; the instance is not used further.
+     CharsetDetector det = new CharsetDetector();
+     String[] names = CharsetDetector.getAllDetectableCharsets();
+
+     CheckAssert(names.length != 0);
+     for (int idx = 0; idx < names.length; idx++) {
+         CheckAssert(! names[idx].equals(""));
+     }
+ }
+
+ /**
+  * Verifies that tag stripping changes the outcome: French text wrapped in English-looking
+  * markup should detect as "fr" with the input filter on and as "en" with it off.
+  */
+ public void TestInputFilter() throws Exception
+ {
+     String markedUp = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
+     byte[] latin1 = markedUp.getBytes("ISO-8859-1");
+     CharsetDetector det = new CharsetDetector();
+
+     det.enableInputFilter(true);
+     if (! det.inputFilterEnabled()) {
+         errln("input filter should be enabled");
+     }
+     det.setText(latin1);
+     String filteredLanguage = det.detect().getLanguage();
+     if (! filteredLanguage.equals("fr")) {
+         errln("input filter did not strip markup!");
+     }
+
+     det.enableInputFilter(false);
+     det.setText(latin1);
+     String rawLanguage = det.detect().getLanguage();
+     if (! rawLanguage.equals("en")) {
+         errln("unfiltered input did not detect as English!");
+     }
+ }
+
+ /** Round-trips mixed ASCII/Greek text through UTF-8 via the getString() and getReader() shortcuts. */
+ public void TestUTF8() throws Exception {
+     String original = "This is a string with some non-ascii characters that will " +
+         "be converted to UTF-8, then shoved through the detection process. " +
+         "\u0391\u0392\u0393\u0394\u0395" +
+         "Sure would be nice if our source could contain Unicode directly!";
+     byte[] utf8 = original.getBytes("UTF-8");
+     CharsetDetector det = new CharsetDetector();
+
+     // Byte-array convenience path.
+     String viaString = det.getString(utf8, "UTF-8");
+     CheckAssert(original.equals(viaString));
+
+     // Stream convenience path.
+     Reader viaReader = det.getReader(new ByteArrayInputStream(utf8), "UTF-8");
+     CheckAssert(original.equals(stringFromReader(viaReader)));
+     det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
+ }
+
+ /**
+  * Encodes Arabic text as big- and little-endian UTF-16 and checks that each
+  * byte order is detected; also reads confidence and match type for coverage.
+  */
+ public void TestUTF16() throws Exception
+ {
+     String source =
+         "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
+         "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
+     byte[] beBytes = source.getBytes("UnicodeBig");
+     byte[] leBytes = source.getBytes("UnicodeLittle");
+     CharsetDetector det = new CharsetDetector();
+     CharsetMatch m;
+
+     det.setText(beBytes);
+     m = det.detect();
+     if (! m.getName().equals("UTF-16BE")) {
+         errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
+     }
+
+     det.setText(leBytes);
+     m = det.detect();
+     if (! m.getName().equals("UTF-16LE")) {
+         errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
+     }
+
+     // Jitterbug 4451, for coverage (checked against the little-endian match).
+     int confidence = m.getConfidence();
+     if(confidence != 100){
+         errln("Did not get the expected confidence level " + confidence);
+     }
+     int matchType = m.getMatchType();
+     if(matchType != 0){
+         errln("Did not get the expected matchType level " + matchType);
+     }
+ }
+
+ /**
+  * Data-driven detection test. Each test-case element in CharsetDetectionTests.xml
+  * supplies an id, a space-separated list of encodings and the sample text;
+  * the text is passed to checkEncoding() once per listed encoding.
+  */
+ public void TestDetection()
+ {
+     InputStream is = null;
+     try {
+         // Open the test data file.
+         is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
+         if (is == null) {
+             errln("Could not open test data file CharsetDetectionTests.xml");
+             return;
+         }
+
+         // Set up an xml parser.
+         DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+         factory.setIgnoringComments(true);
+         DocumentBuilder builder = factory.newDocumentBuilder();
+
+         // Parse the xml content from the test case file.
+         Document doc = builder.parse(is, null);
+         Element root = doc.getDocumentElement();
+         NodeList testCases = root.getElementsByTagName("test-case");
+
+         // Process each test case
+         for (int n = 0; n < testCases.getLength(); n += 1) {
+             Node testCase = testCases.item(n);
+             NamedNodeMap attrs = testCase.getAttributes();
+             NodeList testData = testCase.getChildNodes();
+             StringBuffer testText = new StringBuffer();
+             String id = attrs.getNamedItem("id").getNodeValue();
+             String encodings = attrs.getNamedItem("encodings").getNodeValue();
+
+             // Collect the test case text.
+             for (int t = 0; t < testData.getLength(); t += 1) {
+                 Node textNode = testData.item(t);
+                 testText.append(textNode.getNodeValue());
+             }
+
+             // Process test text with each encoding / language pair.
+             String testString = testText.toString();
+             String[] encodingList = encodings.split(" ");
+
+             for (int e = 0; e < encodingList.length; e += 1) {
+                 checkEncoding(testString, encodingList[e], id);
+             }
+         }
+     } catch (Exception e) {
+         errln("exception while processing test cases: " + e.toString());
+     } finally {
+         // Close the resource stream whether parsing succeeded or failed.
+         if (is != null) {
+             try { is.close(); } catch (Exception ignored) { /* best-effort close in a test */ }
+         }
+     }
+ }
+
+ // True on Java 1.5 or later. Before Java 1.5, we cannot convert from Unicode to ISO-2022-CN, so checkEncoding() can't test it...
+ private boolean have_ISO_2022_CN = VersionInfo.javaVersion().compareTo(VersionInfo.getInstance(1, 5)) >= 0;
+}
diff --git a/src/com/ibm/icu/text/CharsetDetector.java b/src/com/ibm/icu/text/CharsetDetector.java
new file mode 100644
index 0000000..3f62833
--- /dev/null
+++ b/src/com/ibm/icu/text/CharsetDetector.java
@@ -0,0 +1,520 @@
+/**
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collections;
+import java.util.Arrays;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation. The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed. The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+public class CharsetDetector {
+
+// Question: Should we have getters corresponding to the setters for input text
+// and declared encoding?
+
+// A thought: If we were to create our own type of Java Reader, we could defer
+// figuring out an actual charset for data that starts out with too much English
+// only ASCII until the user actually read through to something that didn't look
+// like 7 bit English. If nothing else ever appeared, we would never need to
+// actually choose the "real" charset. All assuming that the application just
+// wants the data, and doesn't care about a char set name.
+
+ /**
+ * Creates a detector that has no input text yet and has input filtering disabled.
+ *
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public CharsetDetector() {
+ }
+
+ /**
+ * Set the declared encoding for charset detection.
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p/>
+ * A declared encoding that is incompatible with the input data being
+ * analyzed will not be added to the list of possible encodings.
+ *
+ * @param encoding The declared encoding
+ * @return This CharsetDetector
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public CharsetDetector setDeclaredEncoding(String encoding) {
+ fDeclaredEncoding = encoding;
+ return this;
+ }
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ *
+ * @param in the input text of unknown encoding; the array is kept by reference, not copied
+ *
+ * @return This CharsetDetector
+ *
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public CharsetDetector setText(byte [] in) {
+ fRawInput = in;
+ fRawLength = in.length;
+
+ MungeInput();
+
+ return this;
+ }
+
+ private static final int kBufSize = 8000; // Size of the analysis buffer and the most bytes read from a stream.
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ * <p/>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. Reading stops at end of stream or
+ * after kBufSize (8000) bytes, whichever comes first.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ * @throws IOException if reading or resetting the stream fails
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+
+ public CharsetDetector setText(InputStream in) throws IOException {
+ fInputStream = in;
+ fInputStream.mark(kBufSize);
+ fRawInput = new byte[kBufSize]; // Always make a new buffer because the
+ // previous one may have come from the caller,
+ // in which case we can't touch it.
+ fRawLength = 0;
+ int remainingLength = kBufSize;
+ while (remainingLength > 0 ) {
+ // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
+ int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
+ if (bytesRead <= 0) {
+ break;
+ }
+ fRawLength += bytesRead;
+ remainingLength -= bytesRead;
+ }
+ fInputStream.reset();
+
+ MungeInput(); // Strip html markup, collect byte stats.
+ return this;
+ }
+
+
+ /**
+ * Return the charset that best matches the supplied input data.
+ *
+ * Note though, that because the detection
+ * only looks at the start of the input data,
+ * there is a possibility that the returned charset will fail to handle
+ * the full set of input data.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return a CharsetMatch object representing the best matching charset.
+ * @throws ArrayIndexOutOfBoundsException if detectAll() returns no matches
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public CharsetMatch detect() {
+// TODO: A better implementation would be to copy the detect loop from
+// detectAll(), and cut it short as soon as a match with a high confidence
+// is found. This is something to be done later, after things are otherwise
+// working.
+ // detectAll() orders its result best-first, so element 0 is the winner.
+ return detectAll()[0];
+ }
+
+ /**
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return An array of CharsetMatch objects representing possibly matching charsets.
+ *
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public CharsetMatch[] detectAll() {
+     ArrayList found = new ArrayList();
+
+     // Ask every registered recognizer for its opinion and keep each one
+     // that reports a non-zero confidence.
+     for (int r = 0; r < fCSRecognizers.size(); r++) {
+         CharsetRecognizer recognizer = (CharsetRecognizer) fCSRecognizers.get(r);
+         // Only the low byte of the match result is used as the confidence.
+         int confidence = recognizer.match(this) & 0x000000ff;
+
+         if (confidence > 0) {
+             found.add(new CharsetMatch(this, recognizer, confidence));
+         }
+     }
+
+     // CharsetMatch compares on confidence: sort ascending, then flip so the
+     // best match comes first.
+     Collections.sort(found);
+     Collections.reverse(found);
+
+     // Hand the ranking back as a typed array.
+     CharsetMatch[] ranked = new CharsetMatch[found.size()];
+     return (CharsetMatch[]) found.toArray(ranked);
+ }
+
+
+ /**
+ * Autodetect the charset of an inputStream, and return a Java Reader
+ * to access the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p/>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ * @return a Reader over the converted data, or null if an IOException occurs while the input is read
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public Reader getReader(InputStream in, String declaredEncoding) {
+ fDeclaredEncoding = declaredEncoding;
+
+ try {
+ setText(in);
+
+ return detect().getReader();
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Autodetect the charset of a byte array, and return a String
+ * containing the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ * @return the converted text, or null if an IOException occurs during conversion
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public String getString(byte[] in, String declaredEncoding)
+ {
+ fDeclaredEncoding = declaredEncoding;
+
+ try {
+ setText(in);
+ return detect().getString(-1);
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+
+ /**
+  * Get the names of all char sets that can be recognized by the char set detector.
+  *
+  * @return a newly allocated array of the names of all charsets that can be
+  *         recognized by the charset detector; callers may modify it freely.
+  *
+  * @draft ICU 3.4
+  * @deprecated This is a draft API and might change in a future release of ICU.
+  */
+ public static String[] getAllDetectableCharsets() {
+     return (String[]) fCharsetNames.clone();
+ }
+
+ /**
+ * Test whether or not input filtering is enabled.
+ *
+ * @return <code>true</code> if markup will be filtered from text given to later setText() calls.
+ *
+ * @see #enableInputFilter
+ *
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public boolean inputFilterEnabled()
+ {
+ return fStripTags;
+ }
+
+ /**
+ * Enable filtering of input text. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection. The setting is applied the next time setText() is called.
+ *
+ * @param filter <code>true</code> to enable input text filtering.
+ *
+ * @return The previous setting.
+ *
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+ public boolean enableInputFilter(boolean filter)
+ {
+ boolean previous = fStripTags;
+
+ fStripTags = filter;
+
+ return previous;
+ }
+
+ /**
+  * MungeInput - after getting a set of raw input data to be analyzed, preprocess
+  * it by removing what appears to be html markup, then tally byte statistics.
+  * At most fInputBytes.length bytes are kept, however long the raw input is.
+  * @internal
+  */
+ private void MungeInput() {
+     int srci = 0;
+     int dsti = 0;
+     byte b;
+     boolean inMarkup = false;
+     int openTags = 0;
+     int badTags = 0;
+
+     //
+     // html / xml markup stripping.
+     // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+     // discard everything within < brackets >
+     // Count how many total '<' and illegal (nested) '<' occur, so we can make some
+     // guess as to whether the input was actually marked up at all.
+     if (fStripTags) {
+         // A caller-supplied array may be longer than fInputBytes, so also stop
+         // as soon as the output buffer is full instead of writing past its end.
+         for (srci=0; srci<fRawLength && dsti<fInputBytes.length; srci++) {
+             b = fRawInput[srci];
+             if (b == (byte)'<') {
+                 if (inMarkup) {
+                     badTags++;
+                 }
+                 inMarkup = true;
+                 openTags++;
+             }
+             if (! inMarkup) {
+                 fInputBytes[dsti++] = b;
+             }
+             if (b == (byte)'>') {
+                 inMarkup = false;
+             }
+         }
+
+         fInputLen = dsti;
+     }
+
+     //
+     // If it looks like this input wasn't marked up, or if it looks like it's
+     // essentially nothing but markup abandon the markup stripping.
+     // Detection will have to work on the unstripped input.
+     //
+     if (openTags<5 || openTags/5 < badTags ||
+             (fInputLen < 100 && fRawLength>600)) {
+         int limit = fRawLength;
+
+         if (limit > kBufSize) {
+             limit = kBufSize;
+         }
+
+         for (srci=0; srci<limit; srci++) {
+             fInputBytes[srci] = fRawInput[srci];
+         }
+         fInputLen = srci;
+     }
+
+     //
+     // Tally up the byte occurrence statistics.
+     // These are available for use by the various detectors.
+     //
+     Arrays.fill(fByteStats, (short)0);
+     for (srci=0; srci<fInputLen; srci++) {
+         int val = fInputBytes[srci] & 0x00ff;
+         fByteStats[val]++;
+     }
+
+     fC1Bytes = false;
+     for (int i = 0x80; i <= 0x9F; i += 1) {
+         if (fByteStats[i] != 0) {
+             fC1Bytes = true;
+             break;
+         }
+     }
+ }
+
+ /**
+ * The following items are accessed by individual CharsetRecognizers during
+ * the recognition process
+ *
+ * @internal
+ */
+ byte[] fInputBytes = // The text to be checked. Markup will have been
+ new byte[kBufSize]; // removed if appropriate.
+
+ int fInputLen; // Length of the byte data in fInputBytes.
+
+ short fByteStats[] = // byte frequency statistics for the input text.
+ new short[256]; // Value is the absolute number of times each byte value
+ // occurs in fInputBytes, so zero really means zero occurrences.
+
+ boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
+ false;
+
+ String fDeclaredEncoding; // Encoding hint supplied by the caller; may be null.
+
+
+
+ //
+ // Stuff private to CharsetDetector
+ //
+ byte[] fRawInput; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ // If user gave us a stream, it's read to a
+ // buffer here.
+ int fRawLength; // Length of data in fRawInput array.
+
+ InputStream fInputStream; // The stream most recently passed to setText(InputStream),
+ // or null if no stream has been supplied.
+
+ boolean fStripTags = // If true, setText() will strip tags from input text.
+ false;
+
+
+ /**
+ * List of recognizers for all charsets known to the implementation.
+ * createRecognizers() also fills in fCharsetNames while building this list.
+ * @internal
+ */
+ private static ArrayList fCSRecognizers = createRecognizers();
+ private static String [] fCharsetNames; // No initializer on purpose: it would run after createRecognizers() and overwrite the names.
+
+ /**
+ * Create the singleton instances of the CharsetRecognizer classes and, as a
+ * side effect, fill in fCharsetNames with their names (adjacent duplicates collapsed).
+ * @internal
+ */
+ private static ArrayList createRecognizers() {
+ ArrayList recognizers = new ArrayList();
+
+ recognizers.add(new CharsetRecog_UTF8());
+
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());;
+
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+ // Create an array of all charset names, as a side effect.
+ // Needed for the getAllDetectableCharsets() API.
+ String[] charsetNames = new String [recognizers.size()];
+ int out = 0;
+ // Only a name equal to the one just stored is skipped, so recognizers that share a name must be registered consecutively.
+ for (int i = 0; i < recognizers.size(); i++) {
+ String name = ((CharsetRecognizer)recognizers.get(i)).getName();
+
+ if (out == 0 || ! name.equals(charsetNames[out - 1])) {
+ charsetNames[out++] = name;
+ }
+ }
+
+ fCharsetNames = new String[out];
+ System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+ return recognizers;
+ }
+}