This commit was manufactured by cvs2svn to create tag 'release-3-4-1-d03'. X-SVN-Rev: 18631

commit: 2eacb91581c46722e4e851c1b2869fde46b3cba5 [log] [tgz]
author: (no author) <anonymous@svn.icu-project.org> Wed Oct 05 21:49:52 2005 +0000
committer: (no author) <anonymous@svn.icu-project.org> Wed Oct 05 21:49:52 2005 +0000
tree: 5dbf7db6a8f4b2e0621d87a3d613855a75e9ef3a
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..4d99a35
--- /dev/null
+++ b/.gitattributes

@@ -0,0 +1,58 @@
+* text=auto !eol
+
+*.c text !eol
+*.cc text !eol
+*.classpath text !eol
+*.cpp text !eol
+*.css text !eol
+*.dsp text !eol
+*.dsw text !eol
+*.filters text !eol
+*.h text !eol
+*.htm text !eol
+*.html text !eol
+*.in text !eol
+*.java text !eol
+*.launch text !eol
+*.mak text !eol
+*.md text !eol
+*.MF text !eol
+*.mk text !eol
+*.pl text !eol
+*.pm text !eol
+*.project text !eol
+*.properties text !eol
+*.py text !eol
+*.rc text !eol
+*.sh text eol=lf
+*.sln text !eol
+*.stub text !eol
+*.txt text !eol
+*.ucm text !eol
+*.vcproj text !eol
+*.vcxproj text !eol
+*.xml text !eol
+*.xsl text !eol
+*.xslt text !eol
+Makefile text !eol
+configure text !eol
+LICENSE text !eol
+README text !eol
+
+*.bin -text
+*.brk -text
+*.cnv -text
+*.icu -text
+*.res -text
+*.nrm -text
+*.spp -text
+*.tri2 -text
+
+# The following file types are stored in Git-LFS.
+*.jar filter=lfs diff=lfs merge=lfs -text
+*.dat filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..229f478
--- /dev/null
+++ b/.gitignore

@@ -0,0 +1,5 @@
+/.classpath
+/.externalToolBuilders
+/.project
+/classes
+/doc

diff --git a/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
new file mode 100644
index 0000000..20af4f3
--- /dev/null
+++ b/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java

@@ -0,0 +1,320 @@
+/**
+ *******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.dev.test.charsetdet;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.impl.UTF32;
+import com.ibm.icu.text.*;
+import com.ibm.icu.util.VersionInfo;
+
+import javax.xml.parsers.*;
+import org.w3c.dom.*;
+
+
+/**
+ * @author andy
+ *
+ * TODO To change the template for this generated type comment go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+public class TestCharsetDetector extends TestFmwk
+{
+    
+    /**
+     * Default constructor. The body is intentionally empty.
+     */
+    public TestCharsetDetector()
+    {
+    }
+
+    public static void main(String[] args) {
+        try
+        {
+            TestCharsetDetector test = new TestCharsetDetector();
+            test.run(args);
+        }
+        catch (Exception e)
+        {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Reports a test failure, naming the calling source file and line,
+     * when the supplied condition is false.
+     *
+     * @param exp the condition that is expected to hold
+     */
+    private void CheckAssert(boolean exp) {
+        if (exp) {
+            return;
+        }
+
+        // Frame 0 of the trace is this method; frame 1 is the call site being checked.
+        StackTraceElement caller = new Exception().getStackTrace()[1];
+
+        errln("Test failure in file " + caller.getFileName() +
+              " at line " + caller.getLineNumber());
+    }
+    
+    private String stringFromReader(Reader reader)
+    {
+        StringBuffer sb = new StringBuffer();
+        char[] buffer   = new char[1024];
+        int bytesRead   = 0;
+        
+        try {
+            while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
+                sb.append(buffer, 0, bytesRead);
+            }
+            
+            return sb.toString();
+        } catch (Exception e) {
+            errln("stringFromReader() failed: " + e.toString());
+            return null;
+        }
+    }
+    
+    /**
+     * Runs detection on an already primed detector and verifies the best match.
+     * The detected name must equal <code>encoding</code>; if it does not, an error
+     * is logged and the remaining checks are skipped. When <code>language</code> is
+     * non-null the detected language must equal it. For encodings other than
+     * UTF-32 the text obtained through <code>getString()</code> and through
+     * <code>getReader()</code> must both equal <code>testString</code>.
+     *
+     * @param det        detector whose input text has already been set
+     * @param testString the original, un-encoded text
+     * @param encoding   expected charset name
+     * @param language   expected language, or <code>null</code> to skip the language check
+     * @param id         test case id, used as a prefix in failure messages
+     */
+    private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
+    {
+        CharsetMatch match = det.detect();
+
+        if (! match.getName().equals(encoding)) {
+            errln(id + ": encoding detection failure - expected " + encoding + ", got " + match.getName());
+            return;
+        }
+
+        String label = id + ", " + encoding;
+
+        if (language != null && ! match.getLanguage().equals(language)) {
+            errln(label + ": language detection failure - expected " + language + ", got " + match.getLanguage());
+        }
+
+        // Round-trip checks are skipped for UTF-32.
+        // NOTE(review): presumably because the runtime lacks a UTF-32 converter
+        // (checkEncoding() encodes UTF-32 with ICU's UTF32 helper) - confirm.
+        if (encoding.startsWith("UTF-32")) {
+            return;
+        }
+
+        String fromString = match.getString();
+        if (! testString.equals(fromString)) {
+            errln(label + ": getString() didn't return the original string!");
+        }
+
+        String fromReader = stringFromReader(match.getReader());
+        if (! testString.equals(fromReader)) {
+            errln(label + ": getReader() didn't yield the original string!");
+        }
+    }
+    
+    /**
+     * Encodes the test text in one charset and checks that the detector
+     * recognizes it, once from a byte array and once from an input stream.
+     *
+     * @param testString the text to encode
+     * @param encoding   either "charset" or "charset/language"
+     * @param id         test case id, used as a prefix in failure messages
+     */
+    private void checkEncoding(String testString, String encoding, String id)
+    {
+        String[] parts = encoding.split("/");
+        String enc  = parts[0];
+        String lang = (parts.length > 1) ? parts[1] : null;
+        String from = enc;
+
+        if (enc.equals("ISO-2022-CN")) {
+            // Don't test ISO-2022-CN on older runtimes.
+            if (! have_ISO_2022_CN) {
+                return;
+            }
+
+            // ISO-2022-CN only works for converting *to* Unicode,
+            // we need to use x-ISO-2022-CN-GB to convert *from* unicode...
+            from = "x-ISO-2022-CN-GB";
+        }
+
+        try {
+            CharsetDetector det = new CharsetDetector();
+
+            // UTF-32 is encoded with ICU's own helper; everything else with String.getBytes().
+            byte[] bytes = from.startsWith("UTF-32")
+                         ? UTF32.getInstance(from).toBytes(testString)
+                         : testString.getBytes(from);
+
+            det.setText(bytes);
+            checkMatch(det, testString, enc, lang, id);
+
+            det.setText(new ByteArrayInputStream(bytes));
+            checkMatch(det, testString, enc, lang, id);
+        } catch (Exception e) {
+            errln(id + ": " + e.toString());
+        }
+    }
+    
+    /**
+     * Checks that a detector can be constructed and that the list of
+     * detectable charsets is non-empty and contains no empty names.
+     */
+    public void TestConstruction() {
+        // Constructed only to exercise the constructor; the instance is not used further.
+        CharsetDetector det = new CharsetDetector();
+        String[] names = CharsetDetector.getAllDetectableCharsets();
+
+        CheckAssert(names.length != 0);
+
+        for (int idx = 0; idx < names.length; idx++) {
+            CheckAssert(! names[idx].equals(""));
+        }
+    }
+
+    /**
+     * Checks the markup filter: the same ISO-8859-1 bytes are expected to be
+     * detected as French with the filter on and as English with it off.
+     */
+    public void TestInputFilter() throws Exception
+    {
+        String markedUp = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
+        byte[] latin1 = markedUp.getBytes("ISO-8859-1");
+        CharsetDetector det = new CharsetDetector();
+
+        // Pass 1: filter on.
+        det.enableInputFilter(true);
+        if (! det.inputFilterEnabled()) {
+            errln("input filter should be enabled");
+        }
+
+        det.setText(latin1);
+        CharsetMatch filtered = det.detect();
+        if (! filtered.getLanguage().equals("fr")) {
+            errln("input filter did not strip markup!");
+        }
+
+        // Pass 2: filter off, same bytes.
+        det.enableInputFilter(false);
+        det.setText(latin1);
+        CharsetMatch unfiltered = det.detect();
+        if (! unfiltered.getLanguage().equals("en")) {
+            errln("unfiltered input did not detect as English!");
+        }
+    }
+    
+    /**
+     * Round-trips a string containing non-ASCII characters through UTF-8 and
+     * the detector's getString() and getReader() convenience methods.
+     */
+    public void TestUTF8() throws Exception {
+        String  s = "This is a string with some non-ascii characters that will " +
+                    "be converted to UTF-8, then shoved through the detection process.  " +
+                    "\u0391\u0392\u0393\u0394\u0395" +
+                    "Sure would be nice if our source could contain Unicode directly!";
+        byte[] utf8 = s.getBytes("UTF-8");
+        CharsetDetector det = new CharsetDetector();
+
+        String viaString = det.getString(utf8, "UTF-8");
+        CheckAssert(s.equals(viaString));
+
+        Reader viaReader = det.getReader(new ByteArrayInputStream(utf8), "UTF-8");
+        CheckAssert(s.equals(stringFromReader(viaReader)));
+
+        // Jitterbug 4451, for coverage
+        det.setDeclaredEncoding("UTF-8");
+    }
+    
+    /**
+     * Encodes an Arabic sample as big- and little-endian UTF-16 and checks that
+     * each byte sequence is detected as UTF-16BE and UTF-16LE respectively.
+     * The confidence and match type of the little-endian match are also checked.
+     */
+    public void TestUTF16() throws Exception
+    {
+        // The first escape previously lacked its backslash, so the text began with
+        // the five ASCII characters "u0623" instead of U+0623.
+        String source = 
+                "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
+                "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
+        
+        byte[] beBytes = source.getBytes("UnicodeBig");
+        byte[] leBytes = source.getBytes("UnicodeLittle");
+        CharsetDetector det = new CharsetDetector();
+        CharsetMatch m;
+        
+        det.setText(beBytes);
+        m = det.detect();
+        
+        if (! m.getName().equals("UTF-16BE")) {
+            errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
+        }
+        
+        det.setText(leBytes);
+        m = det.detect();
+        
+        if (! m.getName().equals("UTF-16LE")) {
+            errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
+        }
+
+        // Jitterbug 4451, for coverage
+        int confidence = m.getConfidence(); 
+        if(confidence != 100){
+            errln("Did not get the expected confidence level " + confidence);
+        }
+        int matchType = m.getMatchType();
+        if(matchType != 0){
+            errln("Did not get the expected matchType level " + matchType);
+        }
+    }
+    
+    /**
+     * Data driven detection test. Reads CharsetDetectionTests.xml from the class
+     * path, and for every <code>test-case</code> element encodes its text in each
+     * charset listed in the <code>encodings</code> attribute (space separated
+     * "charset" or "charset/language" entries) and verifies the detection result
+     * through checkEncoding().
+     */
+    public void TestDetection()
+    {
+        InputStream is = null;
+        
+        try {
+            //
+            //  Open the test data file.
+            //
+            is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
+            if (is == null) {
+                errln("Could not open test data file CharsetDetectionTests.xml");
+                return;
+            }
+            
+            // Set up an xml parser.
+            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+            
+            factory.setIgnoringComments(true);
+            
+            DocumentBuilder builder = factory.newDocumentBuilder();
+            
+            // Parse the xml content from the test case file. The parser reads the
+            // raw byte stream itself, so no Reader is wrapped around it.
+            Document doc = builder.parse(is, null);
+            Element root = doc.getDocumentElement();
+            
+            NodeList testCases = root.getElementsByTagName("test-case");
+            
+            // Process each test case
+            for (int n = 0; n < testCases.getLength(); n += 1) {
+                Node testCase = testCases.item(n);
+                NamedNodeMap attrs = testCase.getAttributes();
+                NodeList testData  = testCase.getChildNodes();
+                StringBuffer testText = new StringBuffer();
+                String id = attrs.getNamedItem("id").getNodeValue();
+                String encodings = attrs.getNamedItem("encodings").getNodeValue();
+                
+                // Collect the test case text.
+                for (int t = 0; t < testData.getLength(); t += 1) {
+                    Node textNode = testData.item(t);
+                    
+                    testText.append(textNode.getNodeValue());                    
+                }
+                
+                // Process test text with each encoding / language pair.
+                String testString = testText.toString();
+                String[] encodingList = encodings.split(" ");
+                
+                for (int e = 0; e < encodingList.length; e += 1) {
+                    checkEncoding(testString, encodingList[e], id);
+                }
+            }
+            
+        } catch (Exception e) {
+            errln("exception while processing test cases: " + e.toString());
+        } finally {
+            // Release the resource stream on every exit path.
+            if (is != null) {
+                try {
+                    is.close();
+                } catch (java.io.IOException ignored) {
+                    // Nothing useful can be done if closing a test resource fails.
+                }
+            }
+        }
+    }
+    
+    // Before Java 1.5, we cannot convert from Unicode to ISO-2022-CN, so checkEncoding() can't test it...
+    private boolean have_ISO_2022_CN = VersionInfo.javaVersion().compareTo(VersionInfo.getInstance(1, 5)) >= 0;
+}

diff --git a/src/com/ibm/icu/text/CharsetDetector.java b/src/com/ibm/icu/text/CharsetDetector.java
new file mode 100644
index 0000000..3f62833
--- /dev/null
+++ b/src/com/ibm/icu/text/CharsetDetector.java

@@ -0,0 +1,520 @@
+/**
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collections;
+import java.util.Arrays;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation.  The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed.  The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @draft ICU 3.4
+ * @deprecated This is a draft API and might change in a future release of ICU.
+ */
+public class CharsetDetector {
+
+//   Question: Should we have getters corresponding to the setters for input text
+//   and declared encoding?
+
+//   A thought: If we were to create our own type of Java Reader, we could defer
+//   figuring out an actual charset for data that starts out with too much English
+//   only ASCII until the user actually read through to something that didn't look
+//   like 7 bit English.  If  nothing else ever appeared, we would never need to
+//   actually choose the "real" charset.  All assuming that the application just
+//   wants the data, and doesn't care about a char set name.
+
+    /**
+     *   Constructor.  The body is empty: a new detector has no input text and
+     *   no declared encoding, and input filtering is off until
+     *   enableInputFilter(true) is called.
+     * 
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public CharsetDetector() {
+    }
+
+    /**
+     * Set the declared encoding for charset detection.
+     *  The declared encoding of an input text is an encoding obtained
+     *  from an http header or xml declaration or similar source that
+     *  can be provided as additional information to the charset detector.  
+     *  A match between a declared encoding and a possible detected encoding
+     *  will raise the quality of that detected encoding by a small delta,
+     *  and will also appear as a "reason" for the match.
+     * <p/>
+     * A declared encoding that is incompatible with the input data being
+     * analyzed will not be added to the list of possible encodings.
+     * <p/>
+     * This method only stores the value; it does not re-examine any input
+     * text that has already been set.
+     * 
+     *  @param encoding The declared encoding 
+     *
+     * @return This CharsetDetector
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public CharsetDetector setDeclaredEncoding(String encoding) {
+        fDeclaredEncoding = encoding;
+        return this;
+    }
+    
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     * <p/>
+     * The array is not copied; the detector keeps a reference to it, so it
+     * should not be modified while the detector is in use.  The data is
+     * preprocessed immediately (see MungeInput()).
+     * 
+     * @param in the input text of unknown encoding; must not be null
+     * 
+     * @return This CharsetDetector
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public CharsetDetector setText(byte [] in) {
+        fRawInput  = in;
+        fRawLength = in.length;
+        
+        // Strip markup (if enabled) and collect byte statistics.
+        MungeInput();
+        
+        return this;
+    }
+    
+    private static final int kBufSize = 8000;
+
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     *  <p/>
+     *   The input stream that supplies the character data must have markSupported()
+     *   == true; this method marks the stream, reads up to kBufSize bytes from
+     *   it, and then returns the stream to its original position via
+     *   the InputStream.reset() operation.
+     *
+     * @param in the input text of unknown encoding
+     * 
+     * @return This CharsetDetector
+     *
+     * @throws IOException if reading from or resetting the stream fails
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public CharsetDetector setText(InputStream in) throws IOException {
+        fInputStream = in;
+        fInputStream.mark(kBufSize);
+
+        // A fresh buffer every time: the previous fRawInput may be an array that
+        // belongs to the caller, which must not be overwritten.
+        fRawInput  = new byte[kBufSize];
+        fRawLength = 0;
+
+        // read() may hand back data in smallish chunks, esp. for remote sources,
+        // so keep reading until the buffer is full or the stream gives nothing more.
+        while (fRawLength < kBufSize) {
+            int count = fInputStream.read(fRawInput, fRawLength, kBufSize - fRawLength);
+            if (count <= 0) {
+                break;
+            }
+            fRawLength += count;
+        }
+        fInputStream.reset();
+
+        // Strip html markup, collect byte stats.
+        MungeInput();
+        return this;
+    }
+
+  
+    /**
+     * Return the charset that best matches the supplied input data.
+     * 
+     * Note though, that because the detection 
+     * only looks at the start of the input data,
+     * there is a possibility that the returned charset will fail to handle
+     * the full set of input data.
+     * <p/>
+     * Raise an exception if 
+     *  <ul>
+     *    <li>no charset appears to match the data.</li>
+     *    <li>no input text has been provided</li>
+     *  </ul>
+     *
+     * @return a CharsetMatch object representing the best matching charset.
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public CharsetMatch detect() {
+//   TODO:  A better implementation would be to copy the detect loop from
+//          detectAll(), and cut it short as soon as a match with a high confidence
+//          is found.  This is something to be done later, after things are otherwise
+//          working.
+
+        return detectAll()[0];
+     }
+    
+    /**
+     *  Return an array of all charsets that appear to be plausible
+     *  matches with the input data.  The array is ordered with the
+     *  best quality match first.
+     * <p/>
+     *  Every known recognizer is run against the current input; those that
+     *  report a confidence greater than zero are included in the result.
+     *  If none does, the returned array is empty.
+     * 
+     * @return An array of CharsetMatch objects representing possibly matching charsets.
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public CharsetMatch[] detectAll() {
+        ArrayList candidates = new ArrayList();
+
+        for (int idx = 0; idx < fCSRecognizers.size(); idx++) {
+            CharsetRecognizer recognizer = (CharsetRecognizer) fCSRecognizers.get(idx);
+
+            // Only the low byte of the recognizer's result is the confidence.
+            int confidence = recognizer.match(this) & 0x000000ff;
+            if (confidence <= 0) {
+                continue;
+            }
+
+            candidates.add(new CharsetMatch(this, recognizer, confidence));
+        }
+
+        // Sort ascending using CharsetMatch's own ordering (which compares on
+        // confidence), then flip so that the best match comes first.
+        Collections.sort(candidates);
+        Collections.reverse(candidates);
+
+        return (CharsetMatch[]) candidates.toArray(new CharsetMatch[candidates.size()]);
+    }
+
+    
+    /**
+     * Autodetect the charset of an inputStream, and return a Java Reader
+     * to access the converted input data.
+     * <p/>
+     * This is a convenience method that is equivalent to
+     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+     * <p/>
+     *   For the input stream that supplies the character data, markSupported()
+     *   must be true; the  charset detection will read a small amount of data,
+     *   then return the stream to its original position via
+     *   the InputStream.reset() operation.  The exact amount that will
+     *    be read depends on the characteristics of the data itself.
+     *<p/>
+     * Raise an exception if no charsets appear to match the input data.
+     * 
+     * @param in The source of the byte data in the unknown charset.
+     *
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public Reader getReader(InputStream in, String declaredEncoding) {
+        fDeclaredEncoding = declaredEncoding;
+        
+        try {
+            setText(in);
+            
+            return detect().getReader();
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Autodetect the charset of a byte array, and return a String
+     * containing the converted input data.
+     * <p/>
+     * This is a convenience method that stores the declared encoding, calls
+     *   <code>setText(in)</code> and returns <code>detect().getString(-1)</code>.
+     *<p/>
+     * Raise an exception if no charsets appear to match the input data.
+     * 
+     * @param in The byte data in the unknown charset.
+     *
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     *
+     * @return the converted text, or <code>null</code> if an IOException
+     *         occurred during conversion.
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public String getString(byte[] in, String declaredEncoding)
+    {
+        String result = null;
+
+        fDeclaredEncoding = declaredEncoding;
+
+        try {
+            setText(in);
+            result = detect().getString(-1);
+        } catch (IOException e) {
+            // An I/O failure is reported to the caller as a null result.
+            result = null;
+        }
+
+        return result;
+    }
+
+ 
+    /**
+     * Get the names of all char sets that can be recognized by the char set detector.
+     * <p/>
+     * The returned array is a fresh copy on every call, so callers may modify
+     * it without affecting the detector or other callers.
+     *
+     * @return an array of the names of all charsets that can be recognized
+     * by the charset detector.
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public static String[] getAllDetectableCharsets() {
+        // Hand out a copy: fCharsetNames is shared static state and must not be
+        // exposed to mutation by callers.
+        return (String[]) fCharsetNames.clone();
+    }
+    
+    /**
+     * Test whether or not input filtering is enabled.
+     * Filtering is off for a newly constructed detector.
+     * 
+     * @return <code>true</code> if input text will be filtered.
+     * 
+     * @see #enableInputFilter
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public boolean inputFilterEnabled()
+    {
+        return fStripTags;
+    }
+    
+    /**
+     * Enable filtering of input text. If filtering is enabled,
+     * text within angle brackets ("<" and ">") will be removed
+     * before detection.
+     * <p/>
+     * The setting is consulted when input text is set; this method does not
+     * reprocess text that was supplied earlier, so call it before setText().
+     * 
+     * @param filter <code>true</code> to enable input text filtering.
+     * 
+     * @return The previous setting.
+     *
+     * @draft ICU 3.4
+     * @deprecated This is a draft API and might change in a future release of ICU.
+     */
+    public boolean enableInputFilter(boolean filter)
+    {
+        // Remember the old value so it can be handed back to the caller.
+        boolean previous = fStripTags;
+        
+        fStripTags = filter;
+        
+        return previous;
+    }
+    
+    /**
+     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
+     *               it by removing what appears to be html markup, then tally
+     *               byte statistics over the result.
+     *  <p/>
+     *  Reads fRawInput / fRawLength and fStripTags; writes fInputBytes, fInputLen,
+     *  fByteStats and fC1Bytes.  At most fInputBytes.length bytes are kept, even
+     *  when the caller supplied a longer byte array.
+     * 
+     * @internal
+     */
+    private void MungeInput() {
+        int srci = 0;
+        int dsti = 0;
+        byte b;
+        boolean  inMarkup = false;
+        int      openTags = 0;
+        int      badTags  = 0;
+        
+        //
+        //  html / xml markup stripping.
+        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+        //     discard everything within < brackets >
+        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
+        //     guess as to whether the input was actually marked up at all.
+        if (fStripTags) {
+            // Stop once the output buffer is full.  A byte array given to setText()
+            // can be longer than fInputBytes, and writing past the end would throw
+            // ArrayIndexOutOfBoundsException.
+            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
+                b = fRawInput[srci];
+                if (b == (byte)'<') {
+                    if (inMarkup) {
+                        badTags++;
+                    }
+                    inMarkup = true;
+                    openTags++;
+                }
+                
+                if (! inMarkup) {
+                    fInputBytes[dsti++] = b;
+                }
+                
+                if (b == (byte)'>') {
+                    inMarkup = false;
+                }        
+            }
+            
+            fInputLen = dsti;
+        }
+        
+        //
+        //  If it looks like this input wasn't marked up, or if it looks like it's
+        //    essentially nothing but markup abandon the markup stripping.
+        //    Detection will have to work on the unstripped input.
+        //    (With filtering off, openTags is 0, so this branch is always taken.)
+        //
+        if (openTags<5 || openTags/5 < badTags || 
+                (fInputLen < 100 && fRawLength>600)) {
+            int limit = Math.min(fRawLength, kBufSize);
+            
+            System.arraycopy(fRawInput, 0, fInputBytes, 0, limit);
+            fInputLen = limit;
+        }
+        
+        //
+        // Tally up the byte occurrence statistics.
+        //   These are available for use by the various detectors.
+        //
+        Arrays.fill(fByteStats, (short)0);
+        for (srci=0; srci<fInputLen; srci++) {
+            int val = fInputBytes[srci] & 0x00ff;
+            fByteStats[val]++;
+        }
+        
+        // Note whether any byte in the range 0x80 - 0x9F is present.
+        fC1Bytes = false;
+        for (int i = 0x80; i <= 0x9F; i += 1) {
+            if (fByteStats[i] != 0) {
+                fC1Bytes = true;
+                break;
+            }
+        }
+    }
+
+    /**
+     *  The following items are accessed by individual CharsetRecognizers during
+     *     the recognition process
+     * 
+     * @internal
+     */
+    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
+                   new byte[kBufSize];  //   removed if appropriate.
+    
+    int         fInputLen;          // Length of the byte data in fInputText.
+    
+    short       fByteStats[] =      // byte frequency statistics for the input text.
+                   new short[256];  //   Value is percent, not absolute.
+                                    //   Value is rounded up, so zero really means zero occurences.
+    
+    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
+                   false;
+    
+    String      fDeclaredEncoding;
+    
+    
+
+    //
+    //  Stuff private to CharsetDetector
+    //
+    byte[]               fRawInput;     // Original, untouched input bytes.
+                                        //  If user gave us a byte array, this is it.
+                                        //  If user gave us a stream, it's read to a 
+                                        //  buffer here.
+    int                  fRawLength;    // Length of data in fRawInput array.
+    
+    InputStream          fInputStream;  // User's input stream, or null if the user
+                                        //   gave us a byte array.
+     
+    boolean              fStripTags =   // If true, setText() will strip tags from input text.
+                           false;
+    
+    
+    /**
+     *  List of recognizers for all charsets known to the implementation.
+     *
+     * @internal
+     */
+    private static ArrayList fCSRecognizers = createRecognizers();
+    private static String [] fCharsetNames;
+    
+   /**
+     * Create the singleton instances of the CharsetRecognizer classes
+     * and, as a side effect, fill in fCharsetNames.
+     * <p/>
+     * This method runs as the static initializer of fCSRecognizers.  It assigns
+     * fCharsetNames directly, which only works because fCharsetNames is declared
+     * without an initializer of its own; giving that field an initializer would
+     * overwrite the value stored here.
+     * <p/>
+     * The order of the list matters: the name list is de-duplicated by comparing
+     * each name only with the one before it, so recognizers that share a charset
+     * name must stay adjacent.
+     *
+     * @return the list of all recognizers, in the order they are tried
+     * 
+     * @internal
+     */
+    private static ArrayList createRecognizers() {
+        ArrayList recognizers = new ArrayList();
+        
+        // Unicode encodings.
+        recognizers.add(new CharsetRecog_UTF8());
+        
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());;
+        
+        // Multi-byte and ISO-2022 encodings.
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+        
+        // Single-byte encodings, one recognizer per charset / language pair.
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+        
+        // Create an array of all charset names, as a side effect.
+        // Needed for the getAllDetectableCharsets() API.
+        // Sized for the worst case (every recognizer has a distinct name).
+        String[] charsetNames = new String [recognizers.size()];
+        int out = 0;
+        
+        for (int i = 0; i < recognizers.size(); i++) {
+            String name = ((CharsetRecognizer)recognizers.get(i)).getName();
+            
+            // Skip a name that repeats the one just stored; only adjacent
+            // duplicates are collapsed.
+            if (out == 0 || ! name.equals(charsetNames[out - 1])) {
+                charsetNames[out++] = name;
+            }
+        }
+        
+        // Trim the name array to the number of distinct names actually stored.
+        fCharsetNames = new String[out];
+        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+        
+        return recognizers;
+    }
+}
commit	2eacb91581c46722e4e851c1b2869fde46b3cba5	[log] [tgz]
author	(no author) <anonymous@svn.icu-project.org>	Wed Oct 05 21:49:52 2005 +0000
committer	(no author) <anonymous@svn.icu-project.org>	Wed Oct 05 21:49:52 2005 +0000
tree	5dbf7db6a8f4b2e0621d87a3d613855a75e9ef3a