tools/misc/src/com/ibm/icu/dev/tool/charsetdet/mbcs/BIG5Tool.java - external/github.com/unicode-org/icu - Git at Google

 /*
  ***********************************************************************
  *
  * Copyright (C) 2006-2012, International Business Machines Corporation and
  * others. All Rights Reserved.
  *
  ***********************************************************************
  *
  * BIG5Tool
  *
  *    This tool produces the character usage frequency statistics for the Big5
  *    Chinese charset, for use by the ICU charset detectors.
  *
  *    usage:  java BIG5Tool [-d] [directory path]
  *
  *        -d:   Produce the data in a form to be exported to the ICU implementation
  *              Default is to produce an informative dump.
  *
  *        -sjis Do Shift_JIS.  The structure of sjis is very similar to Big5.
  *
  *        directory path
  *              Source directory for the text files to be analyzed.
  *              All files in the specified directory must be in the Big5 encoding.
  *
  */

 package com.ibm.icu.dev.tool.charsetdet.mbcs;

 import java.io.File;
 import java.io.FileInputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;


 public class BIG5Tool {

     // The file buffer and file data length need to be out in class member variables
     //  so that the code lifted from charSet detection for scanning the multi-byte chars
     //  can see them conveniently.
     byte []    buf = new byte[1000000];
     int        fileSize;

     boolean    option_d = false;    // data option.  Produce exportable data
     boolean    option_v = true;     // verbose informaional output.
     boolean    sjis     = false;    // True if input text files are Shift_JIS encoded.


     public static void main(String[] args) {
         BIG5Tool  This = new BIG5Tool();
         This.Main(args);
     }


     void Main(String[] args) {
         int i;

         //
         //   Command Line Option Handling
         //
         String     dirName  = null;
         for (i=0; i<args.length; i++) {
             if (args[i].equals("-d")) {
                 option_d = true;
                 option_v = false;
                 continue;
             }
             if (args[i].equals("-sjis")) {
                 sjis = true;
                 continue;
             }
             if (args[i].startsWith("-")) {
                 System.err.println("Unrecognized option: " + args[i]);
                 System.exit(-1);
             }
             if (dirName == null) {
                 dirName = args[i];
             } else {
                 System.err.println("Unrecognized option: " + dirName);
                 System.exit(-1);
             }
         }
         if (dirName == null) {
             dirName = ".";
         }

         //
         //  Verify that the specified directory exists.
         //
         File dir = new File(dirName);
         if (dir.isDirectory() == false) {
             System.err.println("\"" + dirName + "\" is not a directory");
             System.exit(-1);
         }
         processDir(dir);

     }

     //
     // Collect statistics from all ordinary files in a specified directory.
     //
     void processDir(File dir) {
         int      totalMbcsChars  = 0;
         HashMap  m = new HashMap(10000);
         int      i;

         System.out.println(dir.getName());
         File[] files = dir.listFiles();
         for (i=0; i<files.length; i++) {
             FileInputStream is = null;
             try {
                 if (files[i].isFile()) {
                     is = new FileInputStream(files[i]);
                     fileSize = is.read(buf);
                     if (option_v) {
                         System.out.println(files[i].getPath());
                         System.out.println("  " + fileSize + " bytes.");
                     }
                     iteratedChar ichar = new iteratedChar();
                     int fileChars     = 0;
                     int fileMbcsChars = 0;
                     int errs          = 0;

                     while (nextChar(ichar)) {
                         if (ichar.error == true) {
                             errs++;
                             continue;
                         }
                         fileChars++;
                         if (ichar.charValue > 255) {
                             fileMbcsChars++;
                             totalMbcsChars++;
                         }
                         if (ichar.charValue <= 255) {
                             // Don't keep occurence statistics for the single byte range
                             continue;
                         }

                         //
                         //  Frequency of occurence statistics are accumulated in a map.
                         //
                         ChEl  keyEl = new ChEl(ichar.charValue, 0);
                         ChEl  valEl = (ChEl)m.get(keyEl);
                         if (valEl == null) {
                             m.put(keyEl, keyEl);
                             valEl = keyEl;
                         }
                         valEl.occurences++;
                     }
                     if (option_v) {
                         System.out.println("  " + fileChars     + " Chars");
                         System.out.println("  " + fileMbcsChars + " mbcs Chars");
                         System.out.println("  " + errs          + " errors");
                         System.out.println("\n");
                     }
                 }
             }
             catch (Exception e) {
                 System.err.println("Exception:" + e);

             }
             finally {
                 if (is != null) {
                     try {
                         is.close();
                     } catch (Exception e) {
                         // ignore
                     }
                 }
             }
         }

         //
         //  We've processed through all of the files.
         //     sort and dump out the frequency statistics.
         //
         Object [] encounteredChars = m.values().toArray();
         Arrays.sort(encounteredChars);
         int cumulativeChars = 0;
         int cumulativePercent = 0;
         if (option_v) {
             System.out.println("# <char code> <occurences>  <Cumulative %>");
             for (i=0; i<encounteredChars.length; i++) {
                 ChEl c = (ChEl)encounteredChars[i];
                 cumulativeChars += c.occurences;
                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
                 System.out.println(i + "   " + Integer.toHexString(c.charCode) + "        "
                         + c.occurences + "         " + cumulativePercent);
             }
         }
         if (option_d) {
             //
             //   Output the list of characters formatted for pasting into a
             //     Java source code array initializer.
             //     Resort into order based on the character code value, not
             //      on frequency of occurence.
             //
             List  charList = new ArrayList();

             for (i=0; i<100 && cumulativePercent<50; i++) {
                 ChEl c = (ChEl)encounteredChars[i];
                 cumulativeChars += c.occurences;
                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
                 charList.add(new Integer(c.charCode));
             }
             Object [] sortedChars = charList.toArray();
             Arrays.sort(sortedChars);

             System.out.print("          {");
             for (i=0; i<sortedChars.length; i++) {
                 if (i != 0) {
                     System.out.print(", ");
                     if ((i)%10 == 0) {
                         System.out.print("\n           ");
                     }
                 }
                 int cp = ((Integer)sortedChars[i]).intValue();
                 System.out.print("0x" + Integer.toHexString(cp));
             }
             System.out.println("};");
         }
     }

     //
     //  This is a little class containing a
     //    multi-byte character value and an occurence count for that char.
     //  Instances of this class are kept in the collection that accumulates statistics
     //
     //  WARNING:  this class's natural ordering (from Comparable) and equals()
     //            are inconsistent.

     static class ChEl implements Comparable {
         int charCode;
         int occurences;

         ChEl(int c, int o) {
             charCode = c;
             occurences = o;
         }

         // Equals needs to work with a map, with the charCode as the key.
         //   For insertion/lookup, we care about the char code only, not the occurence count.
         public boolean equals(Object other) {
             ChEl o = (ChEl)other;
             return o.charCode == this.charCode;
         }

         // Hashcode needs to be compatible with equals
         //   We're using this in a hashMap!
         public int hashCode() {
             return charCode;
         }

         // We want to be able to sort the results by frequency of occurence
         //   Compare backwards.  We want most frequent chars first.
         public int compareTo(Object other) {
             ChEl o = (ChEl)other;
             return (this.occurences> o.occurences? -1 :
                    (this.occurences==o.occurences?  0 : 1));
         }

     }

     //
     // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
     //              Pulls out one logical char according to the rules of EUC encoding.
     //
     class iteratedChar {
         int             charValue = 0;             // The char value is a value from the encoding.
                                                    //   It's meaning is not well defined, other than
                                                    //   different encodings
         int             index     = 0;
         int             nextIndex = 0;
         boolean         error     = false;
         boolean         done      = false;

         void reset() {
             charValue = 0;
             index     = -1;
             nextIndex = 0;
             error     = false;
             done      = false;
         }

         int nextByte() {
             if (nextIndex >= fileSize) {
                 done = true;
                 return -1;
             }
             int byteValue = (int)buf[nextIndex++] & 0x00ff;
             return byteValue;
         }
     }


     boolean nextChar(iteratedChar it) {
         it.index = it.nextIndex;
         it.error = false;
         int firstByte  = 0;
         int secondByte = 0;

         buildChar: {
             firstByte = it.charValue = it.nextByte();
             if (firstByte < 0) {
                 // Ran off the end of the input data
                 it.done = true;
                 break buildChar;
             }
             if (firstByte <= 0x0080 ||
                     (sjis && firstByte>=0x00a0 && firstByte< 0x00e0) ||
                     (sjis && firstByte>=0x00fd && firstByte<=0x00ff)) {
                 // single byte char
                 break buildChar;
             }

             secondByte = it.nextByte();
             it.charValue = (it.charValue << 8) | secondByte;

             if (secondByte <  0x40 ||
                 secondByte == 0x007f ||
                 secondByte == 0x00ff ||
                 sjis && secondByte >= 0x00fd) {
                     it.error = true;
             }

             if (it.error) {
                 System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
             }
        }

         return (it.done == false);
     }

 }
	/*
	***********************************************************************
	*
	* Copyright (C) 2006-2012, International Business Machines Corporation and
	* others. All Rights Reserved.
	*
	***********************************************************************
	*
	* BIG5Tool
	*
	* This tool produces the character usage frequency statistics for the Big5
	* Chinese charset, for use by the ICU charset detectors.
	*
	* usage: java BIG5Tool [-d] [directory path]
	*
	* -d: Produce the data in a form to be exported to the ICU implementation
	* Default is to produce an informative dump.
	*
	* -sjis Do Shift_JIS. The structure of sjis is very similar to Big5.
	*
	* directory path
	* Source directory for the text files to be analyzed.
	* All files in the specified directory must be in the Big5 encoding.
	*
	*/

	package com.ibm.icu.dev.tool.charsetdet.mbcs;

	import java.io.File;
	import java.io.FileInputStream;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.HashMap;
	import java.util.List;


	public class BIG5Tool {

	// The file buffer and file data length need to be out in class member variables
	// so that the code lifted from charSet detection for scanning the multi-byte chars
	// can see them conveniently.
	byte [] buf = new byte[1000000];
	int fileSize;

	boolean option_d = false; // data option. Produce exportable data
	boolean option_v = true; // verbose informaional output.
	boolean sjis = false; // True if input text files are Shift_JIS encoded.



	public static void main(String[] args) {
	BIG5Tool This = new BIG5Tool();
	This.Main(args);
	}



	void Main(String[] args) {
	int i;

	//
	// Command Line Option Handling
	//
	String dirName = null;
	for (i=0; i<args.length; i++) {
	if (args[i].equals("-d")) {
	option_d = true;
	option_v = false;
	continue;
	}
	if (args[i].equals("-sjis")) {
	sjis = true;
	continue;
	}
	if (args[i].startsWith("-")) {
	System.err.println("Unrecognized option: " + args[i]);
	System.exit(-1);
	}
	if (dirName == null) {
	dirName = args[i];
	} else {
	System.err.println("Unrecognized option: " + dirName);
	System.exit(-1);
	}
	}
	if (dirName == null) {
	dirName = ".";
	}

	//
	// Verify that the specified directory exists.
	//
	File dir = new File(dirName);
	if (dir.isDirectory() == false) {
	System.err.println("\"" + dirName + "\" is not a directory");
	System.exit(-1);
	}
	processDir(dir);

	}

	//
	// Collect statistics from all ordinary files in a specified directory.
	//
	void processDir(File dir) {
	int totalMbcsChars = 0;
	HashMap m = new HashMap(10000);
	int i;

	System.out.println(dir.getName());
	File[] files = dir.listFiles();
	for (i=0; i<files.length; i++) {
	FileInputStream is = null;
	try {
	if (files[i].isFile()) {
	is = new FileInputStream(files[i]);
	fileSize = is.read(buf);
	if (option_v) {
	System.out.println(files[i].getPath());
	System.out.println(" " + fileSize + " bytes.");
	}
	iteratedChar ichar = new iteratedChar();
	int fileChars = 0;
	int fileMbcsChars = 0;
	int errs = 0;

	while (nextChar(ichar)) {
	if (ichar.error == true) {
	errs++;
	continue;
	}
	fileChars++;
	if (ichar.charValue > 255) {
	fileMbcsChars++;
	totalMbcsChars++;
	}
	if (ichar.charValue <= 255) {
	// Don't keep occurence statistics for the single byte range
	continue;
	}

	//
	// Frequency of occurence statistics are accumulated in a map.
	//
	ChEl keyEl = new ChEl(ichar.charValue, 0);
	ChEl valEl = (ChEl)m.get(keyEl);
	if (valEl == null) {
	m.put(keyEl, keyEl);
	valEl = keyEl;
	}
	valEl.occurences++;
	}
	if (option_v) {
	System.out.println(" " + fileChars + " Chars");
	System.out.println(" " + fileMbcsChars + " mbcs Chars");
	System.out.println(" " + errs + " errors");
	System.out.println("\n");
	}
	}
	}
	catch (Exception e) {
	System.err.println("Exception:" + e);

	}
	finally {
	if (is != null) {
	try {
	is.close();
	} catch (Exception e) {
	// ignore
	}
	}
	}
	}

	//
	// We've processed through all of the files.
	// sort and dump out the frequency statistics.
	//
	Object [] encounteredChars = m.values().toArray();
	Arrays.sort(encounteredChars);
	int cumulativeChars = 0;
	int cumulativePercent = 0;
	if (option_v) {
	System.out.println("# <char code> <occurences> <Cumulative %>");
	for (i=0; i<encounteredChars.length; i++) {
	ChEl c = (ChEl)encounteredChars[i];
	cumulativeChars += c.occurences;
	cumulativePercent = cumulativeChars*100/totalMbcsChars;
	System.out.println(i + " " + Integer.toHexString(c.charCode) + " "
	+ c.occurences + " " + cumulativePercent);
	}
	}
	if (option_d) {
	//
	// Output the list of characters formatted for pasting into a
	// Java source code array initializer.
	// Resort into order based on the character code value, not
	// on frequency of occurence.
	//
	List charList = new ArrayList();

	for (i=0; i<100 && cumulativePercent<50; i++) {
	ChEl c = (ChEl)encounteredChars[i];
	cumulativeChars += c.occurences;
	cumulativePercent = cumulativeChars*100/totalMbcsChars;
	charList.add(new Integer(c.charCode));
	}
	Object [] sortedChars = charList.toArray();
	Arrays.sort(sortedChars);

	System.out.print(" {");
	for (i=0; i<sortedChars.length; i++) {
	if (i != 0) {
	System.out.print(", ");
	if ((i)%10 == 0) {
	System.out.print("\n ");
	}
	}
	int cp = ((Integer)sortedChars[i]).intValue();
	System.out.print("0x" + Integer.toHexString(cp));
	}
	System.out.println("};");
	}
	}

	//
	// This is a little class containing a
	// multi-byte character value and an occurence count for that char.
	// Instances of this class are kept in the collection that accumulates statistics
	//
	// WARNING: this class's natural ordering (from Comparable) and equals()
	// are inconsistent.

	static class ChEl implements Comparable {
	int charCode;
	int occurences;

	ChEl(int c, int o) {
	charCode = c;
	occurences = o;
	}

	// Equals needs to work with a map, with the charCode as the key.
	// For insertion/lookup, we care about the char code only, not the occurence count.
	public boolean equals(Object other) {
	ChEl o = (ChEl)other;
	return o.charCode == this.charCode;
	}

	// Hashcode needs to be compatible with equals
	// We're using this in a hashMap!
	public int hashCode() {
	return charCode;
	}

	// We want to be able to sort the results by frequency of occurence
	// Compare backwards. We want most frequent chars first.
	public int compareTo(Object other) {
	ChEl o = (ChEl)other;
	return (this.occurences> o.occurences? -1 :
	(this.occurences==o.occurences? 0 : 1));
	}

	}

	//
	// iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
	// Pulls out one logical char according to the rules of EUC encoding.
	//
	class iteratedChar {
	int charValue = 0; // The char value is a value from the encoding.
	// It's meaning is not well defined, other than
	// different encodings
	int index = 0;
	int nextIndex = 0;
	boolean error = false;
	boolean done = false;

	void reset() {
	charValue = 0;
	index = -1;
	nextIndex = 0;
	error = false;
	done = false;
	}

	int nextByte() {
	if (nextIndex >= fileSize) {
	done = true;
	return -1;
	}
	int byteValue = (int)buf[nextIndex++] & 0x00ff;
	return byteValue;
	}
	}


	boolean nextChar(iteratedChar it) {
	it.index = it.nextIndex;
	it.error = false;
	int firstByte = 0;
	int secondByte = 0;

	buildChar: {
	firstByte = it.charValue = it.nextByte();
	if (firstByte < 0) {
	// Ran off the end of the input data
	it.done = true;
	break buildChar;
	}
	if (firstByte <= 0x0080 \|\|
	(sjis && firstByte>=0x00a0 && firstByte< 0x00e0) \|\|
	(sjis && firstByte>=0x00fd && firstByte<=0x00ff)) {
	// single byte char
	break buildChar;
	}

	secondByte = it.nextByte();
	it.charValue = (it.charValue << 8) \| secondByte;

	if (secondByte < 0x40 \|\|
	secondByte == 0x007f \|\|
	secondByte == 0x00ff \|\|
	sjis && secondByte >= 0x00fd) {
	it.error = true;
	}

	if (it.error) {
	System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
	}
	}

	return (it.done == false);
	}

	}