src/com/ibm/icu/dev/tool/charsetdet/sbcs/Checker.java - external/github.com/unicode-org/icu - Git at Google

 /*
  ***********************************************************************
  * Copyright (C) 2005, International Business Machines Corporation and *
  * others. All Rights Reserved.                                        *
  ***********************************************************************
  *
  */

 package com.ibm.icu.dev.tool.charsetdet.sbcs;

 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;

 /**
  * Measures how often the n-grams found in a piece of text occur in a given
  * n-gram list, and prints a per-buffer histogram of the hit percentage for a
  * data file.
  *
  * <p>A <code>Checker</code> registers itself as the client of its own
  * {@link NGramParser}; the parser is presumably what calls
  * {@link #nextChar()} to pull characters and {@link #handleNGram(String)}
  * to report each n-gram (confirm against <code>NGramParser</code>).
  *
  * @author emader
  */
 public class Checker implements NGramParser.NGramParserClient
 {
     /** Number of characters requested from a data file per read in {@link #check(InputFile)}. */
     private static final int BUFFER_SIZE = 1024;

     /** One bucket per whole percentage, 0 through 100 inclusive. */
     private static final int HISTOGRAM_SIZE = 101;

     private NGramList ngrams;
     private int totalNGrams;
     private int totalHits;

     private String language;
     private String encoding;

     private int[] histogram;

     /** Buffer owned by this object; the only array file data is ever read into. */
     private char[] fileBuffer;

     /** Buffer currently being parsed: either fileBuffer or a caller's array. */
     private char[] buffer;
     private int bufIndex;
     private int bufMax;

     private NGramParser parser;

     /**
      * Creates a checker for the given n-gram list.
      *
      * TODO This should take cumulative percent and the name...
      *
      * @param list the n-gram list to look parsed n-grams up in
      * @param dataFile the file the list was built from; it is installed as
      *        the list's mapper, and its filename and encoding label this
      *        checker's output
      */
     public Checker(NGramList list, InputFile dataFile)
     {
         ngrams = list;
         ngrams.setMapper(dataFile);

         language = languageName(dataFile.getFilename());
         encoding = dataFile.getEncoding();

         fileBuffer = new char[BUFFER_SIZE];
         buffer = fileBuffer;
         parser = new NGramParser(this);
         resetCounts();

         // A freshly allocated int[] is already zero-filled.
         histogram = new int[HISTOGRAM_SIZE];
     }

     /**
      * Counts one n-gram, and counts it as a hit when the n-gram list
      * contains it.
      *
      * @param key the n-gram to look up
      */
     public void handleNGram(String key)
     {
         totalNGrams += 1;

         if (ngrams.get(key) != null) {
             totalHits += 1;
         }
     }

     /** Rewinds the read position and clears the n-gram and hit counters. */
     private void resetCounts()
     {
         bufIndex = 0;
         totalNGrams = totalHits = 0;
     }

     /** Zeroes every histogram bucket. */
     private void resetHistogram()
     {
         for (int i = 0; i < histogram.length; i += 1) {
             histogram[i] = 0;
         }
     }

     /** Reports an exception on standard error. */
     private static void exceptionError(Exception e)
     {
         System.err.println("ioError: " + e.toString());
     }

     /**
      * Returns the part of a filename before its first '.', or the whole
      * filename when it contains no '.'.
      */
     private static String languageName(String filename)
     {
         int dot = filename.indexOf('.');

         // indexOf returns -1 when there is no extension; substring(0, -1) would throw.
         return dot < 0 ? filename : filename.substring(0, dot);
     }

     /**
      * Reads the next chunk of the file into this object's own buffer.
      *
      * @return <code>true</code> if the read succeeded and did not report a
      *         negative count; <code>false</code> on a negative count or
      *         after an exception (which is reported on standard error)
      */
     private boolean nextBuffer(InputFile inputFile)
     {
         // Always read into our own array: checkBuffer() may have left
         // 'buffer' pointing at a caller's array of arbitrary size.
         buffer = fileBuffer;

         try {
             bufMax = inputFile.read(fileBuffer);
         } catch (Exception e) {
             bufMax = -1;
             exceptionError(e);

             return false;
         }

         bufIndex = 0;

         return bufMax >= 0;
     }

     /** Clears the counters and runs the parser over the current buffer. */
     private void parseBuffer()
     {
         resetCounts();
         parser.reset();
         parser.parse();
     }

     /**
      * Returns the whole-number percentage (0 to 100) of counted n-grams that
      * were hits, or 0 when no n-grams were counted.
      */
     private int percentHits()
     {
         if (totalNGrams == 0) {
             return 0;
         }

         // Integer arithmetic avoids floating-point truncation such as
         // 29 / 100 * 100.0 == 28.999..., which would land in bucket 28.
         return (int) (totalHits * 100L / totalNGrams);
     }

     /**
      * Returns the next character of the current buffer.
      *
      * @return the next character, or 0 once the buffer is exhausted
      */
     public char nextChar()
     {
         if (bufIndex >= bufMax) {
             return 0;
         }

         return buffer[bufIndex++];
     }

     /**
      * @return the language name derived from the filename passed to the
      *         constructor
      */
     public String getLanguage()
     {
         return language;
     }

     /**
      * Installs <code>file</code> as the mapper of the underlying n-gram list.
      */
     public void setMapper(InputFile file)
     {
         ngrams.setMapper(file);
     }

     /**
      * Parses the first <code>charCount</code> characters of
      * <code>theBuffer</code> and counts the n-grams found in the list.
      *
      * @param theBuffer the characters to parse; used in place, not copied
      * @param charCount the number of valid characters; must not exceed
      *        <code>theBuffer.length</code>
      * @return the number of parsed n-grams that are in the n-gram list
      */
     public int checkBuffer(char[] theBuffer, int charCount)
     {
         buffer = theBuffer;
         bufMax = charCount;

         parseBuffer();

         return totalHits;
     }

     /**
      * Reads <code>dataFile</code> buffer by buffer, computes for each buffer
      * the percentage of its n-grams that are in the n-gram list, and prints
      * a histogram of those percentages (from the lowest to the highest
      * percentage seen) to standard output.
      *
      * @param dataFile the file to check; it is opened here and closed again
      *        before this method returns, even if checking fails
      */
     public void check(InputFile dataFile)
     {
         int minHist = HISTOGRAM_SIZE, maxHist = -1;

         dataFile.open();

         try {
             String dataFilename = dataFile.getFilename();
             String fileEncoding = dataFile.getEncoding();

             System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");

             setMapper(dataFile);
             resetHistogram();

             while (nextBuffer(dataFile)) {
                 parseBuffer();

                 int ph = percentHits();

                 if (ph < minHist) {
                     minHist = ph;
                 }

                 if (ph > maxHist) {
                     maxHist = ph;
                 }

                 histogram[ph] += 1;
             }

             // Prints nothing when no buffer was read (minHist > maxHist).
             for (int ph = minHist; ph <= maxHist; ph += 1) {
                 System.out.println(ph + "\t" + histogram[ph]);
             }

             System.out.println();
         } finally {
             dataFile.close();
         }
     }
 }
	/*
	***********************************************************************
	* Copyright (C) 2005, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	***********************************************************************
	*
	*/

	package com.ibm.icu.dev.tool.charsetdet.sbcs;

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.InputStreamReader;

	/**
	 * Measures how often the n-grams found in a piece of text occur in a given
	 * n-gram list, and prints a per-buffer histogram of the hit percentage for a
	 * data file.
	 *
	 * <p>A <code>Checker</code> registers itself as the client of its own
	 * {@link NGramParser}; the parser is presumably what calls
	 * {@link #nextChar()} to pull characters and {@link #handleNGram(String)}
	 * to report each n-gram (confirm against <code>NGramParser</code>).
	 *
	 * @author emader
	 */
	public class Checker implements NGramParser.NGramParserClient
	{
	    /** Number of characters requested from a data file per read in {@link #check(InputFile)}. */
	    private static final int BUFFER_SIZE = 1024;

	    /** One bucket per whole percentage, 0 through 100 inclusive. */
	    private static final int HISTOGRAM_SIZE = 101;

	    private NGramList ngrams;
	    private int totalNGrams;
	    private int totalHits;

	    private String language;
	    private String encoding;

	    private int[] histogram;

	    /** Buffer owned by this object; the only array file data is ever read into. */
	    private char[] fileBuffer;

	    /** Buffer currently being parsed: either fileBuffer or a caller's array. */
	    private char[] buffer;
	    private int bufIndex;
	    private int bufMax;

	    private NGramParser parser;

	    /**
	     * Creates a checker for the given n-gram list.
	     *
	     * TODO This should take cumulative percent and the name...
	     *
	     * @param list the n-gram list to look parsed n-grams up in
	     * @param dataFile the file the list was built from; it is installed as
	     *        the list's mapper, and its filename and encoding label this
	     *        checker's output
	     */
	    public Checker(NGramList list, InputFile dataFile)
	    {
	        ngrams = list;
	        ngrams.setMapper(dataFile);

	        language = languageName(dataFile.getFilename());
	        encoding = dataFile.getEncoding();

	        fileBuffer = new char[BUFFER_SIZE];
	        buffer = fileBuffer;
	        parser = new NGramParser(this);
	        resetCounts();

	        // A freshly allocated int[] is already zero-filled.
	        histogram = new int[HISTOGRAM_SIZE];
	    }

	    /**
	     * Counts one n-gram, and counts it as a hit when the n-gram list
	     * contains it.
	     *
	     * @param key the n-gram to look up
	     */
	    public void handleNGram(String key)
	    {
	        totalNGrams += 1;

	        if (ngrams.get(key) != null) {
	            totalHits += 1;
	        }
	    }

	    /** Rewinds the read position and clears the n-gram and hit counters. */
	    private void resetCounts()
	    {
	        bufIndex = 0;
	        totalNGrams = totalHits = 0;
	    }

	    /** Zeroes every histogram bucket. */
	    private void resetHistogram()
	    {
	        for (int i = 0; i < histogram.length; i += 1) {
	            histogram[i] = 0;
	        }
	    }

	    /** Reports an exception on standard error. */
	    private static void exceptionError(Exception e)
	    {
	        System.err.println("ioError: " + e.toString());
	    }

	    /**
	     * Returns the part of a filename before its first '.', or the whole
	     * filename when it contains no '.'.
	     */
	    private static String languageName(String filename)
	    {
	        int dot = filename.indexOf('.');

	        // indexOf returns -1 when there is no extension; substring(0, -1) would throw.
	        return dot < 0 ? filename : filename.substring(0, dot);
	    }

	    /**
	     * Reads the next chunk of the file into this object's own buffer.
	     *
	     * @return <code>true</code> if the read succeeded and did not report a
	     *         negative count; <code>false</code> on a negative count or
	     *         after an exception (which is reported on standard error)
	     */
	    private boolean nextBuffer(InputFile inputFile)
	    {
	        // Always read into our own array: checkBuffer() may have left
	        // 'buffer' pointing at a caller's array of arbitrary size.
	        buffer = fileBuffer;

	        try {
	            bufMax = inputFile.read(fileBuffer);
	        } catch (Exception e) {
	            bufMax = -1;
	            exceptionError(e);

	            return false;
	        }

	        bufIndex = 0;

	        return bufMax >= 0;
	    }

	    /** Clears the counters and runs the parser over the current buffer. */
	    private void parseBuffer()
	    {
	        resetCounts();
	        parser.reset();
	        parser.parse();
	    }

	    /**
	     * Returns the whole-number percentage (0 to 100) of counted n-grams that
	     * were hits, or 0 when no n-grams were counted.
	     */
	    private int percentHits()
	    {
	        if (totalNGrams == 0) {
	            return 0;
	        }

	        // Integer arithmetic avoids floating-point truncation such as
	        // 29 / 100 * 100.0 == 28.999..., which would land in bucket 28.
	        return (int) (totalHits * 100L / totalNGrams);
	    }

	    /**
	     * Returns the next character of the current buffer.
	     *
	     * @return the next character, or 0 once the buffer is exhausted
	     */
	    public char nextChar()
	    {
	        if (bufIndex >= bufMax) {
	            return 0;
	        }

	        return buffer[bufIndex++];
	    }

	    /**
	     * @return the language name derived from the filename passed to the
	     *         constructor
	     */
	    public String getLanguage()
	    {
	        return language;
	    }

	    /**
	     * Installs <code>file</code> as the mapper of the underlying n-gram list.
	     */
	    public void setMapper(InputFile file)
	    {
	        ngrams.setMapper(file);
	    }

	    /**
	     * Parses the first <code>charCount</code> characters of
	     * <code>theBuffer</code> and counts the n-grams found in the list.
	     *
	     * @param theBuffer the characters to parse; used in place, not copied
	     * @param charCount the number of valid characters; must not exceed
	     *        <code>theBuffer.length</code>
	     * @return the number of parsed n-grams that are in the n-gram list
	     */
	    public int checkBuffer(char[] theBuffer, int charCount)
	    {
	        buffer = theBuffer;
	        bufMax = charCount;

	        parseBuffer();

	        return totalHits;
	    }

	    /**
	     * Reads <code>dataFile</code> buffer by buffer, computes for each buffer
	     * the percentage of its n-grams that are in the n-gram list, and prints
	     * a histogram of those percentages (from the lowest to the highest
	     * percentage seen) to standard output.
	     *
	     * @param dataFile the file to check; it is opened here and closed again
	     *        before this method returns, even if checking fails
	     */
	    public void check(InputFile dataFile)
	    {
	        int minHist = HISTOGRAM_SIZE, maxHist = -1;

	        dataFile.open();

	        try {
	            String dataFilename = dataFile.getFilename();
	            String fileEncoding = dataFile.getEncoding();

	            System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");

	            setMapper(dataFile);
	            resetHistogram();

	            while (nextBuffer(dataFile)) {
	                parseBuffer();

	                int ph = percentHits();

	                if (ph < minHist) {
	                    minHist = ph;
	                }

	                if (ph > maxHist) {
	                    maxHist = ph;
	                }

	                histogram[ph] += 1;
	            }

	            // Prints nothing when no buffer was read (minHist > maxHist).
	            for (int ph = minHist; ph <= maxHist; ph += 1) {
	                System.out.println(ph + "\t" + histogram[ph]);
	            }

	            System.out.println();
	        } finally {
	            dataFile.close();
	        }
	    }
	}