blob: d3a30bc8d39b180919ca66ad3348060f5cc46f24 [file] [log] [blame]
/*
***********************************************************************
* Copyright (C) 2005-2006, International Business Machines *
* Corporation and others. All Rights Reserved. *
***********************************************************************
*
*/
package com.ibm.icu.dev.tool.charsetdet.sbcs;
/**
 * Measures how well text matches a reference list of n-grams.
 *
 * A <code>Checker</code> feeds characters to an {@link NGramParser}, counts
 * how many of the n-grams reported back are present in its {@link NGramList},
 * and can print a histogram of per-buffer hit percentages for a data file.
 *
 * @author emader
 */
public class Checker implements NGramParser.NGramParserClient
{
    /** Number of characters read from an input file per histogram sample. */
    private static final int BUFFER_SIZE = 1024;

    /** One bucket for every whole percentage from 0 to 100 inclusive. */
    private static final int HISTOGRAM_SIZE = 101;

    private final NGramList ngrams;
    private final String language;
    private final String encoding;
    private final int[] histogram;
    private final NGramParser parser;

    /** N-grams seen, and n-grams found in the list, since the last reset. */
    private int totalNGrams;
    private int totalHits;

    /** Characters currently being parsed; nextChar() reads [bufIndex, bufMax). */
    private char[] buffer;
    private int bufIndex;
    private int bufMax;

    /**
     * Builds a checker for the given n-gram list. The language name is taken
     * from the data file's name and the encoding from the data file itself.
     *
     * TODO This should take cumulative percent and the name...
     *
     * @param list the reference n-grams to look text up in
     * @param dataFile the file the list was built from; also installed as
     *                 the list's mapper
     */
    public Checker(NGramList list, InputFile dataFile)
    {
        ngrams = list;
        ngrams.setMapper(dataFile);

        language = languageName(dataFile.getFilename());
        encoding = dataFile.getEncoding();

        buffer = new char[BUFFER_SIZE];
        parser = new NGramParser(this);
        resetCounts();

        // A freshly allocated int[] is already all zeros.
        histogram = new int[HISTOGRAM_SIZE];
    }

    /**
     * Records one n-gram: every call counts towards the total, and calls
     * whose key is present in the n-gram list also count as hits.
     * Presumably invoked by the parser while it parses the buffer.
     *
     * @param key the n-gram to look up
     */
    public void handleNGram(String key)
    {
        totalNGrams += 1;

        if (ngrams.get(key) != null) {
            totalHits += 1;
        }
    }

    /** Rewinds the buffer position and clears the n-gram and hit counters. */
    private void resetCounts()
    {
        bufIndex = 0;
        totalNGrams = 0;
        totalHits = 0;
    }

    /** Zeroes every histogram bucket. */
    private void resetHistogram()
    {
        for (int i = 0; i < histogram.length; i += 1) {
            histogram[i] = 0;
        }
    }

    /** Reports an exception on standard error. */
    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    /**
     * Derives a language name from a file name: the part before the first
     * '.', or the whole name when it contains no '.'.
     */
    private static String languageName(String filename)
    {
        int dot = filename.indexOf('.');

        // indexOf returns -1 when there is no '.', which substring rejects.
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Reads the next chunk of the input file into the buffer.
     *
     * @return <code>true</code> if the read reported a non-negative count;
     *         <code>false</code> on a negative count or when the read threw
     */
    private boolean nextBuffer(InputFile inputFile)
    {
        try {
            bufMax = inputFile.read(buffer);
        } catch (Exception e) {
            bufMax = -1;
            exceptionError(e);
            return false;
        }

        bufIndex = 0;
        return bufMax >= 0;
    }

    /** Clears the counters and runs the parser over the current buffer. */
    private void parseBuffer()
    {
        resetCounts();
        parser.reset();
        parser.parse();
    }

    /**
     * Supplies the next character of the current buffer.
     *
     * @return the next character, or 0 once the buffer is exhausted
     */
    public char nextChar()
    {
        if (bufIndex >= bufMax) {
            return 0;
        }

        return buffer[bufIndex++];
    }

    /**
     * @return the language name derived from the constructor's data file
     */
    public String getLanguage()
    {
        return language;
    }

    /**
     * Installs the given file as the n-gram list's mapper.
     *
     * @param file the file to pass to the list's <code>setMapper</code>
     */
    public void setMapper(InputFile file)
    {
        ngrams.setMapper(file);
    }

    /**
     * Parses the caller's characters and counts the n-grams that are in
     * the list.
     *
     * @param theBuffer the characters to check
     * @param charCount how many characters of <code>theBuffer</code> to use
     * @return the number of n-grams from the buffer found in the list
     */
    public int checkBuffer(char[] theBuffer, int charCount)
    {
        char[] ownBuffer = buffer;

        buffer = theBuffer;
        bufMax = charCount;

        try {
            parseBuffer();
        } finally {
            // Put the internal buffer back so that a later check() does not
            // read file data into the caller's array.
            buffer = ownBuffer;
            bufIndex = 0;
            bufMax = 0;
        }

        return totalHits;
    }

    /**
     * Reads the data file one buffer at a time and prints, on standard
     * output, a histogram of the whole-number percentage of n-grams in each
     * buffer that were found in the list. Only the range from the lowest to
     * the highest percentage seen is printed. Buffers that produce no
     * n-grams are not counted.
     *
     * @param dataFile the file to check; opened and closed by this method
     */
    public void check(InputFile dataFile)
    {
        int minHist = HISTOGRAM_SIZE;
        int maxHist = -1;

        dataFile.open();

        try {
            String dataFilename = dataFile.getFilename();
            String fileEncoding = dataFile.getEncoding();

            System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");

            setMapper(dataFile);
            resetHistogram();

            while (nextBuffer(dataFile)) {
                parseBuffer();

                // With no n-grams there is no percentage to record.
                if (totalNGrams == 0) {
                    continue;
                }

                // Integer arithmetic avoids floating-point results such as
                // 28.999... truncating into the wrong bucket. totalHits never
                // exceeds totalNGrams, so the result is within 0..100.
                int ph = (int) ((long) totalHits * 100 / totalNGrams);

                if (ph < minHist) {
                    minHist = ph;
                }

                if (ph > maxHist) {
                    maxHist = ph;
                }

                histogram[ph] += 1;
            }

            for (int ph = minHist; ph <= maxHist; ph += 1) {
                System.out.println(ph + "\t" + histogram[ph]);
            }

            System.out.println();
        } finally {
            dataFile.close();
        }
    }
}