| /* |
| *********************************************************************** |
| * Copyright (C) 2005, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| *********************************************************************** |
| * |
| */ |
| |
| package com.ibm.icu.dev.tool.charsetdet.sbcs; |
| |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.InputStreamReader; |
| |
| /** |
| * @author emader |
| * |
| * TODO To change the template for this generated type comment go to |
| * Window - Preferences - Java - Code Style - Code Templates |
| */ |
| public class Checker implements NGramParser.NGramParserClient |
| { |
| private NGramList ngrams; |
| private int totalNGrams; |
| private int totalHits; |
| |
| private String language; |
| private String encoding; |
| |
| private int[] histogram; |
| |
| private static final int BUFFER_SIZE = 1024; |
| |
| private char[] buffer; |
| private int bufIndex; |
| private int bufMax; |
| |
| private NGramParser parser; |
| |
| /** |
| * TODO This should take cumulative percent and the name... |
| */ |
| public Checker(NGramList list, InputFile dataFile) |
| { |
| ngrams = list; |
| ngrams.setMapper(dataFile); |
| |
| language = languageName(dataFile.getFilename()); |
| encoding = dataFile.getEncoding(); |
| |
| buffer = new char[BUFFER_SIZE]; |
| parser = new NGramParser(this); |
| resetCounts(); |
| |
| histogram = new int[100]; |
| resetHistogram(); |
| } |
| |
| public void handleNGram(String key) |
| { |
| NGramList.NGram ngram = ngrams.get(key); |
| |
| totalNGrams += 1; |
| |
| if (ngram != null) { |
| totalHits += 1; |
| //ngram.incrementRefCount(); |
| } |
| } |
| |
| private void resetCounts() |
| { |
| bufIndex = 0; |
| totalNGrams = totalHits = 0; |
| } |
| |
| private void resetHistogram() |
| { |
| for(int i = 0; i < 100; i += 1) { |
| histogram[i] = 0; |
| } |
| |
| } |
| |
| private static void exceptionError(Exception e) |
| { |
| System.err.println("ioError: " + e.toString()); |
| } |
| |
| private static String languageName(String filename) |
| { |
| return filename.substring(0, filename.indexOf('.')); |
| } |
| |
| private boolean nextBuffer(InputFile inputFile) |
| { |
| try { |
| bufMax = inputFile.read(buffer); |
| } catch (Exception e) { |
| bufMax = -1; |
| exceptionError(e); |
| |
| return false; |
| } |
| |
| bufIndex = 0; |
| |
| return bufMax >= 0; |
| } |
| |
| private void parseBuffer() |
| { |
| resetCounts(); |
| parser.reset(); |
| parser.parse(); |
| } |
| |
| public char nextChar() |
| { |
| if (bufIndex >= bufMax) { |
| return 0; |
| } |
| |
| return buffer[bufIndex++]; |
| } |
| |
| public String getLanguage() |
| { |
| return language; |
| } |
| |
| public void setMapper(InputFile file) |
| { |
| ngrams.setMapper(file); |
| } |
| |
| public int checkBuffer(char[] theBuffer, int charCount) |
| { |
| buffer = theBuffer; |
| bufMax = charCount; |
| |
| parseBuffer(); |
| |
| return totalHits; |
| } |
| |
| public void check(InputFile dataFile) |
| { |
| int minHist = 101, maxHist = -1; |
| |
| dataFile.open(); |
| |
| String dataFilename = dataFile.getFilename(); |
| String fileEncoding = dataFile.getEncoding(); |
| |
| System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:"); |
| |
| setMapper(dataFile); |
| resetHistogram(); |
| |
| while (nextBuffer(dataFile)) { |
| parseBuffer(); |
| |
| double percentHits = (double) totalHits / totalNGrams * 100.0; |
| int ph = (int) percentHits; |
| |
| if (ph < minHist) { |
| minHist = ph; |
| } |
| |
| if (ph > maxHist) { |
| maxHist = ph; |
| } |
| |
| histogram[ph] += 1; |
| } |
| |
| for(int ph = minHist; ph <= maxHist; ph += 1) { |
| System.out.println(ph + "\t" + histogram[ph]); |
| } |
| |
| System.out.println(); |
| |
| dataFile.close(); |
| |
| return; |
| } |
| } |