icu4j/tools/misc/src/com/ibm/icu/dev/tool/charsetdet/sbcs/NGramParser.java - external/github.com/unicode-org/icu - Git at Google

 /*
  ***********************************************************************
  * Copyright (C) 2005, International Business Machines Corporation and *
  * others. All Rights Reserved.                                        *
  ***********************************************************************
  *
  */

 package com.ibm.icu.dev.tool.charsetdet.sbcs;

 import com.ibm.icu.text.UnicodeSet;

 /**
  * Breaks a stream of characters into overlapping n-grams of
  * {@code N_GRAM_SIZE} characters and reports each one to an
  * {@link NGramParserClient}.
  *
  * <p>Input characters are classified by {@link #getCharClass(char)} and fed
  * through a small state machine: letters are added in lower case, the first
  * non-letter after the start or after a letter is added as a single space,
  * further non-letters in the same run add nothing, and ignorable characters
  * are skipped without changing state.
  *
  * @author emader
  */
 public class NGramParser
 {

     /**
      * Supplies the characters to parse and receives the resulting n-grams.
      */
     public interface NGramParserClient
     {
         /**
          * Returns the next input character. {@link NGramParser#parse()}
          * stops reading as soon as this returns 0.
          */
         char nextChar();

         /**
          * Receives one completed n-gram.
          *
          * @param key the n-gram, {@code N_GRAM_SIZE} characters long
          */
         void handleNGram(String key);
     }

     /*
      * Actions attached to state transitions
      */
     private static final int A_NULL = 0;   // add nothing
     private static final int A_ADDC = 1;   // add the lower-cased input character
     private static final int A_ADDS = 2;   // add a single space

     /*
      * Character classes
      */
     public static final int C_IGNORE = 0;
     public static final int C_LETTER = 1;
     public static final int C_PUNCT  = 2;

     /*
      * Parser states
      */
     private static final int S_START  = 0;
     private static final int S_LETTER = 1;
     private static final int S_PUNCT  = 2;

     /**
      * One cell of the transition table: the state to move to and the
      * action to perform while doing so.
      */
     static final class StateEntry
     {
         private final int newState;
         private final int action;

         StateEntry(int theState, int theAction)
         {
             newState = theState;
             action   = theAction;
         }

         public int getNewState()
         {
             return newState;
         }

         public int getAction()
         {
             return action;
         }
     }

     /*
      * Transition table, indexed as [current state][character class].
      * The entries never change, so a single table is shared by all parsers
      * instead of being rebuilt for every instance.
      */
     private static final StateEntry[][] STATE_TABLE = {
             // C_IGNORE                        C_LETTER                           C_PUNCT
             {new StateEntry(S_START,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
             {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
             {new StateEntry(S_PUNCT,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_NULL)}
     };

     /** Number of characters in each n-gram. */
     protected final int N_GRAM_SIZE = 3;

     /** The most recently added characters; only the first {@code letterCount} are live. */
     private final char[] letters = new char[N_GRAM_SIZE];
     private int letterCount;

     /** Characters classified as {@link #C_LETTER}. */
     private static final UnicodeSet LETTER_SET = new UnicodeSet("[:letter:]");

     private NGramParserClient client;

     /**
      * Creates a parser with an empty n-gram buffer.
      *
      * @param theClient the object that supplies characters and receives n-grams
      */
     public NGramParser(NGramParserClient theClient)
     {
         client = theClient;
         letterCount = 0;
     }

     /**
      * Replaces the client. Characters already buffered are kept; call
      * {@link #reset()} to discard them.
      *
      * @param theClient the new client
      */
     public void setClient(NGramParserClient theClient)
     {
         client = theClient;
     }

     // TODO Is this good enough, or are there other C_IGNORE characters?
     // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
     /**
      * Classifies a character for the state machine.
      *
      * @param ch the character to classify
      * @return {@link #C_IGNORE} for an apostrophe or U+FEFF,
      *         {@link #C_LETTER} for a member of the {@code [:letter:]} set,
      *         {@link #C_PUNCT} for everything else
      */
     public static int getCharClass(char ch)
     {
         if (ch == '\'' || ch == '\uFEFF') {
             return C_IGNORE;
         }

         if (LETTER_SET.contains(ch)) {
             return C_LETTER;
         }

         return C_PUNCT;
     }

     /**
      * Discards any characters buffered so far.
      */
     public void reset()
     {
         letterCount = 0;
     }

     /**
      * Appends a character to the n-gram buffer. Once the buffer holds
      * {@code N_GRAM_SIZE} characters the n-gram is passed to the client and
      * the oldest character is dropped, so consecutive n-grams overlap by
      * {@code N_GRAM_SIZE - 1} characters.
      *
      * @param letter the character to append
      */
     public void addLetter(char letter)
     {
         letters[letterCount++] = letter;

         if (letterCount >= N_GRAM_SIZE) {
             client.handleNGram(new String(letters));

             // Slide the window left by one, keeping the newest N_GRAM_SIZE - 1 characters.
             letterCount = N_GRAM_SIZE - 1;
             System.arraycopy(letters, 1, letters, 0, letterCount);
         }
     }

     /**
      * Reads characters from the client until {@code nextChar()} returns 0,
      * running each one through the state machine, and then adds one final
      * space. The state machine always begins in its start state, but
      * characters left in the buffer by earlier calls are not cleared here;
      * call {@link #reset()} first if that is not wanted.
      */
     public void parse()
     {
         char ch;
         int state = S_START;

         while ((ch = client.nextChar()) != 0) {
             StateEntry entry = STATE_TABLE[state][getCharClass(ch)];

             state = entry.getNewState();

             switch (entry.getAction())
             {
             case A_ADDC:
                 addLetter(Character.toLowerCase(ch));
                 break;

             case A_ADDS:
                 addLetter(' ');
                 break;

             case A_NULL:
             default:
                 break;
             }
         }

         // Terminate the text with a space so the last letters form an n-gram.
         addLetter(' ');
     }
 }
	/*
	***********************************************************************
	* Copyright (C) 2005, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	***********************************************************************
	*
	*/

	package com.ibm.icu.dev.tool.charsetdet.sbcs;

	import com.ibm.icu.text.UnicodeSet;

	/**
	* @author emader
	*
	* TODO To change the template for this generated type comment go to
	* Window - Preferences - Java - Code Style - Code Templates
	*/
	public class NGramParser
	{

	public interface NGramParserClient
	{
	char nextChar();
	void handleNGram(String key);
	}

	private static final int A_NULL = 0;
	private static final int A_ADDC = 1;
	private static final int A_ADDS = 2;

	/*
	* Character classes
	*/
	public static final int C_IGNORE = 0;
	public static final int C_LETTER = 1;
	public static final int C_PUNCT = 2;

	private static final int S_START = 0;
	private static final int S_LETTER = 1;
	private static final int S_PUNCT = 2;

	static final class StateEntry
	{
	private int newState;
	private int action;

	StateEntry(int theState, int theAction)
	{
	newState = theState;
	action = theAction;
	}

	public int getNewState()
	{
	return newState;
	}

	public int getAction()
	{
	return action;
	}
	}

	private StateEntry[][] stateTable = {
	{new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
	{new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
	{new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)}
	};

	protected final int N_GRAM_SIZE = 3;

	private char[] letters = new char[N_GRAM_SIZE];
	private int letterCount;

	private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");

	private NGramParserClient client;

	/**
	*
	*/
	public NGramParser(NGramParserClient theClient)
	{
	client = theClient;
	letterCount = 0;
	}

	public void setClient(NGramParserClient theClient)
	{
	client = theClient;
	}

	// TODO Is this good enough, or are there other C_IGNORE characters?
	// TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
	public static int getCharClass(char ch)
	{
	if (ch == '\'' \|\| ch == '\uFEFF') {
	return C_IGNORE;
	}

	if (letterSet.contains(ch)) {
	return C_LETTER;
	}

	return C_PUNCT;
	}

	public void reset()
	{
	letterCount = 0;
	}

	public void addLetter(char letter)
	{
	// somewhat clever stuff goes here...
	letters[letterCount++] = letter;

	if (letterCount >= N_GRAM_SIZE) {
	String key = new String(letters);

	client.handleNGram(key);

	letterCount = N_GRAM_SIZE - 1;
	for (int i = 0; i < letterCount; i += 1) {
	letters[i] = letters[i + 1];
	}
	}
	}

	public void parse()
	{
	char ch;
	int state = 0;

	// this is where the clever stuff goes...
	while ((ch = client.nextChar()) != 0) {
	int charClass = getCharClass(ch);
	StateEntry entry = stateTable[state][charClass];

	state = entry.getNewState();

	switch (entry.getAction())
	{
	case A_ADDC:
	addLetter(Character.toLowerCase(ch));
	break;

	case A_ADDS:
	addLetter(' ');
	break;

	case A_NULL:
	default:
	break;
	}
	}

	addLetter(' ');
	}
	}