/*
 *******************************************************************************
 * Copyright (C) 2005 - 2010, International Business Machines Corporation and  *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
| package com.ibm.icu.text; |
| |
| /** |
| * class CharsetRecog_2022 part of the ICU charset detection imlementation. |
| * This is a superclass for the individual detectors for |
| * each of the detectable members of the ISO 2022 family |
| * of encodings. |
| * |
| * The separate classes are nested within this class. |
| */ |
| abstract class CharsetRecog_2022 extends CharsetRecognizer { |
| |
| |
| /** |
| * Matching function shared among the 2022 detectors JP, CN and KR |
| * Counts up the number of legal an unrecognized escape sequences in |
| * the sample of text, and computes a score based on the total number & |
| * the proportion that fit the encoding. |
| * |
| * |
| * @param text the byte buffer containing text to analyse |
| * @param textLen the size of the text in the byte. |
| * @param escapeSequences the byte escape sequences to test for. |
| * @return match quality, in the range of 0-100. |
| */ |
| int match(byte [] text, int textLen, byte [][] escapeSequences) { |
| int i, j; |
| int escN; |
| int hits = 0; |
| int misses = 0; |
| int shifts = 0; |
| int quality; |
| scanInput: |
| for (i=0; i<textLen; i++) { |
| if (text[i] == 0x1b) { |
| checkEscapes: |
| for (escN=0; escN<escapeSequences.length; escN++) { |
| byte [] seq = escapeSequences[escN]; |
| |
| if ((textLen - i) < seq.length) { |
| continue checkEscapes; |
| } |
| |
| for (j=1; j<seq.length; j++) { |
| if (seq[j] != text[i+j]) { |
| continue checkEscapes; |
| } |
| } |
| |
| hits++; |
| i += seq.length-1; |
| continue scanInput; |
| } |
| |
| misses++; |
| } |
| |
| if (text[i] == 0x0e || text[i] == 0x0f) { |
| // Shift in/out |
| shifts++; |
| } |
| } |
| |
| if (hits == 0) { |
| return 0; |
| } |
| |
| // |
| // Initial quality is based on relative proportion of recongized vs. |
| // unrecognized escape sequences. |
| // All good: quality = 100; |
| // half or less good: quality = 0; |
| // linear inbetween. |
| quality = (100*hits - 100*misses) / (hits + misses); |
| |
| // Back off quality if there were too few escape sequences seen. |
| // Include shifts in this computation, so that KR does not get penalized |
| // for having only a single Escape sequence, but many shifts. |
| if (hits+shifts < 5) { |
| quality -= (5-(hits+shifts))*10; |
| } |
| |
| if (quality < 0) { |
| quality = 0; |
| } |
| return quality; |
| } |
| |
| |
| |
| |
| static class CharsetRecog_2022JP extends CharsetRecog_2022 { |
| private byte [] [] escapeSequences = { |
| {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 |
| {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 |
| {0x1b, 0x24, 0x40}, // JIS C 6226-1978 |
| {0x1b, 0x24, 0x41}, // GB 2312-80 |
| {0x1b, 0x24, 0x42}, // JIS X 208-1983 |
| {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997 |
| {0x1b, 0x28, 0x42}, // ASCII |
| {0x1b, 0x28, 0x48}, // JIS-Roman |
| {0x1b, 0x28, 0x49}, // Half-width katakana |
| {0x1b, 0x28, 0x4a}, // JIS-Roman |
| {0x1b, 0x2e, 0x41}, // ISO 8859-1 |
| {0x1b, 0x2e, 0x46} // ISO 8859-7 |
| }; |
| |
| String getName() { |
| return "ISO-2022-JP"; |
| } |
| |
| int match(CharsetDetector det) { |
| return match(det.fInputBytes, det.fInputLen, escapeSequences); |
| } |
| } |
| |
| static class CharsetRecog_2022KR extends CharsetRecog_2022 { |
| private byte [] [] escapeSequences = { |
| {0x1b, 0x24, 0x29, 0x43} |
| }; |
| |
| String getName() { |
| return "ISO-2022-KR"; |
| } |
| |
| int match(CharsetDetector det) { |
| return match(det.fInputBytes, det.fInputLen, escapeSequences); |
| } |
| |
| } |
| |
| static class CharsetRecog_2022CN extends CharsetRecog_2022 { |
| private byte [] [] escapeSequences = { |
| {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 |
| {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 |
| {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 |
| {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165 |
| {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 |
| {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 |
| {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 |
| {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 |
| {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 |
| {0x1b, 0x4e}, // SS2 |
| {0x1b, 0x4f}, // SS3 |
| }; |
| |
| String getName() { |
| return "ISO-2022-CN"; |
| } |
| |
| |
| int match(CharsetDetector det) { |
| return match(det.fInputBytes, det.fInputLen, escapeSequences); |
| } |
| } |
| |
| } |
| |