| # Copyright (c) 2002-2005, International Business Machines Corporation and |
| # others. All Rights Reserved. |
| # |
| # word.txt Word Breaking Rules for ICU Rules Based Break Iterator. |
| # |
| # TODO: Shift this over to being based on the current default (non-Thai) |
| # word rules, including exact reverse rules. Postponed |
| # because of interactions with dictionary implementation. |
| |
| |
| # Character classes used by the word rules. All six are taken from the Unicode |
| # Word_Break property so that they are classified by one consistent scheme. |
| # $Numeric in particular must come from Word_Break: Line_Break is a separate |
| # classification meant for line wrapping and is not guaranteed to be disjoint |
| # from the other Word_Break classes used here (e.g. $MidNum). |
| $Katakana = [\p{Word_Break = Katakana}]; |
| $ALetter = [\p{Word_Break = ALetter}]; |
| $MidLetter = [\p{Word_Break = MidLetter}]; |
| $Numeric = [\p{Word_Break = Numeric}]; |
| $MidNum = [\p{Word_Break = MidNum}]; |
| $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
| |
| # Hiragana characters, selected with the short-form \p{Hiragana} property test. |
| # NOTE(review): presumably this resolves to the Hiragana script value - confirm |
| # against the UnicodeSet property-lookup rules. |
| $Hiragana = [\p{Hiragana}]; |
| |
| # Grapheme-cluster helper classes. |
| # $Control holds the control characters themselves, not their complement: the |
| # forward rule "[^$Control] $Extend*" negates it to mean "any character that is |
| # not a control, plus its trailing $Extend characters". Defining $Control with |
| # a leading ^ would invert that rule so it matched only control characters. |
| $Control = [\p{Grapheme_Cluster_Break = Control}]; |
| $Extend = [\p{Grapheme_Cluster_Break = Extend}]; |
| # "Ex" variants: one character of the named class followed by any number of |
| # $Extend characters, so that extending marks stay attached to their base. |
| $ALetterEx = $ALetter $Extend*; |
| $NumericEx = $Numeric $Extend*; |
| $MidLetterEx = $MidLetter $Extend*; |
| $MidNumEx = $MidNum $Extend*; |
| $ExtendNumLetEx = $ExtendNumLet $Extend*; |
| |
| |
| |
| # |
| # Thai Dictionary Related Rules. Identify runs that will be subdivided into words |
| # using the dictionary. |
| # |
| # Characters that form dictionary-segmented runs: four explicit code point |
| # ranges between U+0E01 and U+0E4E. Among the code points left out are U+0E2F |
| # and U+0E46, which get their own single-character sets below. |
| $dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English |
| # Single-character sets for U+0E2F and U+0E46. |
| $paiyannoi = [\u0e2f]; |
| $maiyamok = [\u0e46]; |
| # The fixed three-character sequence U+0E2F U+0E25 U+0E2F (the variable name |
| # suggests this is the Thai "etc." abbreviation). |
| $thai_etc = $paiyannoi \u0e25 $paiyannoi; |
| |
| |
| # A run of one or more $dictionary characters, optionally followed by |
| # $maiyamok, which may itself be preceded by one $paiyannoi. |
| $dictionary+ ($paiyannoi? $maiyamok)?; |
| # A $dictionary run ending in $paiyannoi. The part after "/" is trailing |
| # context that must be present but is not included in the match: either one |
| # character that is none of U+0E25, $maiyamok or $Extend, or U+0E25 followed |
| # by a character that is neither $paiyannoi nor $Extend. In effect $paiyannoi |
| # is attached to the run unless it is followed by $maiyamok or begins the |
| # $thai_etc sequence. |
| $dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]); |
| # The $thai_etc sequence is kept together as a unit of its own. |
| $thai_etc; |
| |
| |
| # |
| # Main rule for non-Thai text: a word is one or more adjacent clumps. |
| # |
| # $AlphaClump - letters, where at most one $MidLetterEx may sit between |
| # two neighbouring letters. |
| # $NumericClump - digits, where at most one $MidNumEx may sit between two |
| # neighbouring digits. |
| # A lone $ExtendNumLetEx also counts as a clump, so it can join letters and |
| # digits into a single word. |
| # |
| $AlphaClump = $ALetterEx ($MidLetterEx? $ALetterEx)*; |
| $NumericClump = $NumericEx ($MidNumEx? $NumericEx)*; |
| ($NumericClump | $ExtendNumLetEx | $AlphaClump)+; |
| |
| # |
| # Lesser rules |
| # |
| # Runs of $Hiragana characters, each with its trailing $Extend characters. |
| ($Hiragana $Extend*)*; |
| # Runs of $Katakana characters, each with its trailing $Extend characters. |
| ($Katakana $Extend*)*; |
| # Any single character outside the $Control set, together with the $Extend |
| # characters that follow it. |
| [^$Control] $Extend*; |
| # A carriage return immediately followed by a line feed is kept together. |
| \r\n; |
| # Fallback: any single character on its own. |
| .; |
| |
| # |
| # Reverse Rules. Back up over any of the chars that can group together. |
| # (Reverse rules do not need to be exact; they can back up a bit too far, |
| # but must back up at least enough.) |
| # |
| # Back up over any mix of the characters that the forward "Big Rule" can join |
| # (letters, digits, their mid characters, $ExtendNumLet and $Extend). |
| ! ( $ALetter | $MidLetter | $Numeric | $ExtendNumLet | $MidNum | $Extend )*; |
| # Back up over a run of $Hiragana and $Extend characters. |
| ! ($Hiragana | $Extend)*; |
| # Back up over a run of $Katakana and $Extend characters. |
| ! ($Katakana | $Extend)*; |
| # Back up over any $Extend characters and then one further character. |
| ! $Extend* .; |
| # The forward "\r\n" pair as seen when moving backwards: LF first, then CR. |
| ! \n\r; |
| |
| # Back up over a run of Thai characters handled by the dictionary rules. |
| # U+0E25 already lies inside $dictionary's first range, so listing it again is |
| # redundant but harmless. |
| ! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*; |