source/data/brkitr/word.txt - external/github.com/unicode-org/icu - Git at Google

 #
 #   Copyright (C) 2002, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
 #   file:  word.txt
 #
 #   ICU Word Break Rules
 #      See Unicode Technical Report #29.
 #      These rules are based on the proposed draft dated 2002-08-06
 #


 ####################################################################################
 #
 #  Definitions imported from Line Break Rules.
 #
 ####################################################################################
 # Code point ranges treated as numeric.  Every entry is a first-last range;
 # the last one lies above U+FFFF and so uses the eight-digit \U escape.
 $Numeric = [
     \u0030-\u0039  \u0660-\u0669  \u06F0-\u06F9  \u0966-\u096F
     \u09E6-\u09EF  \u0A66-\u0A6F  \u0AE6-\u0AEF  \u0B66-\u0B6F
     \u0BE7-\u0BEF  \u0C66-\u0C6F  \u0CE6-\u0CEF  \u0D66-\u0D6F
     \u0E50-\u0E59  \u0ED0-\u0ED9  \u0F20-\u0F29  \u1040-\u1049
     \u1369-\u1371  \u17E0-\u17E9  \u1810-\u1819  \U0001D7CE-\U0001D7FF
 ];


 ####################################################################################
 #
 #  Definitions imported from Character Break Rules.
 #
 ####################################################################################
 #
 #  Character Class Definitions.
 #    The names are those from TR29.
 #
 # $Control - general categories Zl, Zp, Cc and Cf.
 $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];

 # $Extend - general categories Mn, Me and Mc, plus FF9E..FF9F
 #           (Other_Grapheme_Extend).
 #   Mc was part of Extend in earlier versions of TR29 and is kept here for
 #   the time being so that existing tests do not break.  The definition
 #   without Mc would be:
 #       $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f];
 $Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];


 ####################################################################################
 #
 #  Word Break Rules.    Definitions and Rules specific to word break begin Here.
 #
 ####################################################################################

 # Kana classes.  $Katakana adds \u30fc, \uff70 and \uff9e-\uff9f to the Kana script.
 $Katakana = [[:Kana:] \u30fc \uff70 \uff9e-\uff9f];
 $Hiragana = [[:Hira:]];

 # $Letter is a set difference: Alphabetic plus a few extra ranges, minus
 # ideographs, Thai, Lao and both kana classes.
 $Letter = [[[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
            [[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana]];

 $Format = [[:Cf:]];

 # Characters that may sit between two letters ($MidLetter), between two
 # letters or two digits ($MidNumLet), or between two digits ($MidNum).
 $MidLetter = [\u0027 \u00ad \u05f4 \u2019];
 $MidNumLet = [\u002e \u003a];

 # $MidNum comes from the Line Break class IS - Numeric Separator (Infix),
 #     $IS = [\u002c \u002e \u003a \u003b \u0589];
 # without \u002e and \u003a, which are in $MidNumLet.
 $MidNum = [\u002c \u003b \u0589];

 #
 #  "Extended" classes: a base character plus any trailing combining ($Extend)
 #  characters.  The Mid* classes, which can appear in the interior of a word
 #  only, additionally take trailing $Format characters.
 #
 $LetterEx     = $Letter $Extend*;
 $NumericEx    = $Numeric $Extend*;
 $MidLetterExF = $MidLetter $Extend* $Format*;
 $MidNumLetExF = $MidNumLet $Extend* $Format*;
 $MidNumExF    = $MidNum $Extend* $Format*;


 #
 #  Numbers.  Rules 6, 9, 10 from the TR.
 #     A run of $NumericEx, where neighbouring elements may be joined by a
 #     single $MidNum or $MidNumLet character.  A match is tagged {100}.
 #
 $NumberSequence = $NumericEx ( $Format* ($MidNumExF | $MidNumLetExF)? $NumericEx )*;
 $NumberSequence {100};

 #
 #  Words.  Alpha-numerics.  Rules 3 - 10
 #     - must include at least one letter.
 #     - may include both letters and numbers.
 #     - may include MidLetter, MidNum and MidNumLet punctuation.
 #     A match is tagged {200}.
 #
 $LetterSequence = $LetterEx ( $Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx )*;
 $NumberSequence? $LetterSequence ( $NumberSequence | $LetterSequence )* {200};

 #
 #  Hiragana and Katakana, both tagged {300}.
 #     A Hiragana character (with its $Extend characters) is matched on its
 #     own; consecutive Katakana characters are matched as one run.
 #
 $Hiragana $Extend* {300};
 $Katakana $Extend* ( $Format* $Katakana $Extend* )* {300};

 #
 #  Ideographic Characters.  Stand by themselves as words.
 #     Each ideograph, together with any $Extend characters that follow it,
 #     is one match, tagged {400}.
 #
 [:IDEOGRAPHIC:] $Extend* {400};

 #
 #  Everything Else, with no tag.
 #                   Non-Control chars combine with $Extend (combining) chars.
 #                   Controls are returned by themselves, except that the
 #                   two-character sequence \r\n is matched as a unit.
 #
 [^$Control] $Extend*;
 \r\n;
 .;

 #
 #  Reverse Rules.   Back up over any of the chars that can group together.
 #                   (Reverse rules do not need to be exact; they can back up too far,
 #                   but must back up at least enough, and must stop on a boundary.)
 #

 # NonStarters are the set of all characters that can appear at the 2nd - nth position of
 #    a word.   (They may also be the first.)   The reverse rule skips over these, until it
 #    reaches something that can only be the start (and probably only) char in a "word".
 #    A space or punctuation meets the test.
 #    NOTE(review): \u000a is presumably listed because it is the second character of
 #    the \r\n forward rule - confirm.
 #
 $NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a];

 # The leading "!" marks the reverse rule: any run of NonStarters, then one more character.
 ! $NonStarters* .;
	#
	# Copyright (C) 2002, International Business Machines Corporation and others.
	# All Rights Reserved.
	#
	# file: word.txt
	#
	# ICU Word Break Rules
	# See Unicode Technical Report #29.
	# These rules are based on the proposed draft dated 2002-08-06
	#



	####################################################################################
	#
	# Definitions imported from Line Break Rules.
	#
	####################################################################################
	# Code point ranges treated as numeric.  Every entry is a first-last range;
	# the last one lies above U+FFFF and so uses the eight-digit \U escape.
	# NOTE(review): $Numeric is already defined near the top of this file with
	# the same ranges; confirm whether this second copy of the rules is intended.
	$Numeric = [
	    \u0030-\u0039  \u0660-\u0669  \u06F0-\u06F9  \u0966-\u096F
	    \u09E6-\u09EF  \u0A66-\u0A6F  \u0AE6-\u0AEF  \u0B66-\u0B6F
	    \u0BE7-\u0BEF  \u0C66-\u0C6F  \u0CE6-\u0CEF  \u0D66-\u0D6F
	    \u0E50-\u0E59  \u0ED0-\u0ED9  \u0F20-\u0F29  \u1040-\u1049
	    \u1369-\u1371  \u17E0-\u17E9  \u1810-\u1819  \U0001D7CE-\U0001D7FF
	];



	####################################################################################
	#
	# Definitions imported from Character Break Rules.
	#
	####################################################################################
	#
	# Character Class Definitions.
	# The names are those from TR29.
	#
	# $Control - general categories Zl, Zp, Cc and Cf.
	$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];

	# $Extend - general categories Mn, Me and Mc, plus FF9E..FF9F
	#           (Other_Grapheme_Extend).
	#   Mc was part of Extend in earlier versions of TR29 and is kept here for
	#   the time being so that existing tests do not break.  The definition
	#   without Mc would be:
	#       $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f];
	$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];




	####################################################################################
	#
	# Word Break Rules. Definitions and Rules specific to word break begin Here.
	#
	####################################################################################

	# Kana classes.  $Katakana adds \u30fc, \uff70 and \uff9e-\uff9f to the Kana script.
	$Katakana = [[:Kana:] \u30fc \uff70 \uff9e-\uff9f];
	$Hiragana = [[:Hira:]];

	# $Letter is a set difference: Alphabetic plus a few extra ranges, minus
	# ideographs, Thai, Lao and both kana classes.
	$Letter = [[[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
	           [[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana]];

	$Format = [[:Cf:]];

	# Characters that may sit between two letters ($MidLetter), between two
	# letters or two digits ($MidNumLet), or between two digits ($MidNum).
	$MidLetter = [\u0027 \u00ad \u05f4 \u2019];
	$MidNumLet = [\u002e \u003a];

	# $MidNum comes from the Line Break class IS - Numeric Separator (Infix),
	#     $IS = [\u002c \u002e \u003a \u003b \u0589];
	# without \u002e and \u003a, which are in $MidNumLet.
	$MidNum = [\u002c \u003b \u0589];

	#
	# "Extended" classes: a base character plus any trailing combining ($Extend)
	# characters.  The Mid* classes, which can appear in the interior of a word
	# only, additionally take trailing $Format characters.
	#
	$LetterEx     = $Letter $Extend*;
	$NumericEx    = $Numeric $Extend*;
	$MidLetterExF = $MidLetter $Extend* $Format*;
	$MidNumLetExF = $MidNumLet $Extend* $Format*;
	$MidNumExF    = $MidNum $Extend* $Format*;


	#
	# Numbers. Rules 6, 9, 10 from the TR.
	#    A run of $NumericEx, where neighbouring elements may be joined by a
	#    single $MidNum or $MidNumLet character.  A match is tagged {100}.
	#    The two Mid* classes are alternatives, so the bar between them must be
	#    the unescaped alternation operator.
	#
	$NumberSequence = $NumericEx ($Format* ($MidNumExF | $MidNumLetExF)? $NumericEx)*;
	$NumberSequence {100};

	#
	# Words. Alpha-numerics. Rules 3 - 10
	#    - must include at least one letter.
	#    - may include both letters and numbers.
	#    - may include MidLetter, MidNum and MidNumLet punctuation.
	#    A match is tagged {200}.
	#    Each bar below is the unescaped alternation operator.
	#
	$LetterSequence = $LetterEx ($Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx)*;
	$NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};

	#
	# Hiragana and Katakana, both tagged {300}.
	#    A Hiragana character (with its $Extend characters) is matched on its
	#    own.  A Katakana match is one Katakana character followed by zero or
	#    more further Katakana characters, each with optional $Extend characters
	#    after it and optional $Format characters before it.
	#
	$Hiragana $Extend* {300};
	$Katakana $Extend* ($Format* $Katakana $Extend*)* {300};

	#
	# Ideographic Characters. Stand by themselves as words.
	#    Each ideograph, together with any $Extend characters that follow it,
	#    is one match, tagged {400}.
	#
	[:IDEOGRAPHIC:] $Extend* {400};

	#
	# Everything Else, with no tag.
	# Non-Control chars combine with $Extend (combining) chars.
	# Controls are returned by themselves, except that the two-character
	# sequence \r\n is matched as a unit.
	#
	[^$Control] $Extend*;
	\r\n;
	.;

	#
	# Reverse Rules. Back up over any of the chars that can group together.
	# (Reverse rules do not need to be exact; they can back up too far,
	# but must back up at least enough, and must stop on a boundary.)
	#

	# NonStarters are the set of all characters that can appear at the 2nd - nth position of
	# a word. (They may also be the first.) The reverse rule skips over these, until it
	# reaches something that can only be the start (and probably only) char in a "word".
	# A space or punctuation meets the test.
	# NOTE(review): \u000a is presumably listed because it is the second character of
	# the \r\n forward rule - confirm.
	#
	$NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a];

	# The leading "!" marks the reverse rule: any run of NonStarters, then one more character.
	! $NonStarters* .;