# source/data/brkitr/line_th.txt - external/github.com/unicode-org/icu - Git at Google

 # Copyright (c) 2002-2005, International Business Machines Corporation and
 # others. All Rights Reserved.
 #
 #  file:  line_th.txt
 #
 #         Line Breaking Rules for ICU rules based break iteration.
 #         Implement default line breaking as defined by Unicode TR 14.
 #
 #         TODO:  Rework the rules not pertaining to Thai to be based on the
 #                default line break rules.  Not done yet because of interactions
 #                between exact reverse rules and the Dictionary code.
 #
 #                These rules, in their current form, do not conform to TR-14 for
 #                non-Thai breaks.
 #

 #
 #  Character classes: one set per Line_Break property value.
 #  Each definition is a self-contained set, so they are listed alphabetically.
 #
 $AI = [\p{LineBreak = AI}];
 $AL = [\p{LineBreak = AL}];
 $B2 = [\p{LineBreak = B2}];
 $BA = [\p{LineBreak = BA}];
 $BB = [\p{LineBreak = BB}];
 $BK = [\p{LineBreak = BK}];
 $CB = [\p{LineBreak = CB}];
 $CL = [\p{LineBreak = CL}];
 $CM = [\p{LineBreak = CM}];
 $CR = [\p{LineBreak = CR}];
 $EX = [\p{LineBreak = EX}];
 $GL = [\p{LineBreak = GL}];
 $HY = [\p{LineBreak = HY}];
 $ID = [\p{LineBreak = ID}];
 $IN = [\p{LineBreak = IN}];
 $IS = [\p{LineBreak = IS}];
 $LF = [\p{LineBreak = LF}];
 $NS = [\p{LineBreak = NS}];
 $NU = [\p{LineBreak = NU}];
 $OP = [\p{LineBreak = OP}];
 $PO = [\p{LineBreak = PO}];
 $PR = [\p{LineBreak = PR}];
 $QU = [\p{LineBreak = QU}];
 $SA = [\p{LineBreak = SA}];
 $SG = [\p{LineBreak = SG}];
 $SP = [\p{LineBreak = SP}];
 $SY = [\p{LineBreak = SY}];
 $XX = [\p{LineBreak = XX}];
 $ZW = [\p{LineBreak = ZW}];

 #  Classes referenced by the Korean syllable rules (LB 18b) further down.
 $H2 = [\p{LineBreak = H2}];
 $H3 = [\p{LineBreak = H3}];
 $JL = [\p{LineBreak = JL}];
 $JT = [\p{LineBreak = JT}];
 $JV = [\p{LineBreak = JV}];


 #  Characters whose Grapheme_Cluster_Break property is Extend.
 #  Written with the \p{...} property escape; the previous "\{p{" spelling
 #  was not a property escape and so did not select this property.
 $Extend = [\p{Grapheme_Cluster_Break = Extend}];


 #
 #  Thai Dictionary related definitions and rules
 #

 #  Single-character sets for U+0E2F and U+0E46.
 $paiyannoi = [\u0e2f];
 $maiyamok  = [\u0e46];

 #  The three-character sequence paiyannoi, U+0E25, paiyannoi.
 $thai_etc = $paiyannoi \u0e25 $paiyannoi;

 #  The four Thai ranges grouped as $dictionary.  Note that U+0E2F and U+0E46
 #  (defined above) fall in the gaps between these ranges.
 #  Original author's note: this rule breaks the iterator with mixed Thai and English.
 $dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];


 #
 #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width) and
 #                               SA  (South East Asian: Thai, Lao, Khmer) as $AL  (Alphabetic)
 #
 #  Alphabetic-like characters: SA minus the $dictionary characters, plus AI and AL.
 $ALPlus = [$SA - $dictionary] | $AI | $AL;

 #
 #  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
 #                     TODO:  This is going to produce some odd results, because of the non-combining
 #                            chars that are included in $CM.  Use $Extend instead, where possible.
 #
 #  Base classes that absorb any following $CM characters.
 $ALcm = $ALPlus $CM*;
 $IDcm = $ID $CM*;

 #  Base classes that absorb only following $Extend characters.
 $B2cm = $B2 $Extend*;
 $BAcm = $BA $Extend*;
 $BBcm = $BB $Extend*;
 $GLcm = $GL $Extend*;
 $HYcm = $HY $Extend*;
 $INcm = $IN $Extend*;
 $NScm = $NS $Extend*;
 $NUcm = $NU $Extend*;
 $OPcm = $OP $Extend*;
 $POcm = $PO $Extend*;
 $QUcm = $QU $Extend*;
 $SPcm = $SP $Extend*;


 #  New Lines.  Always break after, never break before.
 #              Rule LB 3
 #
 #  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
 #              Because we never break before these things, $Endings
 #              appears at the end of line break rule.
 #
 #  One hard line terminator: the CR LF pair, a lone CR, a lone LF, or a BK.
 $NLF              = $CR $LF | $CR | $LF | $BK;

 #  Optional tail of a line: spaces, then zero-width spaces, then at most one
 #  terminator.  Every part is optional, so this can match nothing.
 $Endings          = $SPcm* $ZW* $NLF?;

 #  Tail that must contain something after the spaces: either a terminator,
 #  or a single ZW optionally followed by a terminator.
 $EndingsMandatory = $SPcm* ($NLF | $ZW $NLF?);


 #
 #  Openings  Sequences that can precede Words, and that should not be separated from them.
 #            Rules LB 9, 10
 #
 #  Zero or more repetitions of: an optional QU with trailing spaces, then an
 #  OP with trailing spaces.  Because of the outer "*", this can match nothing.
 $Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;

 #
 #  Closings  Sequences that follow words, and that should not be separated from them,
 #            Rule LB 8, 11, 15
 #  Zero or more repetitions of one of:
 #    - optional spaces, then CL (optionally followed by spaces and an NS),
 #      EX, IS or SY, then any trailing $Extend characters;
 #    - a BA, an HY, an NS, or a maiyamok.
 #  Only the first alternative allows leading spaces.
 $Closings =  ($SPcm*( ($CL ($SPcm* $NScm)?  |  $EX  | $IS  | $SY) $Extend*) | $BAcm | $HYcm  | $NScm | $maiyamok)*;

 #
 #  Words.  Includes mixed Alpha-numerics.
 #          Rules 11a, 16, 17, 19, more or less.
 #
 #  Body of a number: a single ID, or a run of NU, AL, or IS-followed-by-NU.
 $NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
 #  Optional PR prefix, optional OP or HY, the body, optional CL, optional PO.
 $Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?;   # Fancy Number     18
 #  A single ID or a run of AL/NU, optionally followed by one PO or one IN.
 $Word           = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?));       # Alpha-numeric.   16, 17
 #  Any number of B2, each with optional trailing spaces; can match nothing.
 $Dashes         = (($B2cm $SPcm*)*);                                    # Dashes           11a
 #  A run of $dictionary characters, or the $thai_etc sequence.
 $ThaiRange      = $dictionary+ | $thai_etc;
 #  Any one of the four shapes above.
 $WordLikeThing  = $Number | $Word | $Dashes | $ThaiRange;


 #  Three alternatives:
 #    1. any number of BB, an optional $WordLikeThing, then any number of BA/HY/NS
 #       (every component is optional);
 #    2. one character that is not a control (Cc), BK, CR, LF, ZW, SP or GL,
 #       followed by $Extend characters;
 #    3. one character that is not BK, CR, LF, ZW, SP or GL.
 $Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) |     # Rule 15. Stuff sticks around words.
           [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |             # Allow characters that don't meet the
           [^$BK $CR $LF $ZW $SP $GL ];                              #  more elaborate definitions for WORD to be glued.


 #  One $Word15, optionally preceded by a GL or QU, then any number of
 #  further $Word15 items each joined to the previous one by a GL or QU.
 $GluedWord  = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
                                                                     # Rules 13, 14

 #
 #  The actual rules, a combination of everything defined above.
 #
 #  Rule 1: openings, a glued word, closings, an optional paiyannoi, and an
 #          ending that must contain a ZW or a line terminator.
 $Openings $GluedWord  $Closings $paiyannoi? $EndingsMandatory;
 #  Rule 2: the same without paiyannoi, with an ending that may be empty.
 $Openings $GluedWord  $Closings  $Endings;

 #  Rule 3: the match ends with paiyannoi.  The part after "/" is trailing
 #          context (RBBI look-ahead syntax; confirm against the ICU break-rule
 #          docs): either a character that is neither U+0E25 nor $Extend, or
 #          U+0E25 followed by a character that is neither paiyannoi nor $Extend.
 #          That context excludes a paiyannoi that starts the $thai_etc sequence.
 $Openings $GluedWord  $Closings $paiyannoi   /
                ([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);


  #"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
  #                       + "\u0e25[^$paiyannoi$_ignore_]);"

 #
 # LB 18b.  Do not break a Korean syllable
 #
 #  Sequences that start with an H2 or H3 character.
 $H2 $JV* $JT* $Extend*;
 $H3 $JT* $Extend*;

 #  Sequences that start with a run of JL, JV or JT characters.
 $JL+ $JV* $JT* $Extend*;
 $JV+ $JT* $Extend*;
 $JT+ $Extend*;

 #
 #  Reverse Rules.
 #
 #     Back up to a hard break or a space that will cause a boundary.
 #     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
 #     containing a space that may inhibit a break from occurring.
 #
 #  $SpaceGlue: either one ZW/CL/IS/NS/OP followed by $Extend* and an SP, or
 #  one or more ($Extend* SP) groups followed by an OP.
 #  NOTE(review): the elements look written in back-to-front text order
 #  (e.g. $Extend before $SP), presumably because reverse rules scan the text
 #  backwards -- confirm against the ICU break-rule docs.
 $SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($Extend* $SP)) | (($Extend* $SP)+ $OP);
 #  Any character that is not SP, BK, CR or LF.
 $ClumpingChars = [^$SP $BK $CR $LF];

 #  The reverse rule itself (marked by the leading "!").  It consumes two
 #  arbitrary characters, a run of clumping characters, any number of
 #  space-glue groups each followed by clumping characters, and finally either
 #  one arbitrary character or the pair LF CR.
 !. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
	# Copyright (c) 2002-2005, International Business Machines Corporation and
	# others. All Rights Reserved.
	#
	# file: line_th.txt
	#
	# Line Breaking Rules for ICU rules based break iteration.
	# Implement default line breaking as defined by Unicode TR 14.
	#
	# TODO: Rework the rules not pertaining to Thai to be based on the
	# default line break rules. Not done yet because of interactions
	# between exact reverse rules and the Dictionary code.
	#
	# These rules, in their current form, do not conform to TR-14 for
	# non-Thai breaks.
	#

	#
	# Character classes: one set per Line_Break property value.
	# Each definition is a self-contained set, so they are listed alphabetically.
	#
	# NOTE(review): these variables are also defined earlier in this file; this
	# second copy looks like an extraction artifact -- confirm whether it should
	# be removed.
	#
	$AI = [\p{LineBreak = AI}];
	$AL = [\p{LineBreak = AL}];
	$B2 = [\p{LineBreak = B2}];
	$BA = [\p{LineBreak = BA}];
	$BB = [\p{LineBreak = BB}];
	$BK = [\p{LineBreak = BK}];
	$CB = [\p{LineBreak = CB}];
	$CL = [\p{LineBreak = CL}];
	$CM = [\p{LineBreak = CM}];
	$CR = [\p{LineBreak = CR}];
	$EX = [\p{LineBreak = EX}];
	$GL = [\p{LineBreak = GL}];
	$HY = [\p{LineBreak = HY}];
	$ID = [\p{LineBreak = ID}];
	$IN = [\p{LineBreak = IN}];
	$IS = [\p{LineBreak = IS}];
	$LF = [\p{LineBreak = LF}];
	$NS = [\p{LineBreak = NS}];
	$NU = [\p{LineBreak = NU}];
	$OP = [\p{LineBreak = OP}];
	$PO = [\p{LineBreak = PO}];
	$PR = [\p{LineBreak = PR}];
	$QU = [\p{LineBreak = QU}];
	$SA = [\p{LineBreak = SA}];
	$SG = [\p{LineBreak = SG}];
	$SP = [\p{LineBreak = SP}];
	$SY = [\p{LineBreak = SY}];
	$XX = [\p{LineBreak = XX}];
	$ZW = [\p{LineBreak = ZW}];

	# Classes referenced by the Korean syllable rules (LB 18b) further down.
	$H2 = [\p{LineBreak = H2}];
	$H3 = [\p{LineBreak = H3}];
	$JL = [\p{LineBreak = JL}];
	$JT = [\p{LineBreak = JT}];
	$JV = [\p{LineBreak = JV}];


	# Characters whose Grapheme_Cluster_Break property is Extend.
	# Written with the \p{...} property escape; the previous "\{p{" spelling
	# was not a property escape and so did not select this property.
	$Extend = [\p{Grapheme_Cluster_Break = Extend}];


	#
	# Thai Dictionary related definitions and rules
	#

	# Single-character sets for U+0E2F and U+0E46.
	$paiyannoi = [\u0e2f];
	$maiyamok = [\u0e46];

	# The three-character sequence paiyannoi, U+0E25, paiyannoi.
	$thai_etc = $paiyannoi \u0e25 $paiyannoi;

	# The four Thai ranges grouped as $dictionary.  Note that U+0E2F and U+0E46
	# (defined above) fall in the gaps between these ranges.
	# Original author's note: this rule breaks the iterator with mixed Thai and English.
	$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];



	#
	# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and
	# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
	#
	# Alphabetic-like characters: AL, AI, and SA minus the $dictionary characters.
	# The alternation operator is a bare "|"; the backslash-escaped "\|" that
	# stood here would have matched a literal vertical bar instead.
	$ALPlus = $AL | $AI | [$SA - $dictionary];

	#
	# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
	# TODO: This is going to produce some odd results, because of the non-combining
	# chars that are included in $CM. Use $Extend instead, where possible.
	#
	# Base classes that absorb any following $CM characters.
	$ALcm = $ALPlus $CM*;
	$IDcm = $ID $CM*;

	# Base classes that absorb only following $Extend characters.
	$B2cm = $B2 $Extend*;
	$BAcm = $BA $Extend*;
	$BBcm = $BB $Extend*;
	$GLcm = $GL $Extend*;
	$HYcm = $HY $Extend*;
	$INcm = $IN $Extend*;
	$NScm = $NS $Extend*;
	$NUcm = $NU $Extend*;
	$OPcm = $OP $Extend*;
	$POcm = $PO $Extend*;
	$QUcm = $QU $Extend*;
	$SPcm = $SP $Extend*;


	# New Lines. Always break after, never break before.
	# Rule LB 3
	#
	# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
	# Because we never break before these things, $Endings
	# appears at the end of line break rule.
	#
	# One hard line terminator: a BK, a lone CR, a lone LF, or the CR LF pair.
	# (Alternation restored to a bare "|"; it had been escaped as "\|".)
	$NLF = $BK | $CR | $LF | $CR $LF;
	# Optional tail of a line: spaces, zero-width spaces, at most one terminator.
	# Every part is optional, so this can match nothing.
	$Endings = $SPcm* $ZW* $NLF?;
	# Tail that must contain a terminator, or a single ZW optionally followed
	# by a terminator, after any spaces.
	$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;


	#
	# Openings Sequences that can precede Words, and that should not be separated from them.
	# Rules LB 9, 10
	#
	# Zero or more repetitions of: an optional QU with trailing spaces, then an
	# OP with trailing spaces.  The "*" after each $SPcm had been dropped, which
	# made exactly one space mandatory after every QU and OP; it is restored to
	# match the intact definition earlier in this file.
	$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;

	#
	# Closings Sequences that follow words, and that should not be separated from them,
	# Rule LB 8, 11, 15
	# Zero or more repetitions of one of:
	#   - optional spaces, then CL (optionally followed by spaces and an NS),
	#     EX, IS or SY, then any trailing $Extend characters;
	#   - a BA, an HY, an NS, or a maiyamok.
	# Restored from the intact definition earlier in this file: the four "*"
	# quantifiers (including the outer repetition) had been dropped and every
	# "|" had been escaped as "\|".
	$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;

	#
	# Words. Includes mixed Alpha-numerics.
	# Rules 11a, 16, 17, 19, more or less.
	#
	# Restored from the intact definitions earlier in this file: "\|" is again
	# a bare "|" alternation, and $Dashes has its two "*" quantifiers back.

	# Body of a number: a single ID, or a run of NU, AL, or IS-followed-by-NU.
	$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
	# Optional PR prefix, optional OP or HY, the body, optional CL, optional PO.
	$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
	# A single ID or a run of AL/NU, optionally followed by one PO or one IN.
	$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)); # Alpha-numeric. 16, 17
	# Any number of B2, each with optional trailing spaces; can match nothing.
	$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
	# A run of $dictionary characters, or the $thai_etc sequence.
	$ThaiRange = $dictionary+ | $thai_etc;
	# Any one of the four shapes above.
	$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;




	# Three alternatives (the separators are bare "|"; they had been escaped
	# as "\|", which would match a literal vertical bar):
	#   1. any number of BB, an optional $WordLikeThing, then any number of
	#      BA/HY/NS (every component is optional);
	#   2. one character that is not a control (Cc), BK, CR, LF, ZW, SP or GL,
	#      followed by $Extend characters;
	#   3. one character that is not BK, CR, LF, ZW, SP or GL.
	$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
	[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
	[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD to be glued.


	# One $Word15, optionally preceded by a GL or QU, then any number of
	# further $Word15 items each joined to the previous one by a GL or QU.
	# ("\|" restored to the bare "|" alternation operator.)
	$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
	# Rules 13, 14

	#
	# The actual rules, a combination of everything defined above.
	#
	# Rule 1: openings, a glued word, closings, an optional paiyannoi, and an
	#         ending that must contain a ZW or a line terminator.
	$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
	# Rule 2: the same without paiyannoi, with an ending that may be empty.
	$Openings $GluedWord $Closings $Endings;

	# Rule 3: the match ends with paiyannoi.  The part after "/" is trailing
	#         context (RBBI look-ahead syntax; confirm against the ICU
	#         break-rule docs): either a character that is neither U+0E25 nor
	#         $Extend, or U+0E25 followed by a character that is neither
	#         paiyannoi nor $Extend.  The two context alternatives are joined
	#         by a bare "|"; it had been escaped as "\|".
	$Openings $GluedWord $Closings $paiyannoi /
	([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);


	#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
	# + "\u0e25[^$paiyannoi$_ignore_]);"

	#
	# LB 18b. Do not break a Korean syllable
	#
	# Sequences that start with an H2 or H3 character.
	$H2 $JV* $JT* $Extend*;
	$H3 $JT* $Extend*;

	# Sequences that start with a run of JL, JV or JT characters.
	$JL+ $JV* $JT* $Extend*;
	$JV+ $JT* $Extend*;
	$JT+ $Extend*;

	#
	# Reverse Rules.
	#
	# Back up to a hard break or a space that will cause a boundary.
	# Not all spaces cause line breaks. $SpaceGlue represents a sequence
	# containing a space that may inhibit a break from occurring.
	#
	# Restored from the intact definitions earlier in this file: "\|" is again
	# a bare "|" alternation, and the reverse rule has back the "*" after the
	# inner $ClumpingChars and after the ($SpaceGlue ...) group.

	# $SpaceGlue: either one ZW/CL/IS/NS/OP followed by $Extend* and an SP, or
	# one or more ($Extend* SP) groups followed by an OP.
	$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
	# Any character that is not SP, BK, CR or LF.
	$ClumpingChars = [^$SP $BK $CR $LF];

	# The reverse rule itself (marked by the leading "!").  It consumes two
	# arbitrary characters, a run of clumping characters, any number of
	# space-glue groups each followed by clumping characters, and finally
	# either one arbitrary character or the pair LF CR.
	!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);