| # |
| # Copyright (C) 2002 - 2003, International Business Machines Corporation and others. |
| # All Rights Reserved. |
| # |
| # file: sent.txt |
| # |
| # ICU Sentence Break Rules |
| # See Unicode Standard Annex #29. |
| # These rules are based on the draft dated 2003-03-31 |
| # |
| |
| |
| # |
| # Character categories as defined in TR 29 |
| # |
| # Each "$name = [...];" line below defines a named character set that later |
| # definitions and rules refer to. Inside the brackets, "-" removes the set on |
| # its right from the set on its left. |
| # |
| # $Sep: the line and paragraph separators LF, CR, NEL (U+0085), |
| # LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029). |
| $Sep = [\u000a \u000d \u0085 \u2028 \u2029]; |
| # $Format: characters selected by the [:Format:] property expression. |
| $Format = [[:Format:]]; |
| # $Sp: white space, excluding the separators already covered by $Sep. |
| $Sp = [[:Whitespace:] - $Sep]; |
| # $Lower: characters with the Lowercase property. |
| $Lower = [[:Lowercase:]]; |
| # $Upper: titlecase letters together with characters having the Uppercase property. |
| $Upper = [[:TitleCase_Letter:] [:Uppercase:]]; |
| # $OLetter: alphabetic characters plus HEBREW PUNCTUATION GERESH, |
| # minus everything already classified as $Lower or $Upper. |
| $OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]]; |
| # $Numeric: characters whose LineBreak property value is Numeric. |
| $Numeric = [:LineBreak = Numeric:]; |
| |
| # $ATerm: the full stop ".". The rules in this file that mention $ATerm decide from |
| # the surrounding characters whether it actually ends a sentence. |
| $ATerm = [.]; |
| |
| # $Term: explicit list of the other sentence-terminating characters, starting with |
| # "!" (U+0021) and "?" (U+003F). Unlike $ATerm, no no-break rule visible in this |
| # file is written in terms of $Term. |
| $Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362 |
| \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047 |
| \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61]; |
| |
| # $Close: opening punctuation, closing punctuation and characters whose LineBreak |
| # value is Quotation, with HEBREW PUNCTUATION GERESH, $ATerm and $Term removed so |
| # that those characters never count as $Close. |
| $Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] - |
| [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]]; |
| |
| |
| |
| # Define extended forms of the character classes, |
| # incorporate grapheme cluster + format chars. |
| # |
| # Every *Ex definition has the same shape: one character of the base class, |
| # followed by any number of $Extend characters, followed by any number of |
| # $Format characters. Only $ATerm, $Numeric, $Upper and $Term get an extended |
| # form here. |
| |
| # $Extend: characters with Grapheme_Extend = TRUE. |
| $Extend = [[:Grapheme_Extend = TRUE:]]; |
| $ATermEx = $ATerm $Extend* $Format*; |
| $NumericEx = $Numeric $Extend* $Format*; |
| $UpperEx = $Upper $Extend* $Format*; |
| $TermEx = $Term $Extend* $Format*; |
| |
| # |
| # $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster) |
| # It matches either a single $Sep character or the two-character sequence |
| # CR (U+000D) followed by LF (U+000A). |
| # |
| $SepSeq = $Sep | \u000d\u000a; |
| |
| # $InteriorChars are those that never trigger a following break: |
| # every character that is not in $Term, $ATerm or $Sep. |
| $InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars |
| |
| |
| # Rule 6. $NumberFollows matches an ATerm (.) that does not cause a break because |
| # a number immediately follows it: any run of interior characters, then an |
| # extended ATerm, then an extended Numeric. |
| $NumberFollows = $InteriorChars* $ATermEx $NumericEx; |
| |
| |
| # Rule 7. $UppersSurround matches a no-break sentence fragment containing a "." |
| # surrounded by Uppers: any run of interior characters, then an extended Upper, |
| # an extended ATerm and another extended Upper. |
| $UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx; |
| |
| # Rule 8. $LowerWordFollows matches a sentence fragment containing "." that should |
| # not cause a sentence break, because a lower case word follows the period. |
| # After the extended ATerm it accepts any $Close characters, any $Sp characters, |
| # then any characters that are not a letter ($OLetter, $Upper, $Lower) or a $Sep, |
| # and finally requires a $Lower character. |
| $LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower; |
| |
| # Rules 3, 9, 10, 11 |
| # $EndSequence matches a simple sentence, or the trailing part of a complex sentence, |
| # where a simple sentence contains no interior "."s. |
| # It has two alternatives: |
| # 1. interior characters, one extended $Term or $ATerm, then any $Close |
| # characters, any $Sp characters and an optional $SepSeq; |
| # 2. interior characters followed by an optional $SepSeq, with no terminator |
| # at all. Every part of this alternative is optional. |
| $EndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? | |
| $InteriorChars* $SepSeq?; |
| |
| |
| |
| # Put them all together: the forward rule is any number of no-break fragments |
| # ($NumberFollows, $UppersSurround or $LowerWordFollows) followed by one $EndSequence. |
| ($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence; |
| |
| |
| # |
| # Reverse Rules |
| # |
| # The definitions below describe the forward fragments with their elements |
| # written in the opposite order. The rule prefixed with "!" is the reverse rule. |
| # |
| # $EndGorp: any single character that can appear at or after the end of a sentence |
| # ($Term, $ATerm, $Sep, $Close, $Extend, $Format or $Sp). |
| $EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp); |
| # $RevEndSequence: either end-of-sentence characters, interior characters and more |
| # end-of-sentence characters (each run may be empty), or a $Sep followed by any |
| # characters that are neither $ATerm nor $Term. |
| $RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*; |
| # Reverse-order counterparts of $LowerWordFollows, $UppersSurround and $NumberFollows. |
| # The last two allow $Format and $Extend characters after each $Upper / $Numeric / $ATerm. |
| $ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*; |
| $ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*; |
| $ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*; |
| |
| # Reverse rule: one $RevEndSequence, then any number of reversed no-break fragments, |
| # then optionally one more character of any kind. |
| ! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?; |
| # Commented-out alternative reverse rule that would match any run of characters: |
| #! .*; |
| |