# blob: 1ad50abaed7163ef049360f4bd3269d43fca5649 [file] [log] [blame]
#
# Copyright (C) 2002, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: sent.txt
#
# ICU Sentence Break Rules
# See Unicode Technical Report #29.
# These rules are based on the proposed draft dated 2002-08-09
#
#
# Character categories as defined in TR 29
#
# Each variable below names a set of characters; the rules further down are
# written purely in terms of these sets.
#
# Line/paragraph separators: CR (U+000D), LF (U+000A), NEL (U+0085),
# LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
$Sep = [\u000d \u000a \u0085 \u2028 \u2029];
# Characters whose General_Category is Cf (Format).
$Format = [[:Cf:]];
# White space, excluding the separator characters in $Sep.
$Sp = [[:Whitespace:] - $Sep];
$Lower = [[:Lowercase:]];
# Titlecase letters (Lt) are grouped together with the uppercase characters.
$Upper = [[:Lt:] [:Uppercase:]];
# All alphabetic characters plus a few explicitly listed ranges and U+05F3.
# $Lower and $Upper are not subtracted here, so this set overlaps both of them.
$OLetter = [[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3];
# The chars listed by number below are those with "Linebreak=QU"
# NOTE(review): [:Po:] (other punctuation) also contains the characters of
# $ATerm and $Term, so $Close overlaps $AnyTerm. TR 29 describes Close in terms
# of opening/closing punctuation and quotation marks -- confirm that [:Po:] is
# intended here rather than [:Ps:].
$Close = [[:Pe:] [:Po:] \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C
\u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];
# "." (U+002E FULL STOP). The rules below treat it specially: it does not end
# a sentence when a lower case word follows, or when an upper case letter
# follows it immediately.
$ATerm = [\u002e];
# The remaining terminators, e.g. "!" (U+0021), "?" (U+003F) and the
# ideographic full stop (U+3002).
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
\u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
\uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
# Any terminator, whether "." or one of the characters in $Term.
$AnyTerm = [$ATerm $Term];
# From Grapheme Cluster
# Non-spacing, enclosing and spacing combining marks, plus U+FF9E..U+FF9F.
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
#
# $SepSeq keeps together CRLF as a separator.
# It matches either a single $Sep character or CR immediately followed by LF.
#
$SepSeq = $Sep | \u000d\u000a;
# $InteriorChars are those that never cause a break.
# That is, every character that is neither a terminator nor a separator.
$InteriorChars = [^$AnyTerm $Sep];
# Sentence Break Rules 8, 9, 11
# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
# that do not cause a break for one exceptional reason or another.
#
# Shape of the match, in order:
#   1. any run of interior characters,
#   2. an optional terminator,
#   3. any run of closing punctuation, further terminators, format and extend characters,
#   4. any run of terminators, format characters, spaces and extend characters,
#   5. an optional trailing separator sequence.
# Every component is optional, so this expression can also match the empty string.
$EndSequence = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
($AnyTerm | $Format | $Sp | $Extend)* $SepSeq?;
# Rule 6 Matches a sentence fragment containing "." that should not cause a sentence break,
# because a lower case word follows the period.
# Between the "." and the lower case character, only characters that are in
# neither $OLetter nor $Upper may appear.
$LowerWordFollows = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;
# Rule 7. $UpperFollowsImmediately
# Matches a fragment containing a "." that should not cause a sentence break
# because an uppercase letter follows the period with no intervening spaces.
# Only format and extend characters may sit between the "." and the upper case letter.
$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;
# Put them all together.
# The forward rule: zero or more non-breaking "." fragments (Rules 6 and 7),
# followed by a normal sentence ending.
($LowerWordFollows | $UpperFollowsImmediately)* $EndSequence;
#
# Reverse Rules
#
# These mirror the forward definitions with their elements written in the
# opposite order, for matching while moving backwards through the text.
#
# $EndGorp is any single character that can appear in the tail of a sentence:
# a terminator, separator, closing punctuation, extend, format or space character.
$EndGorp = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
# A run of interior characters with optional runs of tail characters on both sides.
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp*;
# Mirror of $LowerWordFollows: the lower case character comes first, then the
# non-letter characters, then the "." and the interior characters.
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
# Mirror of $UpperFollowsImmediately (the name is abbreviated).
$ReverseUpperFollowsIm = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;
# The reverse rule itself.
# NOTE(review): the leading "!" is presumably the rule syntax's marker for a
# reverse rule -- confirm against the ICU break rule documentation for this version.
# The trailing ".?" allows one additional character of any kind.
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
# Disabled alternative reverse rule that would match any run of characters.
#! .*;