# blob: 732df1a1b52c4b7499ff7570303d0e7f1d41cee2 [file] [log] [blame]
# file: sent.txt Sentence Boundary Rules.
#
# Separators are line or paragraph ends that will attach to the end of sentences.
#
# $Sep:    the single-character separators: LF, CR, U+0085 (NEXT LINE),
#          U+2028 (LINE SEPARATOR) and U+2029 (PARAGRAPH SEPARATOR).
$Sep =[\n \r \u0085 \u2028 \u2029];
# $SepSeq: one separator as it is matched by the rules below: either a single
#          $Sep character or the two-character sequence CR LF.
$SepSeq = $Sep | \u000d\u000a;
# $Sp:     spacing characters: General_Category Zs (space separators) with
#          any members of $Sep removed.
$Sp = [[:Zs:] - $Sep];
# $ATerm contains ambiguous terminators: characters that may or may not terminate
# a sentence, depending on the context.
# $Term contains $ATerm plus all characters that unambiguously end sentences.
#
# $ATerm members: U+002E FULL STOP, U+0589 ARMENIAN FULL STOP,
#                 U+3001 IDEOGRAPHIC COMMA.
# NOTE(review): U+3001 is a comma, while U+3002 IDEOGRAPHIC FULL STOP is listed
#               in $Term below; confirm that U+3001 is really intended here.
#
$ATerm = [\u002e \u0589 \u3001]; # same as Terminal_Punctuation2 from TR29
# $Term adds, in order: U+0021 EXCLAMATION MARK, U+003F QUESTION MARK,
# U+037E GREEK QUESTION MARK, U+061F ARABIC QUESTION MARK,
# U+06D4 ARABIC FULL STOP, U+203C DOUBLE EXCLAMATION MARK, U+203D INTERROBANG,
# U+3002 IDEOGRAPHIC FULL STOP, U+2048 QUESTION EXCLAMATION MARK,
# U+2049 EXCLAMATION QUESTION MARK and U+0964 DEVANAGARI DANDA.
$Term = [$ATerm \u0021 \u003f \u037e \u061f \u06d4 \u203c \u203d
\u3002 \u2048 \u2049
\u0964]; # TODO: these (this line) not yet decided in TR29.
#
# Character classes used by the sentence rules.
#
# $Lower:     lowercase letters (Ll) plus modifier symbols (Sk).
$Lower = [[:Ll:] [:Sk:]];
# $Upper:     uppercase (Lu) and titlecase (Lt) letters.
$Upper = [[:Lu:] [:Lt:]];
# $NotLetter: everything that is neither a letter (any L category) nor a
#             member of $Term; the leading ^ negates the whole set.
$NotLetter = [^[:L:] $Term];
# $Open:      opening punctuation (Ps). Not referenced by any rule visible in
#             this file.
$Open = [:Ps:];
# $Close:     closing punctuation (Pe) plus the ASCII double and single quotes.
$Close = [[:Pe:] \" \'];
#
# Combining chars. Copied from UNIDATA/DerivedCoreProperties.txt
#
# $Extend is a hard-coded list of code points and ranges. BMP entries use
# \uXXXX escapes; the supplementary-plane entries at the end use \UXXXXXXXX.
# Several adjacent ranges are listed separately (e.g. \u0901-\u0902 \u0903),
# presumably mirroring the individual entries of the data file it was copied
# from. In the rules below $Extend is allowed after a terminator (see
# $EndSequence) and inside the word preceding an ambiguous terminator (see
# $UpperWordPrecedes).
#
$Extend =
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
#
# Forward Rules
#
# $EndSequence: the tail of a sentence: any run of non-terminator characters,
# one terminator, then any mix of closing punctuation, further terminators and
# combining marks, then optional spaces and at most one separator sequence.
$EndSequence = [^$Term]* $Term ($Close | $Term | $Extend)* $Sp* $SepSeq?;
# $LowerWordFollows: a segment ending in an ambiguous terminator that does NOT
# end the sentence because, after optional closers, spaces, a separator and
# any non-letter characters, the next letter is lowercase. The match extends
# through that lowercase character.
$LowerWordFollows = [^$Term]* $ATerm $Close* $Sp* $SepSeq? $NotLetter* $Lower;
# $UpperWordPrecedes: a segment ending in an ambiguous terminator that is
# directly preceded by an uppercase letter followed only by lowercase or
# combining characters. Such a segment is likewise treated as not ending the
# sentence.
$UpperWordPrecedes = [^$Term]* $Upper ($Lower | $Extend)* $ATerm $Close* $Sp* $SepSeq?;
# The forward sentence rule: any number of the non-terminating segments above,
# followed by a real end-of-sentence sequence.
($LowerWordFollows | $UpperWordPrecedes)* $EndSequence;
#
# In cases where the input text ends without a normal end-of-sentence sequence,
# this rule will match whatever text is there.
# It matches any run (possibly empty) of characters that are not in $Term.
#
[^$Term]*;
#
# Reverse Rules
#
# These patterns are written in the order the characters are encountered when
# moving backwards through the text, so each one mirrors a forward definition.
# The two rules prefixed with '!' are the reverse rules proper; the $Rev...
# and $Reverse... variables are only building blocks for the first of them.
#
# $RevEndSequence: non-terminators, then any mix of terminators, closers and
# combining marks, then more non-terminators.
$RevEndSequence = [^$Term]* ($Term | $Close | $Extend)* [^$Term]*;
# $ReverseLowerWordFollows: a lowercase character, then any closers, spaces,
# separators, combining marks or non-letters, then an ambiguous terminator and
# the non-terminators beyond it.
$ReverseLowerWordFollows = $Lower ($Close | $Sp | $Sep | $Extend | $NotLetter)* $ATerm [^$Term]*;
# $ReverseUpperWordPrecedes: an ambiguous terminator, then lowercase or
# combining characters, then an uppercase letter and the non-terminators
# beyond it.
$ReverseUpperWordPrecedes = $ATerm ($Lower | $Extend)* $Upper [^$Term]*;
# Main reverse rule: an optional reverse end sequence, any number of the
# non-terminating segments above, and an optional final terminator.
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperWordPrecedes)* $Term?;
# Second reverse rule: matches any single character.
!.;