source/data/brkitr/sent.txt - external/github.com/unicode-org/icu - Git at Google

 #
 #   Copyright (C) 2002, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
 #   file:  sent.txt
 #
 #   ICU Sentence Break Rules
 #      See Unicode Technical Report #29.
 #      These rules are based on the proposed draft dated 2002-08-09
 #


 #
 # Character categories as defined in TR 29
 #
 # Each $Name below is a Unicode set that the rules further down refer to.
 #
 # $Sep:     the line/paragraph separator code points U+000D, U+000A, U+0085, U+2028, U+2029.
 # $Format:  characters with General_Category Cf.
 # $Sp:      Whitespace characters that are not members of $Sep.
 # $Lower:   characters with the Lowercase property.
 # $Upper:   General_Category Lt characters plus characters with the Uppercase property.
 # $OLetter: Alphabetic characters plus the explicitly listed code points and ranges.
 $Sep     = [\u000d \u000a \u0085 \u2028 \u2029];
 $Format  = [[:Cf:]];
 $Sp      = [[:Whitespace:] - $Sep];
 $Lower   = [[:Lowercase:]];
 $Upper   = [[:Lt:] [:Uppercase:]];
 $OLetter = [[:Alphabetic:] \u02b9-\u02ba  \u02c2-\u02cf  \u02d2-\u02df  \u02e5-\u02ed  \u05f3];

                            #  $Close: General_Category Pe and Po characters plus quotation marks.
                            #  The chars listed by number below are those with "Linebreak=QU"
 $Close   = [[:Pe:] [:Po:]  \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C
                            \u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];

 # $ATerm:   the full stop, U+002E.
 # $Term:    the other sentence terminators, listed by code point (U+0021 '!', U+003F '?', ...).
 # $AnyTerm: the union of $ATerm and $Term.
 $ATerm = [\u002e];
 $Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
           \u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
           \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
 $AnyTerm = [$ATerm $Term];

 # From Grapheme Cluster
 # $Extend: General_Category Mn, Me and Mc characters plus U+FF9E..U+FF9F.
 $Extend   = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend

 #
 #  $SepSeq keeps together CRLF as a separator.
 #  It matches either a single $Sep character or the two-character sequence U+000D U+000A.
 #
 $SepSeq  = $Sep | \u000d\u000a;

 # $InteriorChars are those that never cause a break:
 # every character that is neither in $AnyTerm nor in $Sep.
 $InteriorChars = [^$AnyTerm $Sep];


 # Sentence Break Rules 8, 9, 11
 # $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
 #              that do not cause a break for one exceptional reason or another.
 #              In order, it consists of:
 #                - any number of interior characters,
 #                - an optional terminator,
 #                - any number of $Close, terminator, $Format or $Extend characters,
 #                - any number of terminator, $Format, $Sp or $Extend characters,
 #                - an optional separator sequence ($SepSeq).
 $EndSequence       = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
                                ($AnyTerm | $Format | $Sp | $Extend)*  $SepSeq?;

 # Rule 6   Matches a sentence fragment containing "." that should not cause a sentence break,
 #          because a lower case word follows the period.
 #          Between the period and the $Lower character, only characters that are
 #          neither $OLetter nor $Upper may appear.
 $LowerWordFollows  = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;


 # Rule 7.  $UpperFollowsImmediately
 #          Matches a fragment containing a "." that should not cause a sentence break
 #          because an uppercase letter follows the period with no intervening spaces.
 #          Only $Format and $Extend characters may sit between the period and the $Upper.
 $UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;

 # Put them all together.
 # The forward rule: any number of non-breaking fragments (Rule 6 or Rule 7)
 # followed by one $EndSequence.
 ($LowerWordFollows |  $UpperFollowsImmediately)*  $EndSequence;


 #
 #  Reverse Rules
 #
 #  These definitions list the elements of the forward definitions in the opposite order.
 #
 #  $EndGorp:                 any single character that can appear at the end of a sentence
 #                            (terminator, separator, close, extend, format or space).
 #  $RevEndSequence:          end characters, then interior characters, then end characters.
 #  $ReverseLowerWordFollows: $LowerWordFollows with its elements in reverse order.
 #  $ReverseUpperFollowsIm:   $UpperFollowsImmediately with its elements in reverse order.
 $EndGorp                  = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
 $RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp*;
 $ReverseLowerWordFollows  = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
 $ReverseUpperFollowsIm    = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;

 # The leading "!" presumably marks this as the reverse rule (it sits under the
 # "Reverse Rules" heading) - confirm against the RBBI rule syntax in use.
 # It matches an optional reversed end sequence, any number of reversed
 # non-breaking fragments, and then at most one further character of any kind.
 ! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
 # Disabled alternative: a reverse rule that matches any run of characters.
 #! .*;
	#
	# Copyright (C) 2002, International Business Machines Corporation and others.
	# All Rights Reserved.
	#
	# file: sent.txt
	#
	# ICU Sentence Break Rules
	# See Unicode Technical Report #29.
	# These rules are based on the proposed draft dated 2002-08-09
	#


	#
	# Character categories as defined in TR 29
	#
	# Each $Name below is a Unicode set that the rules further down refer to.
	#
	# $Sep:     the line/paragraph separator code points U+000D, U+000A, U+0085, U+2028, U+2029.
	# $Format:  characters with General_Category Cf.
	# $Sp:      Whitespace characters that are not members of $Sep.
	# $Lower:   characters with the Lowercase property.
	# $Upper:   General_Category Lt characters plus characters with the Uppercase property.
	# $OLetter: Alphabetic characters plus the explicitly listed code points and ranges.
	$Sep = [\u000d \u000a \u0085 \u2028 \u2029];
	$Format = [[:Cf:]];
	$Sp = [[:Whitespace:] - $Sep];
	$Lower = [[:Lowercase:]];
	$Upper = [[:Lt:] [:Uppercase:]];
	$OLetter = [[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3];

	# $Close: General_Category Pe and Po characters plus quotation marks.
	# The chars listed by number below are those with "Linebreak=QU"
	$Close = [[:Pe:] [:Po:] \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C
	\u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];

	# $ATerm:   the full stop, U+002E.
	# $Term:    the other sentence terminators, listed by code point (U+0021 '!', U+003F '?', ...).
	# $AnyTerm: the union of $ATerm and $Term.
	$ATerm = [\u002e];
	$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
	\u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
	\uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
	$AnyTerm = [$ATerm $Term];

	# From Grapheme Cluster
	# $Extend: General_Category Mn, Me and Mc characters plus U+FF9E..U+FF9F.
	$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend

	#
	# $SepSeq keeps together CRLF as a separator.
	# It matches either a single $Sep character or the two-character sequence U+000D U+000A.
	#
	$SepSeq = $Sep | \u000d\u000a;

	# $InteriorChars are those that never cause a break:
	# every character that is neither in $AnyTerm nor in $Sep.
	$InteriorChars = [^$AnyTerm $Sep];



	# Sentence Break Rules 8, 9, 11
	# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
	# that do not cause a break for one exceptional reason or another.
	# In order, it consists of:
	#   - any number of interior characters,
	#   - an optional terminator,
	#   - any number of $Close, terminator, $Format or $Extend characters,
	#   - any number of terminator, $Format, $Sp or $Extend characters,
	#   - an optional separator sequence ($SepSeq).
	$EndSequence = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
	($AnyTerm | $Format | $Sp | $Extend)* $SepSeq?;

	# Rule 6 Matches a sentence fragment containing "." that should not cause a sentence break,
	# because a lower case word follows the period.
	# Between the period and the $Lower character, only characters that are
	# neither $OLetter nor $Upper may appear.
	$LowerWordFollows = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;


	# Rule 7. $UpperFollowsImmediately
	# Matches a fragment containing a "." that should not cause a sentence break
	# because an uppercase letter follows the period with no intervening spaces.
	# Only $Format and $Extend characters may sit between the period and the $Upper.
	$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;

	# Put them all together.
	# The forward rule: any number of non-breaking fragments (Rule 6 or Rule 7)
	# followed by one $EndSequence.
	($LowerWordFollows | $UpperFollowsImmediately)* $EndSequence;


	#
	# Reverse Rules
	#
	# These definitions list the elements of the forward definitions in the opposite order.
	#
	# $EndGorp:                 any single character that can appear at the end of a sentence
	#                           (terminator, separator, close, extend, format or space).
	# $RevEndSequence:          end characters, then interior characters, then end characters.
	# $ReverseLowerWordFollows: $LowerWordFollows with its elements in reverse order.
	# $ReverseUpperFollowsIm:   $UpperFollowsImmediately with its elements in reverse order.
	$EndGorp = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
	$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp*;
	$ReverseLowerWordFollows = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
	$ReverseUpperFollowsIm = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;

	# The leading "!" presumably marks this as the reverse rule (it sits under the
	# "Reverse Rules" heading) - confirm against the RBBI rule syntax in use.
	# It matches an optional reversed end sequence, any number of reversed
	# non-breaking fragments, and then at most one further character of any kind.
	! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
	# Disabled alternative: a reverse rule that matches any run of characters.
	#! .*;