# blob: 01e80226801e7e4728c5410de13740f63358b5d8 [file] [log] [blame]
# ***************************************************************************
# *
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Han_Spacedhan.txt
# Generated from CLDR
#
# Only intended for internal use
# Make sure Han characters are normalized, including characters whose NFKD
# form contains them (see the tonfkd query below).
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
# where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!
# Step 1: apply NFKC, but only to characters matched by the filter set: the
# hard-coded list, plus everything in [:ideographic:] or [:sc=han:].
:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc;
# Step 2: run the fullwidth-halfwidth transform over the text.
:: fullwidth-halfwidth;
# Step 3 (start of the conversion rules): replace the CJK full stop with an
# ASCII period.
。 → '.';
# Terminal punctuation: explicit sentence/clause marks plus all closing
# (Pe) and final-quote (Pf) punctuation.
# NOTE(review): the unescaped ASCII marks after the escaped ones repeat
# members already in the set and add nothing; they may originally have been
# fullwidth forms lost in extraction. Confirm against upstream CLDR.
$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
# Initial punctuation: all opening (Ps) and initial-quote (Pi) punctuation.
# Wrapped in an outer [...] so the variable is a single set, like
# $terminalPunct, rather than a sequence of two sets. The union it
# contributes when referenced inside another set is unchanged.
$initialPunct = [[:Ps:][:Pi:]];
# Forward direction: add a space between any Han ideograph or terminal
# punctuation and a following letter, and between a letter (with any
# trailing marks) and a following Han ideograph or initial punctuation.
# The empty {} is the text being replaced: a zero-width position between
# the left and right context, so the rule only inserts the space.
# Ideographs are themselves letters, so adjacent Han characters are also
# separated by these rules.
[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;
# Reverse direction: remove spacing between ideographs and other letters.
# Only the braced space is replaced (with nothing); the ideograph, letter
# and optional marks around it are context and are left unchanged.
← [:Ideographic:] { ' ' } [:Letter:] ;
← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;