| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 1999-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/17/99 aliu Creation. |
| ********************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_TRANSLITERATION |
| |
| #include "unicode/rep.h" |
| #include "unicode/uniset.h" |
| #include "rbt_pars.h" |
| #include "rbt_data.h" |
| #include "rbt_rule.h" |
| #include "rbt.h" |
| #include "mutex.h" |
| #include "umutex.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) |
| |
| static Replaceable *gLockedText = NULL; |
| |
| void RuleBasedTransliterator::_construct(const UnicodeString& rules, |
| UTransDirection direction, |
| UParseError& parseError, |
| UErrorCode& status) { |
| fData = 0; |
| isDataOwned = TRUE; |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| TransliteratorParser parser(status); |
| parser.parse(rules, direction, parseError, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| if (parser.idBlockVector.size() != 0 || |
| parser.compoundFilter != NULL || |
| parser.dataVector.size() == 0) { |
| status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT |
| return; |
| } |
| |
| fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
| setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
| } |
| |
| /** |
| * Constructs a new transliterator from the given rules. |
| * @param id the id for the transliterator. |
| * @param rules rules, separated by ';' |
| * @param direction either FORWARD or REVERSE. |
| * @param adoptedFilter the filter for this transliterator. |
| * @param parseError Struct to recieve information on position |
| * of error if an error is encountered |
| * @param status Output param set to success/failure code. |
| * @exception IllegalArgumentException if rules are malformed |
| * or direction is invalid. |
| */ |
| RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& id, |
| const UnicodeString& rules, |
| UTransDirection direction, |
| UnicodeFilter* adoptedFilter, |
| UParseError& parseError, |
| UErrorCode& status) : |
| Transliterator(id, adoptedFilter) { |
| _construct(rules, direction,parseError,status); |
| } |
| |
| /** |
| * Constructs a new transliterator from the given rules. |
| * @param id the id for the transliterator. |
| * @param rules rules, separated by ';' |
| * @param direction either FORWARD or REVERSE. |
| * @param adoptedFilter the filter for this transliterator. |
| * @param status Output param set to success/failure code. |
| * @exception IllegalArgumentException if rules are malformed |
| * or direction is invalid. |
| */ |
| /*RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& id, |
| const UnicodeString& rules, |
| UTransDirection direction, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status) : |
| Transliterator(id, adoptedFilter) { |
| UParseError parseError; |
| _construct(rules, direction,parseError, status); |
| }*/ |
| |
| /** |
| * Convenience constructor with no filter. |
| */ |
| /*RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& id, |
| const UnicodeString& rules, |
| UTransDirection direction, |
| UErrorCode& status) : |
| Transliterator(id, 0) { |
| UParseError parseError; |
| _construct(rules, direction,parseError, status); |
| }*/ |
| |
| /** |
| * Convenience constructor with no filter and FORWARD direction. |
| */ |
| /*RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& id, |
| const UnicodeString& rules, |
| UErrorCode& status) : |
| Transliterator(id, 0) { |
| UParseError parseError; |
| _construct(rules, UTRANS_FORWARD, parseError, status); |
| }*/ |
| |
| /** |
| * Convenience constructor with FORWARD direction. |
| */ |
| /*RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& id, |
| const UnicodeString& rules, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status) : |
| Transliterator(id, adoptedFilter) { |
| UParseError parseError; |
| _construct(rules, UTRANS_FORWARD,parseError, status); |
| }*/ |
| |
| RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
| const TransliterationRuleData* theData, |
| UnicodeFilter* adoptedFilter) : |
| Transliterator(id, adoptedFilter), |
| fData((TransliterationRuleData*)theData), // cast away const |
| isDataOwned(FALSE) { |
| setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
| } |
| |
| /** |
| * Internal constructor. |
| */ |
| RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
| TransliterationRuleData* theData, |
| UBool isDataAdopted) : |
| Transliterator(id, 0), |
| fData(theData), |
| isDataOwned(isDataAdopted) { |
| setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| RuleBasedTransliterator::RuleBasedTransliterator( |
| const RuleBasedTransliterator& other) : |
| Transliterator(other), fData(other.fData), |
| isDataOwned(other.isDataOwned) { |
| |
| // The data object may or may not be owned. If it is not owned we |
| // share it; it is invariant. If it is owned, it's still |
| // invariant, but we need to copy it to prevent double-deletion. |
| // If this becomes a performance issue (if people do a lot of RBT |
| // copying -- unlikely) we can reference count the data object. |
| |
| // Only do a deep copy if this is owned data, that is, data that |
| // will be later deleted. System transliterators contain |
| // non-owned data. |
| if (isDataOwned) { |
| fData = new TransliterationRuleData(*other.fData); |
| } |
| } |
| |
| /** |
| * Destructor. |
| */ |
| RuleBasedTransliterator::~RuleBasedTransliterator() { |
| // Delete the data object only if we own it. |
| if (isDataOwned) { |
| delete fData; |
| } |
| } |
| |
| RuleBasedTransliterator* |
| RuleBasedTransliterator::clone() const { |
| return new RuleBasedTransliterator(*this); |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| void |
| RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
| UBool isIncremental) const { |
| /* We keep contextStart and contextLimit fixed the entire time, |
| * relative to the text -- contextLimit may move numerically if |
| * text is inserted or removed. The start offset moves toward |
| * limit, with replacements happening under it. |
| * |
| * Example: rules 1. ab>x|y |
| * 2. yc>z |
| * |
| * |eabcd begin - no match, advance start |
| * e|abcd match rule 1 - change text & adjust start |
| * ex|ycd match rule 2 - change text & adjust start |
| * exz|d no match, advance start |
| * exzd| done |
| */ |
| |
| /* A rule like |
| * a>b|a |
| * creates an infinite loop. To prevent that, we put an arbitrary |
| * limit on the number of iterations that we take, one that is |
| * high enough that any reasonable rules are ok, but low enough to |
| * prevent a server from hanging. The limit is 16 times the |
| * number of characters n, unless n is so large that 16n exceeds a |
| * uint32_t. |
| */ |
| uint32_t loopCount = 0; |
| uint32_t loopLimit = index.limit - index.start; |
| if (loopLimit >= 0x10000000) { |
| loopLimit = 0xFFFFFFFF; |
| } else { |
| loopLimit <<= 4; |
| } |
| |
| // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent |
| // operations must be prevented. |
| // A Complication: compound transliterators can result in recursive entries to this |
| // function, sometimes with different "This" objects, always with the same text. |
| // Double-locking must be prevented in these cases. |
| // |
| |
| UBool lockedMutexAtThisLevel = FALSE; |
| |
| // Test whether this request is operating on the same text string as |
| // some other transliteration that is still in progress and holding the |
| // transliteration mutex. If so, do not lock the transliteration |
| // mutex again. |
| // |
| // gLockedText variable is protected by the global ICU mutex. |
| // Shared RBT data protected by transliteratorDataMutex. |
| // |
| // TODO(andy): Need a better scheme for handling this. |
| |
| static UMutex transliteratorDataMutex; |
| UBool needToLock; |
| { |
| Mutex m; |
| needToLock = (&text != gLockedText); |
| } |
| if (needToLock) { |
| umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. |
| Mutex m; |
| gLockedText = &text; |
| lockedMutexAtThisLevel = TRUE; |
| } |
| |
| // Check to make sure we don't dereference a null pointer. |
| if (fData != NULL) { |
| while (index.start < index.limit && |
| loopCount <= loopLimit && |
| fData->ruleSet.transliterate(text, index, isIncremental)) { |
| ++loopCount; |
| } |
| } |
| if (lockedMutexAtThisLevel) { |
| { |
| Mutex m; |
| gLockedText = NULL; |
| } |
| umtx_unlock(&transliteratorDataMutex); |
| } |
| } |
| |
| UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, |
| UBool escapeUnprintable) const { |
| return fData->ruleSet.toRules(rulesSource, escapeUnprintable); |
| } |
| |
| /** |
| * Implement Transliterator framework |
| */ |
| void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
| fData->ruleSet.getSourceTargetSet(result, FALSE); |
| } |
| |
| /** |
| * Override Transliterator framework |
| */ |
| UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { |
| return fData->ruleSet.getSourceTargetSet(result, TRUE); |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |