# This includes the previously applied # rhbz1646703-icu4c-ICU-20246-integer-overflow.patch # on 63.1 that was also applied by ICU to 63.2 # Omitted are the Japanese Reiwa source/test/intltest/incaltst.* related # changes, i.e. still included, not backed out. # Also omitted are changes that would identify as ICU 63.1 instead of 63.2 # (configure, readme, icu version, package version, data version, ...) as it # would confuse the installer or pkgconfig or possibly a mismatch with the # included binary icu/source/data/in/icudt63l.dat diff -urp icu4c-63_2/icu/source/common/characterproperties.cpp icu4c-63_1/icu/source/common/characterproperties.cpp --- icu4c-63_2/icu/source/common/characterproperties.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/characterproperties.cpp 2018-10-02 00:39:56.000000000 +0200 @@ -23,9 +23,6 @@ #include "umutex.h" #include "uprops.h" -using icu::LocalPointer; -using icu::Normalizer2Factory; -using icu::Normalizer2Impl; using icu::UInitOnce; using icu::UnicodeSet; @@ -33,13 +30,11 @@ namespace { UBool U_CALLCONV characterproperties_cleanup(); -constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START; - struct Inclusion { UnicodeSet *fSet; UInitOnce fInitOnce; }; -Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions() +Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; @@ -85,22 +80,35 @@ UBool U_CALLCONV characterproperties_cle return TRUE; } -void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) { +} // namespace + +U_NAMESPACE_BEGIN + +/* +Reduce excessive reallocation, and make it easier to detect initialization problems. +Usually you don't see smaller sets than this for Unicode 5.0. +*/ +constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; + +void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { // This function is invoked only via umtx_initOnce(). + // This function is a friend of class UnicodeSet. + U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); if (src == UPROPS_SRC_NONE) { errorCode = U_INTERNAL_PROGRAM_ERROR; return; } - U_ASSERT(gInclusions[src].fSet == nullptr); + UnicodeSet * &incl = gInclusions[src].fSet; + U_ASSERT(incl == nullptr); - LocalPointer incl(new UnicodeSet()); - if (incl.isNull()) { + incl = new UnicodeSet(); + if (incl == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } USetAdder sa = { - (USet *)incl.getAlias(), + (USet *)incl, _set_add, _set_addRange, _set_addString, @@ -108,6 +116,7 @@ void U_CALLCONV initInclusion(UPropertyS nullptr // don't need removeRange() }; + incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &errorCode); @@ -174,15 +183,12 @@ void U_CALLCONV initInclusion(UPropertyS } if (U_FAILURE(errorCode)) { + delete incl; + incl = nullptr; return; } - if (incl->isBogus()) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - // Compact for caching. + // Compact for caching incl->compact(); - gInclusions[src].fSet = incl.orphan(); ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); } @@ -193,66 +199,15 @@ const UnicodeSet *getInclusionsForSource return nullptr; } Inclusion &i = gInclusions[src]; - umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode); + umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); return i.fSet; } -void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) { - // This function is invoked only via umtx_initOnce(). - U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT); - int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; - U_ASSERT(gInclusions[inclIndex].fSet == nullptr); - UPropertySource src = uprops_getSource(prop); - const UnicodeSet *incl = getInclusionsForSource(src, errorCode); - if (U_FAILURE(errorCode)) { - return; - } - - LocalPointer intPropIncl(new UnicodeSet(0, 0)); - if (intPropIncl.isNull()) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - int32_t numRanges = incl->getRangeCount(); - int32_t prevValue = 0; - for (int32_t i = 0; i < numRanges; ++i) { - UChar32 rangeEnd = incl->getRangeEnd(i); - for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) { - // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. - int32_t value = u_getIntPropertyValue(c, prop); - if (value != prevValue) { - intPropIncl->add(c); - prevValue = value; - } - } - } - - if (intPropIncl->isBogus()) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - // Compact for caching. - intPropIncl->compact(); - gInclusions[inclIndex].fSet = intPropIncl.orphan(); - ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); -} - -} // namespace - -U_NAMESPACE_BEGIN - const UnicodeSet *CharacterProperties::getInclusionsForProperty( UProperty prop, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } - if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { - int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; - Inclusion &i = gInclusions[inclIndex]; - umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode); - return i.fSet; - } else { - UPropertySource src = uprops_getSource(prop); - return getInclusionsForSource(src, errorCode); - } + UPropertySource src = uprops_getSource(prop); + return getInclusionsForSource(src, errorCode); } U_NAMESPACE_END @@ -261,7 +216,7 @@ namespace { UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } - LocalPointer set(new UnicodeSet()); + icu::LocalPointer set(new UnicodeSet()); if (set.isNull()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr; diff -urp icu4c-63_2/icu/source/common/ucptrie.cpp icu4c-63_1/icu/source/common/ucptrie.cpp --- icu4c-63_2/icu/source/common/ucptrie.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/ucptrie.cpp 2018-10-02 00:39:56.000000000 +0200 @@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 int32_t prevI3Block = -1; int32_t prevBlock = -1; UChar32 c = start; - uint32_t trieValue, value; + uint32_t value; bool haveValue = false; do { int32_t i3Block; @@ -319,7 +319,6 @@ UChar32 getRange(const void *t, UChar32 return c - 1; } } else { - trieValue = trie->nullValue; value = nullValue; if (pValue != nullptr) { *pValue = nullValue; } haveValue = true; @@ -358,7 +357,6 @@ UChar32 getRange(const void *t, UChar32 return c - 1; } } else { - trieValue = trie->nullValue; value = nullValue; if (pValue != nullptr) { *pValue = nullValue; } haveValue = true; @@ -366,32 +364,23 @@ UChar32 getRange(const void *t, UChar32 c = (c + dataBlockLength) & ~dataMask; } else { int32_t di = block + (c & dataMask); - uint32_t trieValue2 = getValue(trie->data, valueWidth, di); + uint32_t value2 = getValue(trie->data, valueWidth, di); + value2 = maybeFilterValue(value2, trie->nullValue, nullValue, + filter, context); if (haveValue) { - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, trie->nullValue, nullValue, - filter, context) != value) { - return c - 1; - } - trieValue = trieValue2; // may or may not help + if (value2 != value) { + return c - 1; } } else { - trieValue = trieValue2; - value = maybeFilterValue(trieValue2, trie->nullValue, nullValue, - filter, context); + value = value2; if (pValue != nullptr) { *pValue = value; } haveValue = true; } while ((++c & dataMask) != 0) { - trieValue2 = getValue(trie->data, valueWidth, ++di); - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, trie->nullValue, nullValue, - filter, context) != value) { - return c - 1; - } - trieValue = trieValue2; // may or may not help + if (maybeFilterValue(getValue(trie->data, valueWidth, ++di), + trie->nullValue, nullValue, + filter, context) != value) { + return c - 1; } } } diff -urp icu4c-63_2/icu/source/common/umutablecptrie.cpp icu4c-63_1/icu/source/common/umutablecptrie.cpp --- icu4c-63_2/icu/source/common/umutablecptrie.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/umutablecptrie.cpp 2018-10-02 00:39:56.000000000 +0200 @@ -60,7 +60,6 @@ constexpr uint8_t I3_18 = 3; constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8; class AllSameBlocks; -class MixedBlocks; class MutableCodePointTrie : public UMemory { public: @@ -93,10 +92,8 @@ private: void maskValues(uint32_t mask); UChar32 findHighStart() const; int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); - int32_t compactData( - int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, - int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode); - int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode); + int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex); + int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode); int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); uint32_t *index = nullptr; @@ -304,56 +301,41 @@ UChar32 MutableCodePointTrie::getRange( uint32_t nullValue = initialValue; if (filter != nullptr) { nullValue = filter(context, nullValue); } UChar32 c = start; - uint32_t trieValue, value; + uint32_t value; bool haveValue = false; int32_t i = c >> UCPTRIE_SHIFT_3; do { if (flags[i] == ALL_SAME) { - uint32_t trieValue2 = index[i]; + uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue, + filter, context); if (haveValue) { - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } - trieValue = trieValue2; // may or may not help + if (value2 != value) { + return c - 1; } } else { - trieValue = trieValue2; - value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); + value = value2; if (pValue != nullptr) { *pValue = value; } haveValue = true; } c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; } else /* MIXED */ { int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); - uint32_t trieValue2 = data[di]; + uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue, + filter, context); if (haveValue) { - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } - trieValue = trieValue2; // may or may not help + if (value2 != value) { + return c - 1; } } else { - trieValue = trieValue2; - value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); + value = value2; if (pValue != nullptr) { *pValue = value; } haveValue = true; } while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { - trieValue2 = data[++di]; - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } + if (maybeFilterValue(data[++di], initialValue, nullValue, + filter, context) != value) { + return c - 1; } - trieValue = trieValue2; // may or may not help } } ++i; @@ -566,8 +548,28 @@ void MutableCodePointTrie::maskValues(ui } } -template -bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) { +inline bool +equalBlocks(const uint32_t *s, const uint32_t *t, int32_t length) { + while (length > 0 && *s == *t) { + ++s; + ++t; + --length; + } + return length == 0; +} + +inline bool +equalBlocks(const uint16_t *s, const uint32_t *t, int32_t length) { + while (length > 0 && *s == *t) { + ++s; + ++t; + --length; + } + return length == 0; +} + +inline bool +equalBlocks(const uint16_t *s, const uint16_t *t, int32_t length) { while (length > 0 && *s == *t) { ++s; ++t; @@ -583,6 +585,36 @@ bool allValuesSameAs(const uint32_t *p, } /** Search for an identical block. */ +int32_t findSameBlock(const uint32_t *p, int32_t pStart, int32_t length, + const uint32_t *q, int32_t qStart, int32_t blockLength) { + // Ensure that we do not even partially get past length. + length -= blockLength; + + q += qStart; + while (pStart <= length) { + if (equalBlocks(p + pStart, q, blockLength)) { + return pStart; + } + ++pStart; + } + return -1; +} + +int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, + const uint32_t *q, int32_t qStart, int32_t blockLength) { + // Ensure that we do not even partially get past length. + length -= blockLength; + + q += qStart; + while (pStart <= length) { + if (equalBlocks(p + pStart, q, blockLength)) { + return pStart; + } + ++pStart; + } + return -1; +} + int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, const uint16_t *q, int32_t qStart, int32_t blockLength) { // Ensure that we do not even partially get past length. @@ -623,9 +655,30 @@ int32_t findAllSameBlock(const uint32_t * Look for maximum overlap of the beginning of the other block * with the previous, adjacent block. */ -template -int32_t getOverlap(const UIntA *p, int32_t length, - const UIntB *q, int32_t qStart, int32_t blockLength) { +int32_t getOverlap(const uint32_t *p, int32_t length, + const uint32_t *q, int32_t qStart, int32_t blockLength) { + int32_t overlap = blockLength - 1; + U_ASSERT(overlap <= length); + q += qStart; + while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { + --overlap; + } + return overlap; +} + +int32_t getOverlap(const uint16_t *p, int32_t length, + const uint32_t *q, int32_t qStart, int32_t blockLength) { + int32_t overlap = blockLength - 1; + U_ASSERT(overlap <= length); + q += qStart; + while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { + --overlap; + } + return overlap; +} + +int32_t getOverlap(const uint16_t *p, int32_t length, + const uint16_t *q, int32_t qStart, int32_t blockLength) { int32_t overlap = blockLength - 1; U_ASSERT(overlap <= length); q += qStart; @@ -754,171 +807,6 @@ private: int32_t refCounts[CAPACITY]; }; -// Custom hash table for mixed-value blocks to be found anywhere in the -// compacted data or index so far. -class MixedBlocks { -public: - MixedBlocks() {} - ~MixedBlocks() { - uprv_free(table); - } - - bool init(int32_t maxLength, int32_t newBlockLength) { - // We store actual data indexes + 1 to reserve 0 for empty entries. - int32_t maxDataIndex = maxLength - newBlockLength + 1; - int32_t newLength; - if (maxDataIndex <= 0xfff) { // 4k - newLength = 6007; - shift = 12; - mask = 0xfff; - } else if (maxDataIndex <= 0x7fff) { // 32k - newLength = 50021; - shift = 15; - mask = 0x7fff; - } else if (maxDataIndex <= 0x1ffff) { // 128k - newLength = 200003; - shift = 17; - mask = 0x1ffff; - } else { - // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M - newLength = 1500007; - shift = 21; - mask = 0x1fffff; - } - if (newLength > capacity) { - uprv_free(table); - table = (uint32_t *)uprv_malloc(newLength * 4); - if (table == nullptr) { - return false; - } - capacity = newLength; - } - length = newLength; - uprv_memset(table, 0, length * 4); - - blockLength = newBlockLength; - return true; - } - - template - void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) { - int32_t start = prevDataLength - blockLength; - if (start >= minStart) { - ++start; // Skip the last block that we added last time. - } else { - start = minStart; // Begin with the first full block. - } - for (int32_t end = newDataLength - blockLength; start <= end; ++start) { - uint32_t hashCode = makeHashCode(data, start); - addEntry(data, start, hashCode, start); - } - } - - template - int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const { - uint32_t hashCode = makeHashCode(blockData, blockStart); - int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode); - if (entryIndex >= 0) { - return (table[entryIndex] & mask) - 1; - } else { - return -1; - } - } - - int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const { - uint32_t hashCode = makeHashCode(blockValue); - int32_t entryIndex = findEntry(data, blockValue, hashCode); - if (entryIndex >= 0) { - return (table[entryIndex] & mask) - 1; - } else { - return -1; - } - } - -private: - template - uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const { - int32_t blockLimit = blockStart + blockLength; - uint32_t hashCode = blockData[blockStart++]; - do { - hashCode = 37 * hashCode + blockData[blockStart++]; - } while (blockStart < blockLimit); - return hashCode; - } - - uint32_t makeHashCode(uint32_t blockValue) const { - uint32_t hashCode = blockValue; - for (int32_t i = 1; i < blockLength; ++i) { - hashCode = 37 * hashCode + blockValue; - } - return hashCode; - } - - template - void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) { - U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask); - int32_t entryIndex = findEntry(data, data, blockStart, hashCode); - if (entryIndex < 0) { - table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); - } - } - - template - int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart, - uint32_t hashCode) const { - uint32_t shiftedHashCode = hashCode << shift; - int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 - for (int32_t entryIndex = initialEntryIndex;;) { - uint32_t entry = table[entryIndex]; - if (entry == 0) { - return ~entryIndex; - } - if ((entry & ~mask) == shiftedHashCode) { - int32_t dataIndex = (entry & mask) - 1; - if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) { - return entryIndex; - } - } - entryIndex = nextIndex(initialEntryIndex, entryIndex); - } - } - - int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const { - uint32_t shiftedHashCode = hashCode << shift; - int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 - for (int32_t entryIndex = initialEntryIndex;;) { - uint32_t entry = table[entryIndex]; - if (entry == 0) { - return ~entryIndex; - } - if ((entry & ~mask) == shiftedHashCode) { - int32_t dataIndex = (entry & mask) - 1; - if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) { - return entryIndex; - } - } - entryIndex = nextIndex(initialEntryIndex, entryIndex); - } - } - - inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const { - // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); - return (entryIndex + initialEntryIndex) % length; - } - - // Hash table. - // The length is a prime number, larger than the maximum data length. - // The "shift" lower bits store a data index + 1. - // The remaining upper bits store a partial hashCode of the block data values. - uint32_t *table = nullptr; - int32_t capacity = 0; - int32_t length = 0; - int32_t shift = 0; - uint32_t mask = 0; - - int32_t blockLength = 0; -}; - int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) { #ifdef UCPTRIE_DEBUG bool overflow = false; @@ -1074,9 +962,8 @@ void printBlock(const uint32_t *block, i * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ -int32_t MutableCodePointTrie::compactData( - int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, - int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) { +int32_t MutableCodePointTrie::compactData(int32_t fastILimit, + uint32_t *newData, int32_t dataNullIndex) { #ifdef UCPTRIE_DEBUG int32_t countSame=0, sumOverlaps=0; bool printData = dataLength == 29088 /* line.brk */ || @@ -1096,14 +983,8 @@ int32_t MutableCodePointTrie::compactDat #endif } - int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; - if (!mixedBlocks.init(newDataCapacity, blockLength)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - mixedBlocks.extend(newData, 0, 0, newDataLength); - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; + int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; int32_t fastLength = 0; for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { @@ -1111,17 +992,12 @@ int32_t MutableCodePointTrie::compactDat blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; inc = 1; fastLength = newDataLength; - if (!mixedBlocks.init(newDataCapacity, blockLength)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - mixedBlocks.extend(newData, 0, 0, newDataLength); } if (flags[i] == ALL_SAME) { uint32_t value = index[i]; + int32_t n; // Find an earlier part of the data array of length blockLength // that is filled with this value. - int32_t n = mixedBlocks.findAllSameBlock(newData, value); // If we find a match, and the current block is the data null block, // and it is not a fast block but matches the start of a fast block, // then we need to continue looking. @@ -1129,10 +1005,12 @@ int32_t MutableCodePointTrie::compactDat // and not all of the rest of the fast block is filled with this value. // Otherwise trie.getRange() would detect that the fast block starts at // dataNullOffset and assume incorrectly that it is filled with the null value. - while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && - isStartOfSomeFastBlock(n, index, fastILimit)) { - n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); - } + for (int32_t start = 0; + (n = findAllSameBlock(newData, start, newDataLength, + value, blockLength)) >= 0 && + i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit); + start = n + 1) {} if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; @@ -1145,16 +1023,14 @@ int32_t MutableCodePointTrie::compactDat } #endif index[i] = newDataLength - n; - int32_t prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = value; ++n; } - mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else if (flags[i] == MIXED) { const uint32_t *block = data + index[i]; - int32_t n = mixedBlocks.findBlock(newData, block, 0); + int32_t n = findSameBlock(newData, 0, newDataLength, block, 0, blockLength); if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; @@ -1167,11 +1043,9 @@ int32_t MutableCodePointTrie::compactDat } #endif index[i] = newDataLength - n; - int32_t prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = block[n++]; } - mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else /* SAME_AS */ { uint32_t j = index[i]; @@ -1187,8 +1061,7 @@ int32_t MutableCodePointTrie::compactDat return newDataLength; } -int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, - UErrorCode &errorCode) { +int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &errorCode) { int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3); if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) { // Only the linear fast index, no multi-stage index tables. @@ -1222,12 +1095,6 @@ int32_t MutableCodePointTrie::compactInd } } - if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); - // Examine index-3 blocks. For each determine one of: // - same as the index-3 null block // - same as a fast-index block @@ -1238,7 +1105,6 @@ int32_t MutableCodePointTrie::compactInd // Also determine an upper limit for the index-3 table length. int32_t index3Capacity = 0; i3FirstNull = index3NullOffset; - bool hasLongI3Blocks = false; // If the fast index covers the whole BMP, then // the multi-stage index is only for supplementary code points. // Otherwise, the multi-stage index covers all of Unicode. @@ -1263,13 +1129,13 @@ int32_t MutableCodePointTrie::compactInd index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; } else { index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; - hasLongI3Blocks = true; } i3FirstNull = 0; } } else { if (oredI3 <= 0xffff) { - int32_t n = mixedBlocks.findBlock(fastIndex, index, i); + int32_t n = findSameBlock(fastIndex, 0, fastIndexLength, + index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); if (n >= 0) { flags[i] = I3_BMP; index[i] = n; @@ -1280,7 +1146,6 @@ int32_t MutableCodePointTrie::compactInd } else { flags[i] = I3_18; index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; - hasLongI3Blocks = true; } } i = j; @@ -1301,18 +1166,6 @@ int32_t MutableCodePointTrie::compactInd } uprv_memcpy(index16, fastIndex, fastIndexLength * 2); - if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - MixedBlocks longI3Blocks; - if (hasLongI3Blocks) { - if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - } - // Compact the index-3 table and write an uncompacted version of the index-2 table. uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity int32_t i2Length = 0; @@ -1332,7 +1185,8 @@ int32_t MutableCodePointTrie::compactInd } else if (f == I3_BMP) { i3 = index[i]; } else if (f == I3_16) { - int32_t n = mixedBlocks.findBlock(index16, index, i); + int32_t n = findSameBlock(index16, index3Start, indexLength, + index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); if (n >= 0) { i3 = n; } else { @@ -1344,18 +1198,12 @@ int32_t MutableCodePointTrie::compactInd index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); } i3 = indexLength - n; - int32_t prevIndexLength = indexLength; while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) { index16[indexLength++] = index[i + n++]; } - mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); - if (hasLongI3Blocks) { - longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); - } } } else { U_ASSERT(f == I3_18); - U_ASSERT(hasLongI3Blocks); // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. int32_t j = i; int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; @@ -1388,7 +1236,8 @@ int32_t MutableCodePointTrie::compactInd index16[k++] = v; index16[k - 9] = upperBits; } while (j < jLimit); - int32_t n = longI3Blocks.findBlock(index16, index16, indexLength); + int32_t n = findSameBlock(index16, index3Start, indexLength, + index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); if (n >= 0) { i3 = n | 0x8000; } else { @@ -1400,7 +1249,6 @@ int32_t MutableCodePointTrie::compactInd index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); } i3 = (indexLength - n) | 0x8000; - int32_t prevIndexLength = indexLength; if (n > 0) { int32_t start = indexLength; while (n < INDEX_3_18BIT_BLOCK_LENGTH) { @@ -1409,10 +1257,6 @@ int32_t MutableCodePointTrie::compactInd } else { indexLength += INDEX_3_18BIT_BLOCK_LENGTH; } - mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); - if (hasLongI3Blocks) { - longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); - } } } if (index3NullOffset < 0 && i3FirstNull >= 0) { @@ -1435,23 +1279,16 @@ int32_t MutableCodePointTrie::compactInd } // Compact the index-2 table and write the index-1 table. - static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH, - "must re-init mixedBlocks"); int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH; int32_t i1 = fastIndexLength; for (int32_t i = 0; i < i2Length; i += blockLength) { - int32_t n; - if ((i2Length - i) >= blockLength) { - // normal block - U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH); - n = mixedBlocks.findBlock(index16, index2, i); - } else { + if ((i2Length - i) < blockLength) { // highStart is inside the last index-2 block. Shorten it. blockLength = i2Length - i; - n = findSameBlock(index16, index3Start, indexLength, - index2, i, blockLength); } int32_t i2; + int32_t n = findSameBlock(index16, index3Start, indexLength, + index2, i, blockLength); if (n >= 0) { i2 = n; } else { @@ -1462,11 +1299,9 @@ int32_t MutableCodePointTrie::compactInd n = getOverlap(index16, indexLength, index2, i, blockLength); } i2 = indexLength - n; - int32_t prevIndexLength = indexLength; while (n < blockLength) { index16[indexLength++] = index2[i + n++]; } - mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); } // Set the index-1 table entry. index16[i1++] = i2; @@ -1534,11 +1369,7 @@ int32_t MutableCodePointTrie::compactTri uprv_memcpy(newData, asciiData, sizeof(asciiData)); int32_t dataNullIndex = allSameBlocks.findMostUsed(); - - MixedBlocks mixedBlocks; - int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity, - dataNullIndex, mixedBlocks, errorCode); - if (U_FAILURE(errorCode)) { return 0; } + int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex); U_ASSERT(newDataLength <= newDataCapacity); uprv_free(data); data = newData; @@ -1563,7 +1394,7 @@ int32_t MutableCodePointTrie::compactTri dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET; } - int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode); + int32_t indexLength = compactIndex(fastILimit, errorCode); highStart = realHighStart; return indexLength; } diff -urp icu4c-63_2/icu/source/common/umutex.h icu4c-63_1/icu/source/common/umutex.h --- icu4c-63_2/icu/source/common/umutex.h 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/umutex.h 2018-10-02 00:39:56.000000000 +0200 @@ -54,23 +54,15 @@ U_NAMESPACE_END #include +U_NAMESPACE_BEGIN + // Export an explicit template instantiation of std::atomic. // When building DLLs for Windows this is required as it is used as a data member of the exported SharedObject class. // See digitlst.h, pluralaffix.h, datefmt.h, and others for similar examples. -#if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN && !defined(U_IN_DOXYGEN) - #if defined(__clang__) - // Suppress the warning that the explicit instantiation after explicit specialization has no effect. - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Winstantiation-after-specialization" - #endif +#if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN template struct U_COMMON_API std::atomic; - #if defined(__clang__) - #pragma clang diagnostic pop - #endif #endif -U_NAMESPACE_BEGIN - typedef std::atomic u_atomic_int32_t; #define ATOMIC_INT32_T_INITIALIZER(val) ATOMIC_VAR_INIT(val) diff -urp icu4c-63_2/icu/source/common/unicode/uniset.h icu4c-63_1/icu/source/common/unicode/uniset.h --- icu4c-63_2/icu/source/common/unicode/uniset.h 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/unicode/uniset.h 2018-10-02 00:39:56.000000000 +0200 @@ -27,6 +27,7 @@ U_NAMESPACE_BEGIN // Forward Declarations. class BMPSet; +class CharacterProperties; class ParsePosition; class RBBIRuleScanner; class SymbolTable; @@ -275,23 +276,14 @@ class RuleCharacterIterator; * @stable ICU 2.0 */ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { -private: - /** - * Enough for sets with few ranges. - * For example, White_Space has 10 ranges, list length 21. - */ - static constexpr int32_t INITIAL_CAPACITY = 25; - // fFlags constant - static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) - - UChar32* list = stackList; // MUST be terminated with HIGH - int32_t capacity = INITIAL_CAPACITY; // capacity of list - int32_t len = 1; // length of list used; 1 <= len <= capacity - uint8_t fFlags = 0; // Bit flag (see constants above) - - BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL. - UChar32* buffer = nullptr; // internal buffer, may be NULL - int32_t bufferCapacity = 0; // capacity of buffer + + int32_t len; // length of list used; 0 <= len <= capacity + int32_t capacity; // capacity of list + UChar32* list; // MUST be terminated with HIGH + BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. + UChar32* buffer; // internal buffer, may be NULL + int32_t bufferCapacity; // capacity of buffer + int32_t patLen; /** * The pattern representation of this set. This may not be the @@ -302,19 +294,15 @@ private: * indicating that toPattern() must generate a pattern * representation from the inversion list. */ - char16_t *pat = nullptr; - int32_t patLen = 0; - - UVector* strings = nullptr; // maintained in sorted order - UnicodeSetStringSpan *stringSpan = nullptr; - - /** - * Initial list array. - * Avoids some heap allocations, and list is never nullptr. - * Increases the object size a bit. - */ - UChar32 stackList[INITIAL_CAPACITY]; + char16_t *pat; + UVector* strings; // maintained in sorted order + UnicodeSetStringSpan *stringSpan; +private: + enum { // constants + kIsBogus = 1 // This set is bogus (i.e. not valid) + }; + uint8_t fFlags; // Bit flag (see constants above) public: /** * Determine if this object contains a valid set. @@ -1492,6 +1480,8 @@ private: friend class USetAccess; + int32_t getStringCount() const; + const UnicodeString* getString(int32_t index) const; //---------------------------------------------------------------- @@ -1538,18 +1528,13 @@ private: // Implementation: Utility methods //---------------------------------------------------------------- - static int32_t nextCapacity(int32_t minCapacity); - - bool ensureCapacity(int32_t newLen); + void ensureCapacity(int32_t newLen, UErrorCode& ec); - bool ensureBufferCapacity(int32_t newLen); + void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); void swapBuffers(void); UBool allocateStrings(UErrorCode &status); - UBool hasStrings() const; - int32_t stringsSize() const; - UBool stringsContains(const UnicodeString &s) const; UnicodeString& _toPattern(UnicodeString& result, UBool escapeUnprintable) const; @@ -1629,6 +1614,7 @@ private: UnicodeString& rebuiltPat, UErrorCode& ec); + friend class CharacterProperties; static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); /** @@ -1660,10 +1646,7 @@ private: /** * Set the new pattern to cache. */ - void setPattern(const UnicodeString& newPat) { - setPattern(newPat.getBuffer(), newPat.length()); - } - void setPattern(const char16_t *newPat, int32_t newPatLen); + void setPattern(const UnicodeString& newPat); /** * Release existing cached pattern. */ diff -urp icu4c-63_2/icu/source/common/unicode/urename.h icu4c-63_1/icu/source/common/unicode/urename.h --- icu4c-63_2/icu/source/common/unicode/urename.h 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/unicode/urename.h 2018-10-15 20:02:37.000000000 +0200 @@ -110,6 +110,7 @@ #define _UTF7Data U_ICU_ENTRY_POINT_RENAME(_UTF7Data) #define _UTF8Data U_ICU_ENTRY_POINT_RENAME(_UTF8Data) #define allowedHourFormatsCleanup U_ICU_ENTRY_POINT_RENAME(allowedHourFormatsCleanup) +#define checkImpl U_ICU_ENTRY_POINT_RENAME(checkImpl) #define cmemory_cleanup U_ICU_ENTRY_POINT_RENAME(cmemory_cleanup) #define dayPeriodRulesCleanup U_ICU_ENTRY_POINT_RENAME(dayPeriodRulesCleanup) #define deleteAllowedHourFormats U_ICU_ENTRY_POINT_RENAME(deleteAllowedHourFormats) diff -urp icu4c-63_2/icu/source/common/uniset_closure.cpp icu4c-63_1/icu/source/common/uniset_closure.cpp --- icu4c-63_2/icu/source/common/uniset_closure.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/uniset_closure.cpp 2018-09-29 02:34:41.000000000 +0200 @@ -31,6 +31,10 @@ #include "util.h" #include "uvector.h" +// initial storage. Must be >= 0 +// *** same as in uniset.cpp ! *** +#define START_EXTRA 16 + U_NAMESPACE_BEGIN // TODO memory debugging provided inside uniset.cpp @@ -45,16 +49,42 @@ U_NAMESPACE_BEGIN UnicodeSet::UnicodeSet(const UnicodeString& pattern, uint32_t options, const SymbolTable* symbols, - UErrorCode& status) { - applyPattern(pattern, options, symbols, status); + UErrorCode& status) : + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + if(U_SUCCESS(status)){ + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + /* test for NULL */ + if(list == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + }else{ + allocateStrings(status); + applyPattern(pattern, options, symbols, status); + } + } _dbgct(this); } UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, uint32_t options, const SymbolTable* symbols, - UErrorCode& status) { - applyPattern(pattern, pos, options, symbols, status); + UErrorCode& status) : + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + if(U_SUCCESS(status)){ + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + /* test for NULL */ + if(list == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + }else{ + allocateStrings(status); + applyPattern(pattern, pos, options, symbols, status); + } + } _dbgct(this); } @@ -169,7 +199,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_ // start with input set to guarantee inclusion // USET_CASE: remove strings because the strings will actually be reduced (folded); // therefore, start with no strings and add only those needed - if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { + if (attribute & USET_CASE_INSENSITIVE) { foldSet.strings->removeAllElements(); } @@ -204,7 +234,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_ } } } - if (hasStrings()) { + if (strings != NULL && strings->size() > 0) { if (attribute & USET_CASE_INSENSITIVE) { for (int32_t j=0; jsize(); ++j) { str = *(const UnicodeString *) strings->elementAt(j); diff -urp icu4c-63_2/icu/source/common/uniset.cpp icu4c-63_1/icu/source/common/uniset.cpp --- icu4c-63_2/icu/source/common/uniset.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/uniset.cpp 2018-10-02 00:39:56.000000000 +0200 @@ -14,7 +14,6 @@ #include "unicode/parsepos.h" #include "unicode/symtable.h" #include "unicode/uniset.h" -#include "unicode/ustring.h" #include "unicode/utf8.h" #include "unicode/utf16.h" #include "ruleiter.h" @@ -54,8 +53,11 @@ // LOW <= all valid values. ZERO for codepoints #define UNICODESET_LOW 0x000000 -/** Max list [0, 1, 2, ..., max code point, HIGH] */ -constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; +// initial storage. Must be >= 0 +#define START_EXTRA 16 + +// extra amount for growth. Must be >= 0 +#define GROW_EXTRA START_EXTRA U_NAMESPACE_BEGIN @@ -135,18 +137,6 @@ static int8_t U_CALLCONV compareUnicodeS return a.compare(b); } -UBool UnicodeSet::hasStrings() const { - return strings != nullptr && !strings->isEmpty(); -} - -int32_t UnicodeSet::stringsSize() const { - return strings == nullptr ? 0 : strings->size(); -} - -UBool UnicodeSet::stringsContains(const UnicodeString &s) const { - return strings != nullptr && strings->contains((void*) &s); -} - //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- @@ -154,8 +144,24 @@ UBool UnicodeSet::stringsContains(const /** * Constructs an empty set. */ -UnicodeSet::UnicodeSet() { - list[0] = UNICODESET_HIGH; +UnicodeSet::UnicodeSet() : + len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + UErrorCode status = U_ZERO_ERROR; + allocateStrings(status); + if (U_FAILURE(status)) { + setToBogus(); // If memory allocation failed, set to bogus state. + return; + } + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + if(list!=NULL){ + list[0] = UNICODESET_HIGH; + } else { // If memory allocation failed, set to bogus state. + setToBogus(); + return; + } _dbgct(this); } @@ -166,39 +172,89 @@ UnicodeSet::UnicodeSet() { * @param start first character, inclusive, of range * @param end last character, inclusive, of range */ -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { - list[0] = UNICODESET_HIGH; - add(start, end); +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : + len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + UErrorCode status = U_ZERO_ERROR; + allocateStrings(status); + if (U_FAILURE(status)) { + setToBogus(); // If memory allocation failed, set to bogus state. + return; + } + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + if(list!=NULL){ + list[0] = UNICODESET_HIGH; + complement(start, end); + } else { // If memory allocation failed, set to bogus state. + setToBogus(); + return; + } _dbgct(this); } /** * Constructs a set that is identical to the given UnicodeSet. */ -UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { - *this = o; +UnicodeSet::UnicodeSet(const UnicodeSet& o) : + UnicodeFilter(o), + len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), + bmpSet(0), + buffer(0), bufferCapacity(0), + patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + UErrorCode status = U_ZERO_ERROR; + allocateStrings(status); + if (U_FAILURE(status)) { + setToBogus(); // If memory allocation failed, set to bogus state. + return; + } + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + if(list!=NULL){ + *this = o; + } else { // If memory allocation failed, set to bogus state. + setToBogus(); + return; + } _dbgct(this); } // Copy-construct as thawed. -UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { - if (ensureCapacity(o.len)) { +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : + UnicodeFilter(o), + len(0), capacity(o.len + GROW_EXTRA), list(0), + bmpSet(0), + buffer(0), bufferCapacity(0), + patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + UErrorCode status = U_ZERO_ERROR; + allocateStrings(status); + if (U_FAILURE(status)) { + setToBogus(); // If memory allocation failed, set to bogus state. + return; + } + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + if(list!=NULL){ // *this = o except for bmpSet and stringSpan len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (o.hasStrings()) { - UErrorCode status = U_ZERO_ERROR; - if (!allocateStrings(status) || - (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { - setToBogus(); - return; - } + if (strings != NULL && o.strings != NULL) { + strings->assign(*o.strings, cloneUnicodeString, status); + } else { // Invalid strings. + setToBogus(); + return; } if (o.pat) { - setPattern(o.pat, o.patLen); + setPattern(UnicodeString(o.pat, o.patLen)); } - _dbgct(this); + } else { // If memory allocation failed, set to bogus state. + setToBogus(); + return; } + _dbgct(this); } /** @@ -206,11 +262,9 @@ UnicodeSet::UnicodeSet(const UnicodeSet& */ UnicodeSet::~UnicodeSet() { _dbgdt(this); // first! - if (list != stackList) { - uprv_free(list); - } + uprv_free(list); delete bmpSet; - if (buffer != stackList) { + if (buffer) { uprv_free(buffer); } delete strings; @@ -236,30 +290,32 @@ UnicodeSet& UnicodeSet::copyFrom(const U setToBogus(); return *this; } - if (!ensureCapacity(o.len)) { + UErrorCode ec = U_ZERO_ERROR; + ensureCapacity(o.len, ec); + if (U_FAILURE(ec)) { // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. return *this; } len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (o.bmpSet != nullptr && !asThawed) { + if (o.bmpSet == NULL || asThawed) { + bmpSet = NULL; + } else { bmpSet = new BMPSet(*o.bmpSet, list, len); if (bmpSet == NULL) { // Check for memory allocation error. setToBogus(); return *this; } } - if (o.hasStrings()) { - UErrorCode status = U_ZERO_ERROR; - if ((strings == nullptr && !allocateStrings(status)) || - (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { - setToBogus(); - return *this; - } - } else if (hasStrings()) { - strings->removeAllElements(); + if (strings != NULL && o.strings != NULL) { + strings->assign(*o.strings, cloneUnicodeString, ec); + } else { // Invalid strings. + setToBogus(); + return *this; } - if (o.stringSpan != nullptr && !asThawed) { + if (o.stringSpan == NULL || asThawed) { + stringSpan = NULL; + } else { stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); if (stringSpan == NULL) { // Check for memory allocation error. setToBogus(); @@ -268,7 +324,7 @@ UnicodeSet& UnicodeSet::copyFrom(const U } releasePattern(); if (o.pat) { - setPattern(o.pat, o.patLen); + setPattern(UnicodeString(o.pat, o.patLen)); } return *this; } @@ -301,8 +357,7 @@ UBool UnicodeSet::operator==(const Unico for (int32_t i = 0; i < len; ++i) { if (list[i] != o.list[i]) return FALSE; } - if (hasStrings() != o.hasStrings()) { return FALSE; } - if (hasStrings() && *strings != *o.strings) return FALSE; + if (*strings != *o.strings) return FALSE; return TRUE; } @@ -338,7 +393,7 @@ int32_t UnicodeSet::size(void) const { for (int32_t i = 0; i < count; ++i) { n += getRangeEnd(i) - getRangeStart(i) + 1; } - return n + stringsSize(); + return n + strings->size(); } /** @@ -347,7 +402,7 @@ int32_t UnicodeSet::size(void) const { * @return true if this set contains no elements. */ UBool UnicodeSet::isEmpty(void) const { - return len == 1 && !hasStrings(); + return len == 1 && strings->size() == 0; } /** @@ -447,7 +502,7 @@ UBool UnicodeSet::contains(const Unicode if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { - return stringsContains(s); + return strings->contains((void*) &s); } else { return contains((UChar32) cp); } @@ -469,7 +524,8 @@ UBool UnicodeSet::containsAll(const Unic return FALSE; } } - return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); + if (!strings->containsAll(*c.strings)) return FALSE; + return TRUE; } /** @@ -515,7 +571,8 @@ UBool UnicodeSet::containsNone(const Uni return FALSE; } } - return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); + if (!strings->containsNone(*c.strings)) return FALSE; + return TRUE; } /** @@ -556,7 +613,7 @@ UBool UnicodeSet::matchesIndexValue(uint return TRUE; } } - if (hasStrings()) { + if (strings->size() != 0) { for (i=0; isize(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); //if (s.length() == 0) { @@ -591,7 +648,7 @@ UMatchDegree UnicodeSet::matches(const R return U_MISMATCH; } } else { - if (hasStrings()) { // try strings first + if (strings->size() != 0) { // try strings first // might separate forward and backward loops later // for now they are combined @@ -792,39 +849,7 @@ UnicodeSet& UnicodeSet::set(UChar32 star */ UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { if (pinCodePoint(start) < pinCodePoint(end)) { - UChar32 limit = end + 1; - // Fast path for adding a new range after the last one. - // Odd list length: [..., lastStart, lastLimit, HIGH] - if ((len & 1) != 0) { - // If the list is empty, set lastLimit low enough to not be adjacent to 0. - UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; - if (lastLimit <= start && !isFrozen() && !isBogus()) { - if (lastLimit == start) { - // Extend the last range. - list[len - 2] = limit; - if (limit == UNICODESET_HIGH) { - --len; - } - } else { - list[len - 1] = start; - if (limit < UNICODESET_HIGH) { - if (ensureCapacity(len + 2)) { - list[len++] = limit; - list[len++] = UNICODESET_HIGH; - } - } else { // limit == UNICODESET_HIGH - if (ensureCapacity(len + 1)) { - list[len++] = UNICODESET_HIGH; - } - } - } - releasePattern(); - return *this; - } - } - // This is slow. Could be much faster using findCodePoint(start) - // and modifying the list, dealing with adjacent & overlapping ranges. - UChar32 range[3] = { start, limit, UNICODESET_HIGH }; + UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; add(range, 2, 0); } else if (start == end) { add(start); @@ -893,7 +918,9 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { list[i] = c; // if we touched the HIGH mark, then add a new one if (c == (UNICODESET_HIGH - 1)) { - if (!ensureCapacity(len+1)) { + UErrorCode status = U_ZERO_ERROR; + ensureCapacity(len+1, status); + if (U_FAILURE(status)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this; } @@ -937,13 +964,21 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { // ^ // list[i] - if (!ensureCapacity(len+2)) { + UErrorCode status = U_ZERO_ERROR; + ensureCapacity(len+2, status); + if (U_FAILURE(status)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this; } - UChar32 *p = list + i; - uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); + //for (int32_t k=len-1; k>=i; --k) { + // list[k+2] = list[k]; + //} + UChar32* src = list + len; + UChar32* dst = src + 2; + UChar32* srclimit = list + i; + while (src > srclimit) *(--dst) = *(--src); + list[i] = c; list[i+1] = c+1; len += 2; @@ -979,7 +1014,7 @@ UnicodeSet& UnicodeSet::add(const Unicod if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (!stringsContains(s)) { + if (!strings->contains((void*) &s)) { _add(s); releasePattern(); } @@ -998,16 +1033,12 @@ void UnicodeSet::_add(const UnicodeStrin if (isFrozen() || isBogus()) { return; } - UErrorCode ec = U_ZERO_ERROR; - if (strings == nullptr && !allocateStrings(ec)) { - setToBogus(); - return; - } UnicodeString* t = new UnicodeString(s); if (t == NULL) { // Check for memory allocation error. setToBogus(); return; } + UErrorCode ec = U_ZERO_ERROR; strings->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); @@ -1090,10 +1121,7 @@ UnicodeSet& UnicodeSet::removeAll(const } UnicodeSet& UnicodeSet::removeAllStrings() { - if (!isFrozen() && hasStrings()) { - strings->removeAllElements(); - releasePattern(); - } + strings->removeAllElements(); return *this; } @@ -1189,9 +1217,8 @@ UnicodeSet& UnicodeSet::remove(const Uni if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (strings != nullptr && strings->removeElement((void*) &s)) { - releasePattern(); - } + strings->removeElement((void*) &s); + releasePattern(); } else { remove((UChar32)cp, (UChar32)cp); } @@ -1233,17 +1260,24 @@ UnicodeSet& UnicodeSet::complement(void) if (isFrozen() || isBogus()) { return *this; } + UErrorCode status = U_ZERO_ERROR; if (list[0] == UNICODESET_LOW) { - uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); + ensureBufferCapacity(len-1, status); + if (U_FAILURE(status)) { + return *this; + } + uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32)); --len; } else { - if (!ensureCapacity(len+1)) { + ensureBufferCapacity(len+1, status); + if (U_FAILURE(status)) { return *this; } - uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); - list[0] = UNICODESET_LOW; + uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32)); + buffer[0] = UNICODESET_LOW; ++len; } + swapBuffers(); releasePattern(); return *this; } @@ -1260,7 +1294,7 @@ UnicodeSet& UnicodeSet::complement(const if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (stringsContains(s)) { + if (strings->contains((void*) &s)) { strings->removeElement((void*) &s); } else { _add(s); @@ -1291,7 +1325,7 @@ UnicodeSet& UnicodeSet::addAll(const Uni if ( c.strings!=NULL ) { for (int32_t i=0; isize(); ++i) { const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); - if (!stringsContains(*s)) { + if (!strings->contains((void*) s)) { _add(*s); } } @@ -1313,13 +1347,7 @@ UnicodeSet& UnicodeSet::retainAll(const return *this; } retain(c.list, c.len, 0); - if (hasStrings()) { - if (!c.hasStrings()) { - strings->removeAllElements(); - } else { - strings->retainAll(*c.strings); - } - } + strings->retainAll(*c.strings); return *this; } @@ -1337,9 +1365,7 @@ UnicodeSet& UnicodeSet::removeAll(const return *this; } retain(c.list, c.len, 2); - if (hasStrings() && c.hasStrings()) { - strings->removeAll(*c.strings); - } + strings->removeAll(*c.strings); return *this; } @@ -1357,12 +1383,10 @@ UnicodeSet& UnicodeSet::complementAll(co } exclusiveOr(c.list, c.len, 0); - if (c.strings != nullptr) { - for (int32_t i=0; isize(); ++i) { - void* e = c.strings->elementAt(i); - if (strings == nullptr || !strings->removeElement(e)) { - _add(*(const UnicodeString*)e); - } + for (int32_t i=0; isize(); ++i) { + void* e = c.strings->elementAt(i); + if (!strings->removeElement(e)) { + _add(*(const UnicodeString*)e); } } return *this; @@ -1376,14 +1400,18 @@ UnicodeSet& UnicodeSet::clear(void) { if (isFrozen()) { return *this; } - list[0] = UNICODESET_HIGH; + if (list != NULL) { + list[0] = UNICODESET_HIGH; + } len = 1; releasePattern(); if (strings != NULL) { strings->removeAllElements(); } - // Remove bogus - fFlags = 0; + if (list != NULL && strings != NULL) { + // Remove bogus + fFlags = 0; + } return *this; } @@ -1417,6 +1445,10 @@ UChar32 UnicodeSet::getRangeEnd(int32_t return list[index*2 + 1] - 1; } +int32_t UnicodeSet::getStringCount() const { + return strings->size(); +} + const UnicodeString* UnicodeSet::getString(int32_t index) const { return (const UnicodeString*) strings->elementAt(index); } @@ -1430,32 +1462,22 @@ UnicodeSet& UnicodeSet::compact() { return *this; } // Delete buffer first to defragment memory less. - if (buffer != stackList) { + if (buffer != NULL) { uprv_free(buffer); buffer = NULL; - bufferCapacity = 0; } - if (list == stackList) { - // pass - } else if (len <= INITIAL_CAPACITY) { - uprv_memcpy(stackList, list, len * sizeof(UChar32)); - uprv_free(list); - list = stackList; - capacity = INITIAL_CAPACITY; - } else if ((len + 7) < capacity) { - // If we have more than a little unused capacity, shrink it to len. - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); + if (len < capacity) { + // Make the capacity equal to len or 1. + // We don't want to realloc of 0 size. + int32_t newCapacity = len + (len == 0); + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); if (temp) { list = temp; - capacity = len; + capacity = newCapacity; } // else what the heck happened?! We allocated less memory! // Oh well. We'll keep our original array. } - if (strings != nullptr && strings->isEmpty()) { - delete strings; - strings = nullptr; - } return *this; } @@ -1466,8 +1488,10 @@ UnicodeSet& UnicodeSet::compact() { /** * Deserialize constructor. */ -UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, - UErrorCode &ec) { +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) + : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) { if(U_FAILURE(ec)) { setToBogus(); @@ -1482,15 +1506,24 @@ UnicodeSet::UnicodeSet(const uint16_t da return; } + allocateStrings(ec); + if (U_FAILURE(ec)) { + setToBogus(); + return; + } + // bmp? int32_t headerSize = ((data[0]&0x8000)) ?2:1; int32_t bmpLength = (headerSize==1)?data[0]:data[1]; - int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; + len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; #ifdef DEBUG_SERIALIZE - printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); #endif - if(!ensureCapacity(newLength + 1)) { // +1 for HIGH + capacity = len+1; + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + if(!list || U_FAILURE(ec)) { + setToBogus(); return; } // copy bmp @@ -1502,18 +1535,15 @@ UnicodeSet::UnicodeSet(const uint16_t da #endif } // copy smp - for(i=bmpLength;i MAX_LENGTH) { - newCapacity = MAX_LENGTH; - } - return newCapacity; - } -} - -bool UnicodeSet::ensureCapacity(int32_t newLen) { - if (newLen > MAX_LENGTH) { - newLen = MAX_LENGTH; - } +void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { if (newLen <= capacity) { - return true; + return; } - int32_t newCapacity = nextCapacity(newLen); - UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); if (temp == NULL) { + ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); // set the object to bogus state if an OOM failure occurred. - return false; - } - // Copy only the actual contents. - uprv_memcpy(temp, list, len * sizeof(UChar32)); - if (list != stackList) { - uprv_free(list); + return; } list = temp; - capacity = newCapacity; - return true; + capacity = newLen + GROW_EXTRA; + // else we keep the original contents on the memory failure. } -bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { - if (newLen > MAX_LENGTH) { - newLen = MAX_LENGTH; - } - if (newLen <= bufferCapacity) { - return true; - } - int32_t newCapacity = nextCapacity(newLen); - UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); +void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { + if (buffer != NULL && newLen <= bufferCapacity) + return; + UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); if (temp == NULL) { + ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); - return false; - } - // The buffer has no contents to be copied. - // It is always filled from scratch after this call. - if (buffer != stackList) { - uprv_free(buffer); + return; } buffer = temp; - bufferCapacity = newCapacity; - return true; + bufferCapacity = newLen + GROW_EXTRA; + // else we keep the original contents on the memory failure. } /** @@ -1729,7 +1727,9 @@ void UnicodeSet::exclusiveOr(const UChar if (isFrozen() || isBogus()) { return; } - if (!ensureBufferCapacity(len + otherLen)) { + UErrorCode status = U_ZERO_ERROR; + ensureBufferCapacity(len + otherLen, status); + if (U_FAILURE(status)) { return; } @@ -1777,7 +1777,9 @@ void UnicodeSet::add(const UChar32* othe if (isFrozen() || isBogus() || other==NULL) { return; } - if (!ensureBufferCapacity(len + otherLen)) { + UErrorCode status = U_ZERO_ERROR; + ensureBufferCapacity(len + otherLen, status); + if (U_FAILURE(status)) { return; } @@ -1888,7 +1890,9 @@ void UnicodeSet::retain(const UChar32* o if (isFrozen() || isBogus()) { return; } - if (!ensureBufferCapacity(len + otherLen)) { + UErrorCode status = U_ZERO_ERROR; + ensureBufferCapacity(len + otherLen, status); + if (U_FAILURE(status)) { return; } @@ -2134,14 +2138,12 @@ UnicodeString& UnicodeSet::_generatePatt } } - if (strings != nullptr) { - for (int32_t i = 0; isize(); ++i) { - result.append(OPEN_BRACE); - _appendToPat(result, - *(const UnicodeString*) strings->elementAt(i), - escapeUnprintable); - result.append(CLOSE_BRACE); - } + for (int32_t i = 0; isize(); ++i) { + result.append(OPEN_BRACE); + _appendToPat(result, + *(const UnicodeString*) strings->elementAt(i), + escapeUnprintable); + result.append(CLOSE_BRACE); } return result.append(SET_CLOSE); } @@ -2160,12 +2162,13 @@ void UnicodeSet::releasePattern() { /** * Set the new pattern to cache. */ -void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { +void UnicodeSet::setPattern(const UnicodeString& newPat) { releasePattern(); + int32_t newPatLen = newPat.length(); pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); if (pat) { patLen = newPatLen; - u_memcpy(pat, newPat, patLen); + newPat.extractBetween(0, patLen, pat); pat[patLen] = 0; } // else we don't care if malloc failed. This was just a nice cache. @@ -2174,15 +2177,30 @@ void UnicodeSet::setPattern(const char16 UnicodeFunctor *UnicodeSet::freeze() { if(!isFrozen() && !isBogus()) { - compact(); + // Do most of what compact() does before freezing because + // compact() will not work when the set is frozen. + // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). + + // Delete buffer first to defragment memory less. + if (buffer != NULL) { + uprv_free(buffer); + buffer = NULL; + } + if (capacity > (len + GROW_EXTRA)) { + // Make the capacity equal to len or 1. + // We don't want to realloc of 0 size. + capacity = len + (len == 0); + list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); + if (list == NULL) { // Check for memory allocation error. + setToBogus(); + return this; + } + } // Optimize contains() and span() and similar functions. - if (hasStrings()) { + if (!strings->isEmpty()) { stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); - if (stringSpan == nullptr) { - setToBogus(); - return this; - } else if (!stringSpan->needsStringSpanUTF16()) { + if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { // All strings are irrelevant for span() etc. because // all of each string's code points are contained in this set. // Do not check needsStringSpanUTF8() because UTF-8 has at most as @@ -2215,7 +2233,7 @@ int32_t UnicodeSet::span(const UChar *s, } if(stringSpan!=NULL) { return stringSpan->span(s, length, spanCondition); - } else if(hasStrings()) { + } else if(!strings->isEmpty()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF16_CONTAINED; @@ -2252,7 +2270,7 @@ int32_t UnicodeSet::spanBack(const UChar } if(stringSpan!=NULL) { return stringSpan->spanBack(s, length, spanCondition); - } else if(hasStrings()) { + } else if(!strings->isEmpty()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF16_CONTAINED; @@ -2290,7 +2308,7 @@ int32_t UnicodeSet::spanUTF8(const char } if(stringSpan!=NULL) { return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); - } else if(hasStrings()) { + } else if(!strings->isEmpty()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF8_CONTAINED; @@ -2328,7 +2346,7 @@ int32_t UnicodeSet::spanBackUTF8(const c } if(stringSpan!=NULL) { return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); - } else if(hasStrings()) { + } else if(!strings->isEmpty()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF8_CONTAINED; diff -urp icu4c-63_2/icu/source/common/uniset_props.cpp icu4c-63_1/icu/source/common/uniset_props.cpp --- icu4c-63_2/icu/source/common/uniset_props.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/uniset_props.cpp 2018-10-02 00:39:56.000000000 +0200 @@ -47,6 +47,10 @@ U_NAMESPACE_USE +// initial storage. Must be >= 0 +// *** same as in uniset.cpp ! *** +#define START_EXTRA 16 + // Define UChar constants using hex for EBCDIC compatibility // Used #define to reduce private static exports and memory access time. #define SET_OPEN ((UChar)0x005B) /*[*/ @@ -181,8 +185,21 @@ isPOSIXClose(const UnicodeString &patter * @param pattern a string specifying what characters are in the set */ UnicodeSet::UnicodeSet(const UnicodeString& pattern, - UErrorCode& status) { - applyPattern(pattern, status); + UErrorCode& status) : + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), + fFlags(0) +{ + if(U_SUCCESS(status)){ + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); + /* test for NULL */ + if(list == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + }else{ + allocateStrings(status); + applyPattern(pattern, status); + } + } _dbgct(this); } @@ -696,11 +713,6 @@ static UBool numericValueFilter(UChar32 return u_getNumericValue(ch) == *(double*)context; } -static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { - int32_t value = *(int32_t*)context; - return (U_GET_GC_MASK((UChar32) ch) & value) != 0; -} - static UBool versionFilter(UChar32 ch, void* context) { static const UVersionInfo none = { 0, 0, 0, 0 }; UVersionInfo v; @@ -709,16 +721,6 @@ static UBool versionFilter(UChar32 ch, v return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; } -typedef struct { - UProperty prop; - int32_t value; -} IntPropertyContext; - -static UBool intPropertyFilter(UChar32 ch, void* context) { - IntPropertyContext* c = (IntPropertyContext*)context; - return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; -} - static UBool scriptExtensionsFilter(UChar32 ch, void* context) { return uscript_hasScript(ch, *(UScriptCode*)context); } @@ -779,6 +781,43 @@ void UnicodeSet::applyFilter(UnicodeSet: namespace { +/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ +uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { + uint32_t mask = *(const uint32_t *)context; + value = U_MASK(value) & mask; + if (value != 0) { value = 1; } + return value; +} + +/** Maps one map value to 1, all others to 0. */ +uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { + uint32_t v = *(const uint32_t *)context; + return value == v ? 1 : 0; +} + +} // namespace + +void UnicodeSet::applyIntPropertyValue(const UCPMap *map, + UCPMapValueFilter *filter, const void *context, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } + clear(); + UChar32 start = 0, end; + uint32_t value; + while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, + filter, context, &value)) >= 0) { + if (value != 0) { + add(start, end); + } + start = end + 1; + } + if (isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } +} + +namespace { + static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { /* Note: we use ' ' in compiler code page */ int32_t j = 0; @@ -806,10 +845,11 @@ static UBool mungeCharName(char* dst, co UnicodeSet& UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { - if (U_FAILURE(ec) || isFrozen()) { return *this; } + if (U_FAILURE(ec)) { return *this; } + // All of the following check isFrozen() before modifying this set. if (prop == UCHAR_GENERAL_CATEGORY_MASK) { - const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); - applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); + const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); + applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); UScriptCode script = (UScriptCode)value; @@ -826,11 +866,14 @@ UnicodeSet::applyIntPropertyValue(UPrope clear(); } } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { - const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); - IntPropertyContext c = {prop, value}; - applyFilter(intPropertyFilter, &c, inclusions, ec); + const UCPMap *map = u_getIntPropertyMap(prop, &ec); + applyIntPropertyValue(map, intValueFilter, &value, ec); } else { + // This code used to always call getInclusions(property source) + // which sets an error for an unsupported property. ec = U_ILLEGAL_ARGUMENT_ERROR; + // Otherwise we would just clear() this set because + // getIntPropertyValue(c, prop) returns 0 for all code points. } return *this; } diff -urp icu4c-63_2/icu/source/common/uprops.h icu4c-63_1/icu/source/common/uprops.h --- icu4c-63_2/icu/source/common/uprops.h 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/uprops.h 2018-10-02 00:39:56.000000000 +0200 @@ -462,6 +462,7 @@ class UnicodeSet; class CharacterProperties { public: CharacterProperties() = delete; + static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); }; diff -urp icu4c-63_2/icu/source/common/uset.cpp icu4c-63_1/icu/source/common/uset.cpp --- icu4c-63_2/icu/source/common/uset.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/uset.cpp 2018-09-29 02:34:41.000000000 +0200 @@ -249,7 +249,7 @@ class USetAccess /* not : public UObject public: /* Try to have the compiler inline these*/ inline static int32_t getStringCount(const UnicodeSet& set) { - return set.stringsSize(); + return set.getStringCount(); } inline static const UnicodeString* getString(const UnicodeSet& set, int32_t i) { diff -urp icu4c-63_2/icu/source/common/usetiter.cpp icu4c-63_1/icu/source/common/usetiter.cpp --- icu4c-63_2/icu/source/common/usetiter.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/common/usetiter.cpp 2018-09-29 02:34:41.000000000 +0200 @@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() { stringCount = 0; } else { endRange = set->getRangeCount() - 1; - stringCount = set->stringsSize(); + stringCount = set->strings->size(); } range = 0; endElement = -1; Binary files icu4c-63_2/icu/source/data/in/icudt63l.dat and icu4c-63_1/icu/source/data/in/icudt63l.dat differ diff -urp icu4c-63_2/icu/source/i18n/japancal.cpp icu4c-63_1/icu/source/i18n/japancal.cpp --- icu4c-63_2/icu/source/i18n/japancal.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/i18n/japancal.cpp 2018-10-02 00:39:56.000000000 +0200 @@ -18,16 +18,6 @@ #if !UCONFIG_NO_FORMATTING #if U_PLATFORM_HAS_WINUWP_API == 0 #include // getenv() is not available in UWP env -#else -#ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -#endif -# define VC_EXTRALEAN -# define NOUSER -# define NOSERVICE -# define NOIME -# define NOMCX -#include #endif #include "cmemory.h" #include "erarules.h" diff -urp icu4c-63_2/icu/source/i18n/unicode/numberrangeformatter.h icu4c-63_1/icu/source/i18n/unicode/numberrangeformatter.h --- icu4c-63_2/icu/source/i18n/unicode/numberrangeformatter.h 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/i18n/unicode/numberrangeformatter.h 2018-10-15 20:02:37.000000000 +0200 @@ -185,14 +185,8 @@ class NumberRangeFormatterImpl; * Export an explicit template instantiation. See datefmt.h * (When building DLLs for Windows this is required.) */ -#if U_PLATFORM == U_PF_WINDOWS && !defined(U_IN_DOXYGEN) -} // namespace icu::number -U_NAMESPACE_END - -template struct U_I18N_API std::atomic< U_NAMESPACE_QUALIFIER number::impl::NumberRangeFormatterImpl*>; - -U_NAMESPACE_BEGIN -namespace number { // icu::number +#if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN && !defined(U_IN_DOXYGEN) +template struct U_I18N_API std::atomic; #endif /** \endcond */ diff -urp icu4c-63_2/icu/source/i18n/uspoof.cpp icu4c-63_1/icu/source/i18n/uspoof.cpp --- icu4c-63_2/icu/source/i18n/uspoof.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/i18n/uspoof.cpp 2018-09-29 02:34:42.000000000 +0200 @@ -547,7 +547,7 @@ uspoof_checkUnicodeString(const USpoofCh return uspoof_check2UnicodeString(sc, id, NULL, status); } -static int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* checkResult, UErrorCode* status) { +int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* checkResult, UErrorCode* status) { U_ASSERT(This != NULL); U_ASSERT(checkResult != NULL); checkResult->clear(); diff -urp icu4c-63_2/icu/source/test/intltest/convtest.cpp icu4c-63_1/icu/source/test/intltest/convtest.cpp --- icu4c-63_2/icu/source/test/intltest/convtest.cpp 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/test/intltest/convtest.cpp 2018-09-29 02:34:42.000000000 +0200 @@ -606,7 +606,12 @@ ConversionTest::TestGetUnicodeSet2() { // First try to see if we have different sets because ucnv_getUnicodeSet() // added strings: The above conversion method does not tell us what strings might be convertible. // Remove strings from the set and compare again. - set.removeAllStrings(); + // Unfortunately, there are no good, direct set methods for finding out whether there are strings + // in the set, nor for enumerating or removing just them. + // Intersect all code points with the set. The intersection will not contain strings. + UnicodeSet temp(0, 0x10ffff); + temp.retainAll(set); + set=temp; } if(set!=expected) { UnicodeSet diffSet; diff -urp icu4c-63_2/icu/source/test/intltest/numbertest.h icu4c-63_1/icu/source/test/intltest/numbertest.h --- icu4c-63_2/icu/source/test/intltest/numbertest.h 2019-04-12 00:38:30.000000000 +0200 +++ icu4c-63_1/icu/source/test/intltest/numbertest.h 2018-10-02 00:39:56.000000000 +0200 @@ -10,7 +10,6 @@ #include "intltest.h" #include "number_affixutils.h" #include "numparse_stringsegment.h" -#include "numrange_impl.h" #include "unicode/locid.h" #include "unicode/numberformatter.h" #include "unicode/numberrangeformatter.h"