blob: 487d0c0985ff903bbe20ed3027eb21e6303c5272 [file] [log] [blame]
/*
**********************************************************************
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: bitset.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan15
* created by: Markus Scherer
*
* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
* using a folded bit set consisting of a 1k-entry index table and a
* compacted array of 64-bit words.
* Uses a simple hash table for compaction.
* Uses the original set for supplementary code points.
*/
#include "unicode/utypes.h"
#include "unicont.h"
/*
* Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
* Hashes 64-bit words and maps them to 16-bit integers which are
* assigned in order of new incoming words for subsequent storage
* in a contiguous array.
*/
struct BMPBitHash : public UObject {
int64_t keys[0x800]; // 2k
uint16_t values[0x800];
uint16_t reverse[0x400];
uint16_t count;
const int32_t prime=1301; // Less than 2k.
BMPBitHash() : count(0) {
// Fill values[] with 0xffff.
uprv_memset(values, 0xff, sizeof(values));
}
/*
* Map a key to an integer count.
* Map at most 1k=0x400 different keys with this data structure.
*/
uint16_t map(int64_t key) {
int32_t hash=(int32_t)(key>>55)&0x1ff;
hash^=(int32_t)(key>>44)&0x7ff;
hash^=(int32_t)(key>>33)&0x7ff;
hash^=(int32_t)(key>>22)&0x7ff;
hash^=(int32_t)(key>>11)&0x7ff;
hash^=(int32_t)key&0x7ff;
for(;;) {
if(values[hash]==0xffff) {
// Unused slot.
keys[hash]=key;
reverse[count]=hash;
return values[hash]=count++;
} else if(keys[hash]==key) {
// Found a slot with this key.
return values[hash];
} else {
// Used slot with a different key, move to another slot.
hash=(hash+prime)&0x7ff;
}
}
}
uint16_t countKeys() const { return count; }
/*
* Invert the hash map: Fill an array of length countKeys() with the keys
* indexed by their mapped values.
*/
void invert(int64_t *k) const {
uint16_t i;
for(i=0; i<count; ++i) {
k[i]=keys[reverse[i]];
}
}
};
class BitSet : public UObject, public UnicodeContainable {
public:
BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
if(U_FAILURE(errorCode)) {
return;
}
BMPBitHash *bitHash=new BMPBitHash;
if(bitHash==NULL || restSet==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
UnicodeSetIterator iter(set);
int64_t b;
UChar32 start, end;
int32_t prevIndex, i, j;
b=0; // Not necessary but makes compilers happy.
prevIndex=-1;
for(;;) {
if(iter.nextRange() && !iter.isString()) {
start=iter.getCodepoint();
end=iter.getCodepointEnd();
} else {
start=0x10000;
}
i=start>>6;
if(prevIndex!=i) {
// Finish the end of the previous range.
if(prevIndex<0) {
prevIndex=0;
} else {
index[prevIndex++]=bitHash->map(b);
}
// Fill all-zero entries between ranges.
if(prevIndex<i) {
uint16_t zero=bitHash->map(0);
do {
index[prevIndex++]=zero;
} while(prevIndex<i);
}
b=0;
}
if(start>0xffff) {
break;
}
b|=~((INT64_C(1)<<(start&0x3f))-1);
j=end>>6;
if(i<j) {
// Set bits for the start of the range.
index[i++]=bitHash->map(b);
// Fill all-one entries inside the range.
if(i<j) {
uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
do {
index[i++]=all;
} while(i<j);
}
b=INT64_C(0xffffffffffffffff);
}
/* i==j */
b&=(INT64_C(1)<<(end&0x3f))-1;
prevIndex=j;
}
if(bitHash->countKeys()>LENGTHOF(shortBits)) {
bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
}
if(bits!=NULL) {
bitHash->invert(bits);
} else {
bits=shortBits;
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
latin1Set[0]=(uint32_t)bits[0];
latin1Set[1]=(uint32_t)(bits[0]>>32);
latin1Set[2]=(uint32_t)bits[1];
latin1Set[3]=(uint32_t)(bits[1]>>32);
latin1Set[4]=(uint32_t)bits[2];
latin1Set[5]=(uint32_t)(bits[2]>>32);
latin1Set[6]=(uint32_t)bits[3];
latin1Set[7]=(uint32_t)(bits[3]>>32);
restSet.remove(0, 0xffff);
}
~BitSet() {
if(bits!=shortBits) {
uprv_free(bits);
}
delete restSet;
}
UBool contains(UChar32 c) const {
if((uint32_t)c<=0xff) {
return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
} else if((uint32_t)c<0xffff) {
return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
} else {
return restSet->contains(c);
}
}
private:
uint16_t index[0x400];
int64_t shortBits[32];
int64_t *bits;
uint32_t latin1Bits[8];
UnicodeSet *restSet;
};