| #include "./durchschlag.h" |
| |
| #include <algorithm> |
| #include <exception> /* terminate */ |
| |
| #include "divsufsort.h" |
| |
| /* Pointer to position in text. */ |
| typedef DurchschlagTextIdx TextIdx; |
| |
| /* (Sum of) value(s) of slice(s). */ |
| typedef uint32_t Score; |
| |
| typedef struct HashSlot { |
| TextIdx next; |
| TextIdx offset; |
| } HashSlot; |
| |
| typedef struct MetaSlot { |
| TextIdx mark; |
| Score score; |
| } MetaSlot; |
| |
| typedef struct Range { |
| TextIdx start; |
| TextIdx end; |
| } Range; |
| |
| typedef struct Candidate { |
| Score score; |
| TextIdx position; |
| } Candidate; |
| |
| struct greaterScore { |
| bool operator()(const Candidate& a, const Candidate& b) const { |
| return (a.score > b.score) || |
| ((a.score == b.score) && (a.position < b.position)); |
| } |
| }; |
| |
| struct lessScore { |
| bool operator()(const Candidate& a, const Candidate& b) const { |
| return (a.score < b.score) || |
| ((a.score == b.score) && (a.position > b.position)); |
| } |
| }; |
| |
| #define CANDIDATE_BUNDLE_SIZE (1 << 18) |
| |
| static void fatal(const char* error) { |
| fprintf(stderr, "%s\n", error); |
| std::terminate(); |
| } |
| |
| static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) { |
| TextIdx result = 0; |
| for (size_t i = 0; i < ranges.size(); ++i) { |
| const Range& r = ranges[i]; |
| result += r.end - r.start; |
| } |
| return result; |
| } |
| |
| static std::string createDictionary( |
| const uint8_t* data, const std::vector<Range>& ranges, size_t limit) { |
| std::string output; |
| output.reserve(calculateDictionarySize(ranges)); |
| for (size_t i = 0; i < ranges.size(); ++i) { |
| const Range& r = ranges[i]; |
| output.insert(output.end(), &data[r.start], &data[r.end]); |
| } |
| if (output.size() > limit) { |
| output.resize(limit); |
| } |
| return output; |
| } |
| |
| /* precondition: span > 0 |
| precondition: end + span == len(shortcut) */ |
| static Score buildCandidatesList(std::vector<Candidate>* candidates, |
| std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut, |
| TextIdx end) { |
| candidates->resize(0); |
| |
| size_t n = map->size(); |
| MetaSlot* slots = map->data(); |
| for (size_t j = 0; j < n; ++j) { |
| slots[j].mark = 0; |
| } |
| |
| Score score = 0; |
| /* Consider the whole span, except one last item. The following loop will |
| add the last item to the end of the "chain", evaluate it, and cut one |
| "link" form the beginning. */ |
| for (size_t j = 0; j < span - 1; ++j) { |
| MetaSlot& item = slots[shortcut[j]]; |
| if (item.mark == 0) { |
| score += item.score; |
| } |
| item.mark++; |
| } |
| |
| TextIdx i = 0; |
| TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE); |
| Score maxScore = 0; |
| for (; i < limit; ++i) { |
| TextIdx slice = shortcut[i + span - 1]; |
| MetaSlot& pick = slots[slice]; |
| if (pick.mark == 0) { |
| score += pick.score; |
| } |
| pick.mark++; |
| |
| if (score > maxScore) { |
| maxScore = score; |
| } |
| candidates->push_back({score, i}); |
| |
| MetaSlot& drop = slots[shortcut[i]]; |
| drop.mark--; |
| if (drop.mark == 0) { |
| score -= drop.score; |
| } |
| } |
| |
| std::make_heap(candidates->begin(), candidates->end(), greaterScore()); |
| Score minScore = candidates->at(0).score; |
| for (; i < end; ++i) { |
| TextIdx slice = shortcut[i + span - 1]; |
| MetaSlot& pick = slots[slice]; |
| if (pick.mark == 0) { |
| score += pick.score; |
| } |
| pick.mark++; |
| |
| if (score > maxScore) { |
| maxScore = score; |
| } |
| if (score >= minScore) { |
| candidates->push_back({score, i}); |
| std::push_heap(candidates->begin(), candidates->end(), greaterScore()); |
| if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) { |
| while (candidates->at(0).score == minScore) { |
| std::pop_heap(candidates->begin(), candidates->end(), greaterScore()); |
| candidates->pop_back(); |
| } |
| minScore = candidates->at(0).score; |
| } |
| } |
| |
| MetaSlot& drop = slots[shortcut[i]]; |
| drop.mark--; |
| if (drop.mark == 0) { |
| score -= drop.score; |
| } |
| } |
| |
| for (size_t j = 0; j < n; ++j) { |
| slots[j].mark = 0; |
| } |
| |
| std::make_heap(candidates->begin(), candidates->end(), lessScore()); |
| return minScore; |
| } |
| |
| /* precondition: span > 0 |
| precondition: end + span == len(shortcut) */ |
| static Score rebuildCandidatesList(std::vector<TextIdx>* candidates, |
| std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut, |
| TextIdx end, TextIdx* next) { |
| size_t n = candidates->size(); |
| TextIdx* data = candidates->data(); |
| for (size_t i = 0; i < n; ++i) { |
| data[i] = 0; |
| } |
| |
| n = map->size(); |
| MetaSlot* slots = map->data(); |
| for (size_t i = 0; i < n; ++i) { |
| slots[i].mark = 0; |
| } |
| |
| Score score = 0; |
| /* Consider the whole span, except one last item. The following loop will |
| add the last item to the end of the "chain", evaluate it, and cut one |
| "link" form the beginning. */ |
| for (TextIdx i = 0; i < span - 1; ++i) { |
| MetaSlot& item = slots[shortcut[i]]; |
| if (item.mark == 0) { |
| score += item.score; |
| } |
| item.mark++; |
| } |
| |
| Score maxScore = 0; |
| for (TextIdx i = 0; i < end; ++i) { |
| MetaSlot& pick = slots[shortcut[i + span - 1]]; |
| if (pick.mark == 0) { |
| score += pick.score; |
| } |
| pick.mark++; |
| |
| if (candidates->size() <= score) { |
| candidates->resize(score + 1); |
| } |
| if (score > maxScore) { |
| maxScore = score; |
| } |
| next[i] = candidates->at(score); |
| candidates->at(score) = i; |
| |
| MetaSlot& drop = slots[shortcut[i]]; |
| drop.mark--; |
| if (drop.mark == 0) { |
| score -= drop.score; |
| } |
| } |
| |
| for (size_t i = 0; i < n; ++i) { |
| slots[i].mark = 0; |
| } |
| |
| candidates->resize(maxScore + 1); |
| return maxScore; |
| } |
| |
| static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) { |
| for (auto it = ranges->begin(); it != ranges->end();) { |
| if (end < it->start) { |
| ranges->insert(it, {start, end}); |
| return; |
| } |
| if (it->end < start) { |
| it++; |
| continue; |
| } |
| // Combine with existing. |
| start = std::min(start, it->start); |
| end = std::max(end, it->end); |
| // Remove consumed vector and continue. |
| it = ranges->erase(it); |
| } |
| ranges->push_back({start, end}); |
| } |
| |
| std::string durchschlag_generate( |
| size_t dictionary_size_limit, size_t slice_len, size_t block_len, |
| const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { |
| DurchschlagContext ctx = durchschlag_prepare( |
| slice_len, sample_sizes, sample_data); |
| return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE, |
| dictionary_size_limit, block_len, ctx, sample_data); |
| } |
| |
| DurchschlagContext durchschlag_prepare(size_t slice_len, |
| const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { |
| /* Parameters aliasing */ |
| TextIdx sliceLen = static_cast<TextIdx>(slice_len); |
| if (sliceLen != slice_len) fatal("slice_len is too large"); |
| if (sliceLen < 1) fatal("slice_len is too small"); |
| const uint8_t* data = sample_data; |
| |
| TextIdx total = 0; |
| std::vector<TextIdx> offsets; |
| offsets.reserve(sample_sizes.size()); |
| for (size_t i = 0; i < sample_sizes.size(); ++i) { |
| TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); |
| if (delta != sample_sizes[i]) fatal("sample is too large"); |
| if (delta == 0) fatal("0-length samples are prohibited"); |
| TextIdx next_total = total + delta; |
| if (next_total <= total) fatal("corpus is too large"); |
| total = next_total; |
| offsets.push_back(total); |
| } |
| |
| if (total < sliceLen) fatal("slice_len is larger than corpus size"); |
| TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1; |
| TextIdx hashLen = 11; |
| while (hashLen < 29 && ((1u << hashLen) < end)) { |
| hashLen += 3; |
| } |
| hashLen -= 3; |
| TextIdx hashMask = (1u << hashLen) - 1u; |
| std::vector<TextIdx> hashHead(1 << hashLen); |
| TextIdx hash = 0; |
| TextIdx lShift = 3; |
| TextIdx rShift = hashLen - lShift; |
| for (TextIdx i = 0; i < sliceLen - 1; ++i) { |
| TextIdx v = data[i]; |
| hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; |
| } |
| TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; |
| TextIdx rShiftX = hashLen - lShiftX; |
| |
| std::vector<HashSlot> map; |
| map.push_back({0, 0}); |
| TextIdx hashSlot = 1; |
| std::vector<TextIdx> sliceMap; |
| sliceMap.reserve(end); |
| for (TextIdx i = 0; i < end; ++i) { |
| TextIdx v = data[i + sliceLen - 1]; |
| TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; |
| v = data[i]; |
| hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask); |
| TextIdx slot = hashHead[bucket]; |
| while (slot != 0) { |
| HashSlot& item = map[slot]; |
| TextIdx start = item.offset; |
| bool miss = false; |
| for (TextIdx j = 0; j < sliceLen; ++j) { |
| if (data[i + j] != data[start + j]) { |
| miss = true; |
| break; |
| } |
| } |
| if (!miss) { |
| sliceMap.push_back(slot); |
| break; |
| } |
| slot = item.next; |
| } |
| if (slot == 0) { |
| map.push_back({hashHead[bucket], i}); |
| hashHead[bucket] = hashSlot; |
| sliceMap.push_back(hashSlot); |
| hashSlot++; |
| } |
| } |
| |
| return {total, sliceLen, static_cast<TextIdx>(map.size()), |
| std::move(offsets), std::move(sliceMap)}; |
| } |
| |
| DurchschlagContext durchschlag_prepare(size_t slice_len, |
| const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) { |
| /* Parameters aliasing */ |
| TextIdx sliceLen = static_cast<TextIdx>(slice_len); |
| if (sliceLen != slice_len) fatal("slice_len is too large"); |
| if (sliceLen < 1) fatal("slice_len is too small"); |
| const TextIdx* lcp = index.lcp.data(); |
| const TextIdx* sa = index.sa.data(); |
| |
| TextIdx total = 0; |
| std::vector<TextIdx> offsets; |
| offsets.reserve(sample_sizes.size()); |
| for (size_t i = 0; i < sample_sizes.size(); ++i) { |
| TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); |
| if (delta != sample_sizes[i]) fatal("sample is too large"); |
| if (delta == 0) fatal("0-length samples are prohibited"); |
| TextIdx next_total = total + delta; |
| if (next_total <= total) fatal("corpus is too large"); |
| total = next_total; |
| offsets.push_back(total); |
| } |
| |
| if (total < sliceLen) fatal("slice_len is larger than corpus size"); |
| TextIdx counter = 1; |
| TextIdx end = total - sliceLen + 1; |
| std::vector<TextIdx> sliceMap(total); |
| TextIdx last = 0; |
| TextIdx current = 1; |
| while (current <= total) { |
| if (lcp[current - 1] < sliceLen) { |
| for (TextIdx i = last; i < current; ++i) { |
| sliceMap[sa[i]] = counter; |
| } |
| counter++; |
| last = current; |
| } |
| current++; |
| } |
| sliceMap.resize(end); |
| |
| // Reorder items for the better locality. |
| std::vector<TextIdx> reorder(counter); |
| counter = 1; |
| for (TextIdx i = 0; i < end; ++i) { |
| if (reorder[sliceMap[i]] == 0) { |
| reorder[sliceMap[i]] = counter++; |
| } |
| } |
| for (TextIdx i = 0; i < end; ++i) { |
| sliceMap[i] = reorder[sliceMap[i]]; |
| } |
| |
| return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)}; |
| } |
| |
| DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) { |
| TextIdx total = static_cast<TextIdx>(data.size()); |
| if (total != data.size()) fatal("corpus is too large"); |
| saidx_t saTotal = static_cast<saidx_t>(total); |
| if (saTotal < 0) fatal("corpus is too large"); |
| if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large"); |
| std::vector<TextIdx> sa(total); |
| /* Hopefully, non-negative int32_t values match TextIdx ones. */ |
| if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch"); |
| int32_t* saData = reinterpret_cast<int32_t*>(sa.data()); |
| divsufsort(data.data(), saData, saTotal); |
| |
| std::vector<TextIdx> isa(total); |
| for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i; |
| |
| // TODO: borrowed -> unknown efficiency. |
| std::vector<TextIdx> lcp(total); |
| TextIdx k = 0; |
| lcp[total - 1] = 0; |
| for (TextIdx i = 0; i < total; ++i) { |
| TextIdx current = isa[i]; |
| if (current == total - 1) { |
| k = 0; |
| continue; |
| } |
| TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix. |
| while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) { |
| ++k; |
| } |
| lcp[current] = k; |
| if (k > 0) --k; |
| } |
| |
| return {std::move(lcp), std::move(sa)}; |
| } |
| |
| static void ScoreSlices(const std::vector<TextIdx>& offsets, |
| std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) { |
| TextIdx piece = 0; |
| /* Fresh map contains all zeroes -> initial mark should be different. */ |
| TextIdx mark = 1; |
| for (TextIdx i = 0; i < end; ++i) { |
| if (offsets[piece] == i) { |
| piece++; |
| mark++; |
| } |
| MetaSlot& item = map[shortcut[i]]; |
| if (item.mark != mark) { |
| item.mark = mark; |
| item.score++; |
| } |
| } |
| } |
| |
| static std::string durchschlagGenerateExclusive( |
| size_t dictionary_size_limit, size_t block_len, |
| const DurchschlagContext& context, const uint8_t* sample_data) { |
| /* Parameters aliasing */ |
| TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); |
| if (targetSize != dictionary_size_limit) { |
| fprintf(stderr, "dictionary_size_limit is too large\n"); |
| return ""; |
| } |
| TextIdx sliceLen = context.sliceLen; |
| TextIdx total = context.dataSize; |
| TextIdx blockLen = static_cast<TextIdx>(block_len); |
| if (blockLen != block_len) { |
| fprintf(stderr, "block_len is too large\n"); |
| return ""; |
| } |
| const uint8_t* data = sample_data; |
| const std::vector<TextIdx>& offsets = context.offsets; |
| std::vector<MetaSlot> map(context.numUniqueSlices); |
| const TextIdx* shortcut = context.sliceMap.data(); |
| |
| /* Initialization */ |
| if (blockLen < sliceLen) { |
| fprintf(stderr, "sliceLen is larger than block_len\n"); |
| return ""; |
| } |
| if (targetSize < blockLen || total < blockLen) { |
| fprintf(stderr, "block_len is too large\n"); |
| return ""; |
| } |
| TextIdx end = total - sliceLen + 1; |
| ScoreSlices(offsets, map, shortcut, end); |
| TextIdx span = blockLen - sliceLen + 1; |
| end = static_cast<TextIdx>(context.sliceMap.size()) - span; |
| std::vector<TextIdx> candidates; |
| std::vector<TextIdx> next(end); |
| Score maxScore = rebuildCandidatesList( |
| &candidates, &map, span, shortcut, end, next.data()); |
| |
| /* Block selection */ |
| const size_t triesLimit = (600 * 1000000) / span; |
| const size_t candidatesLimit = (150 * 1000000) / span; |
| std::vector<Range> ranges; |
| TextIdx mark = 0; |
| size_t numTries = 0; |
| while (true) { |
| TextIdx dictSize = calculateDictionarySize(ranges); |
| size_t numCandidates = 0; |
| if (dictSize > targetSize - blockLen) { |
| break; |
| } |
| if (maxScore == 0) { |
| break; |
| } |
| while (true) { |
| TextIdx candidate = 0; |
| while (maxScore > 0) { |
| if (candidates[maxScore] != 0) { |
| candidate = candidates[maxScore]; |
| candidates[maxScore] = next[candidate]; |
| break; |
| } |
| maxScore--; |
| } |
| if (maxScore == 0) { |
| break; |
| } |
| mark++; |
| numTries++; |
| numCandidates++; |
| Score score = 0; |
| for (size_t j = candidate; j < candidate + span; ++j) { |
| MetaSlot& item = map[shortcut[j]]; |
| if (item.mark != mark) { |
| score += item.score; |
| item.mark = mark; |
| } |
| } |
| if (score < maxScore) { |
| if (numTries < triesLimit && numCandidates < candidatesLimit) { |
| next[candidate] = candidates[score]; |
| candidates[score] = candidate; |
| } else { |
| maxScore = rebuildCandidatesList( |
| &candidates, &map, span, shortcut, end, next.data()); |
| mark = 0; |
| numTries = 0; |
| numCandidates = 0; |
| } |
| continue; |
| } else if (score > maxScore) { |
| fprintf(stderr, "Broken invariant\n"); |
| return ""; |
| } |
| for (TextIdx j = candidate; j < candidate + span; ++j) { |
| MetaSlot& item = map[shortcut[j]]; |
| item.score = 0; |
| } |
| addRange(&ranges, candidate, candidate + blockLen); |
| break; |
| } |
| } |
| |
| return createDictionary(data, ranges, targetSize); |
| } |
| |
| static std::string durchschlagGenerateCollaborative( |
| size_t dictionary_size_limit, size_t block_len, |
| const DurchschlagContext& context, const uint8_t* sample_data) { |
| /* Parameters aliasing */ |
| TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); |
| if (targetSize != dictionary_size_limit) { |
| fprintf(stderr, "dictionary_size_limit is too large\n"); |
| return ""; |
| } |
| TextIdx sliceLen = context.sliceLen; |
| TextIdx total = context.dataSize; |
| TextIdx blockLen = static_cast<TextIdx>(block_len); |
| if (blockLen != block_len) { |
| fprintf(stderr, "block_len is too large\n"); |
| return ""; |
| } |
| const uint8_t* data = sample_data; |
| const std::vector<TextIdx>& offsets = context.offsets; |
| std::vector<MetaSlot> map(context.numUniqueSlices); |
| const TextIdx* shortcut = context.sliceMap.data(); |
| |
| /* Initialization */ |
| if (blockLen < sliceLen) { |
| fprintf(stderr, "sliceLen is larger than block_len\n"); |
| return ""; |
| } |
| if (targetSize < blockLen || total < blockLen) { |
| fprintf(stderr, "block_len is too large\n"); |
| return ""; |
| } |
| TextIdx end = total - sliceLen + 1; |
| ScoreSlices(offsets, map, shortcut, end); |
| TextIdx span = blockLen - sliceLen + 1; |
| end = static_cast<TextIdx>(context.sliceMap.size()) - span; |
| std::vector<Candidate> candidates; |
| candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024); |
| Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); |
| |
| /* Block selection */ |
| std::vector<Range> ranges; |
| TextIdx mark = 0; |
| while (true) { |
| TextIdx dictSize = calculateDictionarySize(ranges); |
| if (dictSize > targetSize - blockLen) { |
| break; |
| } |
| if (minScore == 0 && candidates.empty()) { |
| break; |
| } |
| while (true) { |
| if (candidates.empty()) { |
| minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); |
| mark = 0; |
| } |
| TextIdx candidate = candidates[0].position; |
| Score expectedScore = candidates[0].score; |
| if (expectedScore == 0) { |
| candidates.resize(0); |
| break; |
| } |
| std::pop_heap(candidates.begin(), candidates.end(), lessScore()); |
| candidates.pop_back(); |
| mark++; |
| Score score = 0; |
| for (TextIdx j = candidate; j < candidate + span; ++j) { |
| MetaSlot& item = map[shortcut[j]]; |
| if (item.mark != mark) { |
| score += item.score; |
| item.mark = mark; |
| } |
| } |
| if (score < expectedScore) { |
| if (score >= minScore) { |
| candidates.push_back({score, candidate}); |
| std::push_heap(candidates.begin(), candidates.end(), lessScore()); |
| } |
| continue; |
| } else if (score > expectedScore) { |
| fatal("Broken invariant"); |
| } |
| for (TextIdx j = candidate; j < candidate + span; ++j) { |
| MetaSlot& item = map[shortcut[j]]; |
| item.score = 0; |
| } |
| addRange(&ranges, candidate, candidate + blockLen); |
| break; |
| } |
| } |
| |
| return createDictionary(data, ranges, targetSize); |
| } |
| |
| std::string durchschlag_generate(DurchschalgResourceStrategy strategy, |
| size_t dictionary_size_limit, size_t block_len, |
| const DurchschlagContext& context, const uint8_t* sample_data) { |
| if (strategy == DURCHSCHLAG_COLLABORATIVE) { |
| return durchschlagGenerateCollaborative( |
| dictionary_size_limit, block_len, context, sample_data); |
| } else { |
| return durchschlagGenerateExclusive( |
| dictionary_size_limit, block_len, context, sample_data); |
| } |
| } |
| |
| void durchschlag_distill(size_t slice_len, size_t minimum_population, |
| std::vector<size_t>* sample_sizes, uint8_t* sample_data) { |
| /* Parameters aliasing */ |
| uint8_t* data = sample_data; |
| |
| /* Build slice map. */ |
| DurchschlagContext context = durchschlag_prepare( |
| slice_len, *sample_sizes, data); |
| |
| /* Calculate slice population. */ |
| const std::vector<TextIdx>& offsets = context.offsets; |
| std::vector<MetaSlot> map(context.numUniqueSlices); |
| const TextIdx* shortcut = context.sliceMap.data(); |
| TextIdx sliceLen = context.sliceLen; |
| TextIdx total = context.dataSize; |
| TextIdx end = total - sliceLen + 1; |
| ScoreSlices(offsets, map, shortcut, end); |
| |
| /* Condense samples, omitting unique slices. */ |
| TextIdx readPos = 0; |
| TextIdx writePos = 0; |
| TextIdx lastNonUniquePos = 0; |
| for (TextIdx i = 0; i < sample_sizes->size(); ++i) { |
| TextIdx sampleStart = writePos; |
| TextIdx oldSampleEnd = |
| readPos + static_cast<TextIdx>(sample_sizes->at(i)); |
| while (readPos < oldSampleEnd) { |
| if (readPos < end) { |
| MetaSlot& item = map[shortcut[readPos]]; |
| if (item.score >= minimum_population) { |
| lastNonUniquePos = readPos + sliceLen; |
| } |
| } |
| if (readPos < lastNonUniquePos) { |
| data[writePos++] = data[readPos]; |
| } |
| readPos++; |
| } |
| sample_sizes->at(i) = writePos - sampleStart; |
| } |
| } |
| |
| void durchschlag_purify(size_t slice_len, size_t minimum_population, |
| const std::vector<size_t>& sample_sizes, uint8_t* sample_data) { |
| /* Parameters aliasing */ |
| uint8_t* data = sample_data; |
| |
| /* Build slice map. */ |
| DurchschlagContext context = durchschlag_prepare( |
| slice_len, sample_sizes, data); |
| |
| /* Calculate slice population. */ |
| const std::vector<TextIdx>& offsets = context.offsets; |
| std::vector<MetaSlot> map(context.numUniqueSlices); |
| const TextIdx* shortcut = context.sliceMap.data(); |
| TextIdx sliceLen = context.sliceLen; |
| TextIdx total = context.dataSize; |
| TextIdx end = total - sliceLen + 1; |
| ScoreSlices(offsets, map, shortcut, end); |
| |
| /* Rewrite samples, zeroing out unique slices. */ |
| TextIdx lastNonUniquePos = 0; |
| for (TextIdx readPos = 0; readPos < total; ++readPos) { |
| if (readPos < end) { |
| MetaSlot& item = map[shortcut[readPos]]; |
| if (item.score >= minimum_population) { |
| lastNonUniquePos = readPos + sliceLen; |
| } |
| } |
| if (readPos >= lastNonUniquePos) { |
| data[readPos] = 0; |
| } |
| } |
| } |