| #include "sieve.h" | 
 |  | 
 | /* Pointer to position in (combined corpus) text. */ | 
 | typedef uint32_t TextIdx; | 
 |  | 
 | /* Index of sample / generation. */ | 
 | typedef uint16_t SampleIdx; | 
 |  | 
 | typedef struct Slot { | 
 |   TextIdx next; | 
 |   TextIdx offset; | 
 |   SampleIdx presence; | 
 |   SampleIdx mark; | 
 | } Slot; | 
 |  | 
 | static const TextIdx kNowhere = static_cast<TextIdx>(-1); | 
 |  | 
 | static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut, | 
 |     TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) { | 
 |   TextIdx from = kNowhere; | 
 |   TextIdx to = kNowhere; | 
 |   TextIdx result = 0; | 
 |   SampleIdx targetPresence = minPresence; | 
 |   for (TextIdx i = 0; i < end; ++i) { | 
 |     if (i == middle) { | 
 |       targetPresence++; | 
 |     } | 
 |     Slot& item = map[shortcut[i]]; | 
 |     if (item.mark != iteration) { | 
 |       item.mark = iteration; | 
 |       if (item.presence >= targetPresence) { | 
 |         if ((to == kNowhere) || (to < i)) { | 
 |           if (from != kNowhere) { | 
 |             result += to - from; | 
 |           } | 
 |           from = i; | 
 |         } | 
 |         to = i + sliceLen; | 
 |       } | 
 |     } | 
 |   } | 
 |   if (from != kNowhere) { | 
 |     result += to - from; | 
 |   } | 
 |   return result; | 
 | } | 
 |  | 
 | static std::string createDictionary(const uint8_t* data, TextIdx sliceLen, | 
 |     Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle, | 
 |     SampleIdx minPresence, SampleIdx iteration) { | 
 |   std::string output; | 
 |   TextIdx from = kNowhere; | 
 |   TextIdx to = kNowhere; | 
 |   SampleIdx targetPresence = minPresence; | 
 |   for (TextIdx i = 0; i < end; ++i) { | 
 |     if (i == middle) { | 
 |       targetPresence++; | 
 |     } | 
 |     Slot& item = map[shortcut[i]]; | 
 |     if (item.mark != iteration) { | 
 |       item.mark = iteration; | 
 |       if (item.presence >= targetPresence) { | 
 |         if ((to == kNowhere) || (to < i)) { | 
 |           if (from != kNowhere) { | 
 |             output.insert(output.end(), &data[from], &data[to]); | 
 |           } | 
 |           from = i; | 
 |         } | 
 |         to = i + sliceLen; | 
 |       } | 
 |     } | 
 |   } | 
 |   if (from != kNowhere) { | 
 |     output.insert(output.end(), &data[from], &data[to]); | 
 |   } | 
 |   return output; | 
 | } | 
 |  | 
 | std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, | 
 |     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { | 
 |   /* Parameters aliasing */ | 
 |   TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); | 
 |   if (targetSize != dictionary_size_limit) { | 
 |     fprintf(stderr, "dictionary_size_limit is too large\n"); | 
 |     return ""; | 
 |   } | 
 |   TextIdx sliceLen = static_cast<TextIdx>(slice_len); | 
 |   if (sliceLen != slice_len) { | 
 |     fprintf(stderr, "slice_len is too large\n"); | 
 |     return ""; | 
 |   } | 
 |   if (sliceLen < 1) { | 
 |     fprintf(stderr, "slice_len is too small\n"); | 
 |     return ""; | 
 |   } | 
 |   SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size()); | 
 |   if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) { | 
 |     fprintf(stderr, "too many samples\n"); | 
 |     return ""; | 
 |   } | 
 |   const uint8_t* data = sample_data; | 
 |  | 
 |   TextIdx total = 0; | 
 |   std::vector<TextIdx> offsets; | 
 |   for (SampleIdx i = 0; i < numSamples; ++i) { | 
 |     TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); | 
 |     if (delta != sample_sizes[i]) { | 
 |       fprintf(stderr, "sample is too large\n"); | 
 |       return ""; | 
 |     } | 
 |     if (delta == 0) { | 
 |       fprintf(stderr, "empty samples are prohibited\n"); | 
 |       return ""; | 
 |     } | 
 |     if (total + delta <= total) { | 
 |       fprintf(stderr, "corpus is too large\n"); | 
 |       return ""; | 
 |     } | 
 |     total += delta; | 
 |     offsets.push_back(total); | 
 |   } | 
 |  | 
 |   if (total * 2 < total) { | 
 |     fprintf(stderr, "corpus is too large\n"); | 
 |     return ""; | 
 |   } | 
 |  | 
 |   if (total < sliceLen) { | 
 |     fprintf(stderr, "slice_len is larger than corpus size\n"); | 
 |     return ""; | 
 |   } | 
 |  | 
 |   /***************************************************************************** | 
 |    * Build coverage map. | 
 |    ****************************************************************************/ | 
 |   std::vector<Slot> map; | 
 |   std::vector<TextIdx> shortcut; | 
 |   map.push_back({0, 0, 0, 0}); | 
 |   TextIdx end = total - sliceLen; | 
 |   TextIdx hashLen = 11; | 
 |   while (hashLen < 29 && ((1u << hashLen) < end)) { | 
 |     hashLen += 3; | 
 |   } | 
 |   hashLen -= 3; | 
 |   TextIdx hashMask = (1u << hashLen) - 1u; | 
 |   std::vector<TextIdx> hashHead(1 << hashLen); | 
 |   TextIdx hashSlot = 1; | 
 |   SampleIdx piece = 0; | 
 |   TextIdx hash = 0; | 
 |   TextIdx lShift = 3; | 
 |   TextIdx rShift = hashLen - lShift; | 
 |   for (TextIdx i = 0; i < sliceLen - 1; ++i) { | 
 |     TextIdx v = data[i]; | 
 |     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; | 
 |   } | 
 |   TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; | 
 |   TextIdx rShiftX = hashLen - lShiftX; | 
 |   for (TextIdx i = 0; i < end; ++i) { | 
 |     TextIdx v = data[i + sliceLen - 1]; | 
 |     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; | 
 |  | 
 |     if (offsets[piece] == i) { | 
 |       piece++; | 
 |     } | 
 |     TextIdx slot = hashHead[hash]; | 
 |     while (slot != 0) { | 
 |       Slot& item = map[slot]; | 
 |       TextIdx start = item.offset; | 
 |       bool miss = false; | 
 |       for (TextIdx j = 0; j < sliceLen; ++j) { | 
 |         if (data[i + j] != data[start + j]) { | 
 |           miss = true; | 
 |           break; | 
 |         } | 
 |       } | 
 |       if (!miss) { | 
 |         if (item.mark != piece) { | 
 |           item.mark = piece; | 
 |           item.presence++; | 
 |         } | 
 |         shortcut.push_back(slot); | 
 |         break; | 
 |       } | 
 |       slot = item.next; | 
 |     } | 
 |     if (slot == 0) { | 
 |       map.push_back({hashHead[hash], i, 1, piece}); | 
 |       hashHead[hash] = hashSlot; | 
 |       shortcut.push_back(hashSlot); | 
 |       hashSlot++; | 
 |     } | 
 |     v = data[i]; | 
 |     hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask; | 
 |   } | 
 |  | 
 |   /***************************************************************************** | 
 |    * Build dictionary of specified size. | 
 |    ****************************************************************************/ | 
 |   SampleIdx a = 1; | 
 |   TextIdx size = dryRun( | 
 |       sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); | 
 |   /* Maximal output is smaller than target. */ | 
 |   if (size <= targetSize) { | 
 |     return createDictionary( | 
 |         data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); | 
 |   } | 
 |  | 
 |   SampleIdx b = numSamples; | 
 |   size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); | 
 |   if (size == targetSize) { | 
 |     return createDictionary( | 
 |         data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); | 
 |   } | 
 |   /* Run binary search. */ | 
 |   if (size < targetSize) { | 
 |     /* size(a) > targetSize > size(b) && a < m < b */ | 
 |     while (a + 1 < b) { | 
 |       SampleIdx m = static_cast<SampleIdx>((a + b) / 2); | 
 |       size = dryRun( | 
 |           sliceLen, map.data(), shortcut.data(), end, end, m, ++piece); | 
 |       if (size < targetSize) { | 
 |         b = m; | 
 |       } else if (size > targetSize) { | 
 |         a = m; | 
 |       } else { | 
 |         return createDictionary( | 
 |             data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); | 
 |       } | 
 |     } | 
 |   } else { | 
 |     a = b; | 
 |   } | 
 |   /* size(minPresence) > targetSize > size(minPresence + 1) */ | 
 |   SampleIdx minPresence = a; | 
 |   TextIdx c = 0; | 
 |   TextIdx d = end; | 
 |   /* size(a) < targetSize < size(b) && a < m < b */ | 
 |   while (c + 1 < d) { | 
 |     TextIdx m = (c + d) / 2; | 
 |     size = dryRun( | 
 |         sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); | 
 |     if (size < targetSize) { | 
 |       c = m; | 
 |     } else if (size > targetSize) { | 
 |       d = m; | 
 |     } else { | 
 |       return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, | 
 |           m, minPresence, ++piece); | 
 |     } | 
 |   } | 
 |  | 
 |   bool unrestricted = false; | 
 |   if (minPresence <= 2 && !unrestricted) { | 
 |     minPresence = 2; | 
 |     c = end; | 
 |   } | 
 |   return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c, | 
 |       minPresence, ++piece); | 
 | } |