| #include <climits> | 
 | #include <cstddef> | 
 | #include <cstdio> | 
 | #include <cstring> | 
 | #include <fstream> | 
 | #if !defined(_MSC_VER) | 
 | #include <glob.h> | 
 | #endif | 
 | #include <vector> | 
 |  | 
 | #include "deorummolae.h" | 
 | #include "durchschlag.h" | 
 | #include "sieve.h" | 
 |  | 
 | /* This isn't a definitive list of "--foo" arguments, only those that take an | 
 |  * additional "=#" integer parameter, like "--foo=20" or "--foo=32K". | 
 |  */ | 
 | #define LONG_ARG_BLOCK_LEN "--block_len=" | 
 | #define LONG_ARG_SLICE_LEN "--slice_len=" | 
 | #define LONG_ARG_TARGET_DICT_LEN "--target_dict_len=" | 
 | #define LONG_ARG_MIN_SLICE_POP "--min_slice_pop=" | 
 | #define LONG_ARG_CHUNK_LEN "--chunk_len=" | 
 | #define LONG_ARG_OVERLAP_LEN "--overlap_len=" | 
 |  | 
 | #define METHOD_DM 0 | 
 | #define METHOD_SIEVE 1 | 
 | #define METHOD_DURCHSCHLAG 2 | 
 | #define METHOD_DISTILL 3 | 
 | #define METHOD_PURIFY 4 | 
 |  | 
 | static size_t readInt(const char* str) { | 
 |   size_t result = 0; | 
 |   if (str[0] == 0 || str[0] == '0') { | 
 |     return 0; | 
 |   } | 
 |   for (size_t i = 0; i < 13; ++i) { | 
 |     if (str[i] == 0) { | 
 |       return result; | 
 |     } | 
 |     if (str[i] == 'k' || str[i] == 'K') { | 
 |       if ((str[i + 1] == 0) && ((result << 10) > result)) { | 
 |         return result << 10; | 
 |       } | 
 |       return 0; | 
 |     } | 
 |     if (str[i] == 'm' || str[i] == 'M') { | 
 |       if ((str[i + 1] == 0) && ((result << 20) > result)) { | 
 |         return result << 20; | 
 |       } | 
 |       return 0; | 
 |     } | 
 |     if (str[i] < '0' || str[i] > '9') { | 
 |       return 0; | 
 |     } | 
 |     size_t next = (10 * result) + (str[i] - '0'); | 
 |     if (next <= result) { | 
 |       return 0; | 
 |     } | 
 |     result = next; | 
 |   } | 
 |   return 0; | 
 | } | 
 |  | 
 | static std::string readFile(const std::string& path) { | 
 |   std::ifstream file(path); | 
 |   std::string content( | 
 |       (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); | 
 |   return content; | 
 | } | 
 |  | 
 | static void writeFile(const char* file, const std::string& content) { | 
 |   std::ofstream outfile(file, std::ofstream::binary); | 
 |   outfile.write(content.c_str(), static_cast<std::streamsize>(content.size())); | 
 |   outfile.close(); | 
 | } | 
 |  | 
 | static void writeSamples(const std::vector<std::string>& paths, | 
 |     const std::vector<size_t>& sizes, const uint8_t* data) { | 
 |   size_t offset = 0; | 
 |   for (size_t i = 0; i < paths.size(); ++i) { | 
 |     const char* path = paths[i].c_str(); | 
 |     size_t sampleSize = sizes[i]; | 
 |     std::ofstream outfile(path, std::ofstream::binary); | 
 |     outfile.write(reinterpret_cast<const char*>(data + offset), | 
 |         static_cast<std::streamsize>(sampleSize)); | 
 |     outfile.close(); | 
 |     offset += sampleSize; | 
 |   } | 
 | } | 
 |  | 
 | /* Returns "base file name" or its tail, if it contains '/' or '\'. */ | 
 | static const char* fileName(const char* path) { | 
 |   const char* separator_position = strrchr(path, '/'); | 
 |   if (separator_position) path = separator_position + 1; | 
 |   separator_position = strrchr(path, '\\'); | 
 |   if (separator_position) path = separator_position + 1; | 
 |   return path; | 
 | } | 
 |  | 
 | static void printHelp(const char* name) { | 
 |   fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name); | 
 |   fprintf(stderr, | 
 |       "Options:\n" | 
 |       "  --dm       use 'deorummolae' engine\n" | 
 |       "  --distill  rewrite samples; unique text parts are removed\n" | 
 |       "  --dsh      use 'durchschlag' engine (default)\n" | 
 |       "  --purify   rewrite samples; unique text parts are zeroed out\n" | 
 |       "  --sieve    use 'sieve' engine\n" | 
 |       "  -b#, --block_len=#\n" | 
 |       "             set block length for 'durchschlag'; default: 1024\n" | 
 |       "  -s#, --slice_len=#\n" | 
 |       "             set slice length for 'distill', 'durchschlag', 'purify'\n" | 
 |       "             and 'sieve'; default: 16\n" | 
 |       "  -t#, --target_dict_len=#\n" | 
 |       "             set target dictionary length (limit); default: 16K\n" | 
 |       "  -u#, --min_slice_pop=#\n" | 
 |       "             set minimum slice population (for rewrites); default: 2\n" | 
 |       "  -c#, --chunk_len=#\n" | 
 |       "             if positive, samples are cut into chunks of this length;\n" | 
 |       "             default: 0; cannot mix with 'rewrite samples'\n" | 
 |       "  -o#, --overlap_len=#\n" | 
 |       "             set chunk overlap length; default 0\n" | 
 |       "# is a decimal number with optional k/K/m/M suffix.\n" | 
 |       "WARNING: 'distill' and 'purify' will overwrite original samples!\n" | 
 |       "         Completely unique samples might become empty files.\n\n"); | 
 | } | 
 |  | 
 | int main(int argc, char const* argv[]) { | 
 |   int dictionaryArg = -1; | 
 |   int method = METHOD_DURCHSCHLAG; | 
 |   size_t sliceLen = 16; | 
 |   size_t targetSize = 16 << 10; | 
 |   size_t blockSize = 1024; | 
 |   size_t minimumPopulation = 2; | 
 |   size_t chunkLen = 0; | 
 |   size_t overlapLen = 0; | 
 |  | 
 |   std::vector<uint8_t> data; | 
 |   std::vector<size_t> sizes; | 
 |   std::vector<std::string> paths; | 
 |   size_t total = 0; | 
 |   for (int i = 1; i < argc; ++i) { | 
 |     if (argv[i] == nullptr) { | 
 |       continue; | 
 |     } | 
 |  | 
 |     if (argv[i][0] == '-') { | 
 |       char arg1 = argv[i][1]; | 
 |       const char* arg2 = arg1 ? &argv[i][2] : nullptr; | 
 |       if (arg1 == '-') { | 
 |         if (dictionaryArg != -1) { | 
 |           fprintf(stderr, | 
 |               "Method should be specified before dictionary / sample '%s'\n", | 
 |               argv[i]); | 
 |           exit(1); | 
 |         } | 
 |  | 
 |         /* Look for "--long_arg" via exact match. */ | 
 |         if (std::strcmp(argv[i], "--sieve") == 0) { | 
 |           method = METHOD_SIEVE; | 
 |           continue; | 
 |         } | 
 |         if (std::strcmp(argv[i], "--dm") == 0) { | 
 |           method = METHOD_DM; | 
 |           continue; | 
 |         } | 
 |         if (std::strcmp(argv[i], "--dsh") == 0) { | 
 |           method = METHOD_DURCHSCHLAG; | 
 |           continue; | 
 |         } | 
 |         if (std::strcmp(argv[i], "--distill") == 0) { | 
 |           method = METHOD_DISTILL; | 
 |           continue; | 
 |         } | 
 |         if (std::strcmp(argv[i], "--purify") == 0) { | 
 |           method = METHOD_PURIFY; | 
 |           continue; | 
 |         } | 
 |  | 
 |         /* Look for "--long_arg=#" via prefix match. */ | 
 |         if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN, | 
 |               std::strlen(LONG_ARG_BLOCK_LEN)) == 0) { | 
 |           arg1 = 'b'; | 
 |           arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)]; | 
 |         } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN, | 
 |               std::strlen(LONG_ARG_SLICE_LEN)) == 0) { | 
 |           arg1 = 's'; | 
 |           arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)]; | 
 |         } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN, | 
 |               std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) { | 
 |           arg1 = 't'; | 
 |           arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)]; | 
 |         } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP, | 
 |               std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) { | 
 |           arg1 = 'u'; | 
 |           arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)]; | 
 |         } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN, | 
 |               std::strlen(LONG_ARG_CHUNK_LEN)) == 0) { | 
 |           arg1 = 'c'; | 
 |           arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)]; | 
 |         } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN, | 
 |               std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) { | 
 |           arg1 = 'o'; | 
 |           arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)]; | 
 |         } else { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } | 
 |  | 
 |       /* Look for "-f" short args or "--foo=#" long args. */ | 
 |       if (arg1 == 'b') { | 
 |         blockSize = readInt(arg2); | 
 |         if (blockSize < 16 || blockSize > 65536) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } else if (arg1 == 's') { | 
 |         sliceLen = readInt(arg2); | 
 |         // TODO(eustas): investigate why sliceLen == 4..5 greatly slows down | 
 |         //               durschlag engine, but only from command line; | 
 |         //               durschlag_runner seems to work fine with those. | 
 |         if (sliceLen < 4 || sliceLen > 256) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } else if (arg1 == 't') { | 
 |         targetSize = readInt(arg2); | 
 |         if (targetSize < 256 || targetSize > (1 << 25)) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } else if (arg1 == 'u') { | 
 |         minimumPopulation = readInt(arg2); | 
 |         if (minimumPopulation < 256 || minimumPopulation > 65536) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } else if (arg1 == 'c') { | 
 |         chunkLen = readInt(arg2); | 
 |         if (chunkLen < 0 || chunkLen > INT_MAX) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } else if (arg1 == 'o') { | 
 |         overlapLen = readInt(arg2); | 
 |         if (overlapLen < 0 || overlapLen > INT_MAX) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid option '%s'\n", argv[i]); | 
 |           exit(1); | 
 |         } | 
 |       } else { | 
 |         printHelp(fileName(argv[0])); | 
 |         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]); | 
 |         exit(1); | 
 |       } | 
 |       continue; | 
 |     } | 
 |  | 
 |     if (dictionaryArg == -1) { | 
 |       if (method != METHOD_DISTILL && method != METHOD_PURIFY) { | 
 |         dictionaryArg = i; | 
 |         continue; | 
 |       } | 
 |     } | 
 |  | 
 |     bool ok = true; | 
 | #if defined(_MSC_VER) | 
 |         const char* resolved_path = argv[i]; | 
 | #else | 
 |     glob_t resolved_paths; | 
 |     memset(&resolved_paths, 0, sizeof(resolved_paths)); | 
 |     if (glob(argv[i], GLOB_TILDE, NULL, &resolved_paths) == 0) { | 
 |       for(size_t j = 0; j < resolved_paths.gl_pathc; ++j) { | 
 |         const char* resolved_path = resolved_paths.gl_pathv[j]; | 
 | #endif | 
 |         std::string content = readFile(resolved_path); | 
 |         if (chunkLen == 0) { | 
 |           paths.emplace_back(resolved_path); | 
 |           data.insert(data.end(), content.begin(), content.end()); | 
 |           total += content.size(); | 
 |           sizes.push_back(content.size()); | 
 |           continue; | 
 |         } else if (chunkLen <= overlapLen) { | 
 |           printHelp(fileName(argv[0])); | 
 |           fprintf(stderr, "Invalid chunkLen - overlapLen combination\n"); | 
 |           exit(1); | 
 |         } | 
 |         for (size_t chunkStart = 0; | 
 |             chunkStart < content.size(); | 
 |             chunkStart += chunkLen - overlapLen) { | 
 |           std::string chunk = content.substr(chunkStart, chunkLen); | 
 |           data.insert(data.end(), chunk.begin(), chunk.end()); | 
 |           total += chunk.size(); | 
 |           sizes.push_back(chunk.size()); | 
 |         } | 
 | #if !defined(_MSC_VER) | 
 |       } | 
 |     } else { | 
 |       ok = false; | 
 |     } | 
 |     globfree(&resolved_paths); | 
 | #endif | 
 |     if (!ok) exit(1); | 
 |   } | 
 |  | 
 |   fprintf(stderr, "Number of chunks: %zu; total size: %zu\n", sizes.size(), | 
 |           total); | 
 |  | 
 |   bool wantDictionary = (dictionaryArg == -1); | 
 |   if (method == METHOD_DISTILL || method == METHOD_PURIFY) { | 
 |     wantDictionary = false; | 
 |     if (chunkLen != 0) { | 
 |       printHelp(fileName(argv[0])); | 
 |       fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n"); | 
 |       exit(1); | 
 |     } | 
 |   } | 
 |   if (wantDictionary || total == 0) { | 
 |     printHelp(fileName(argv[0])); | 
 |     fprintf(stderr, "Not enough arguments\n"); | 
 |     exit(1); | 
 |   } | 
 |  | 
 |   if (method == METHOD_SIEVE) { | 
 |     writeFile(argv[dictionaryArg], sieve_generate( | 
 |         targetSize, sliceLen, sizes, data.data())); | 
 |   } else if (method == METHOD_DM) { | 
 |     writeFile(argv[dictionaryArg], DM_generate( | 
 |         targetSize, sizes, data.data())); | 
 |   } else if (method == METHOD_DURCHSCHLAG) { | 
 |     writeFile(argv[dictionaryArg], durchschlag_generate( | 
 |         targetSize, sliceLen, blockSize, sizes, data.data())); | 
 |   } else if (method == METHOD_DISTILL) { | 
 |     durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data()); | 
 |     writeSamples(paths, sizes, data.data()); | 
 |   } else if (method == METHOD_PURIFY) { | 
 |     durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data()); | 
 |     writeSamples(paths, sizes, data.data()); | 
 |   } else { | 
 |     printHelp(fileName(argv[0])); | 
 |     fprintf(stderr, "Unknown generator\n"); | 
 |     exit(1); | 
 |   } | 
 |   return 0; | 
 | } |