| // © 2016 and later: Unicode, Inc. and others. | 
 | // License & terms of use: http://www.unicode.org/copyright.html | 
 |  | 
 | #include <stdio.h> | 
 | #include <string> | 
 | #include <stdlib.h> | 
 | #include <errno.h> | 
 | #include <string.h> | 
 | #include <iostream> | 
 | #include <fstream> | 
 |  | 
 | // We only use U8_* macros, which are entirely inline. | 
 | #include "unicode/utf8.h" | 
 |  | 
 | // This contains a codepage and ISO 14882:1998 illegality table. | 
 | // Use "make gen-table" to rebuild it. | 
 | #include "cptbl.h" | 
 |  | 
 | /** | 
 |  * What is this? | 
 |  * | 
 |  * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code | 
 |  * in utf-8 into something consumable by certain compilers (Solaris, xlC) | 
 |  * which aren't quite standards compliant. | 
 |  * | 
 |  * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN' | 
 |  * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc. | 
 |  *   (some compilers do not support the u8 prefix correctly.) | 
 |  * - if the system is EBCDIC-based, that is used to correct the input characters. | 
 |  * | 
 |  * Usage: | 
 |  *   escapesrc infile.cpp outfile.cpp | 
 |  * Normally this is invoked by the build stage, with a rule such as: | 
 |  * | 
 |  * _%.cpp: $(srcdir)/%.cpp | 
 |  *       @$(BINDIR)/escapesrc$(EXEEXT) $< $@ | 
 |  * %.o: _%.cpp | 
 |  *       $(COMPILE.cc) ... $@ $< | 
 |  * | 
 |  * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp  | 
 |  * from being itself escaped. | 
 |  */ | 
 |  | 
 |  | 
 | static const char | 
 |   kSPACE   = 0x20, | 
 |   kTAB     = 0x09, | 
 |   kLF      = 0x0A, | 
 |   kCR      = 0x0D; | 
 |  | 
 | // For convenience | 
 | # define cp1047_to_8859(c) cp1047_8859_1[c] | 
 |  | 
 | // Our app's name | 
 | std::string prog; | 
 |  | 
 | /** | 
 |  * Give the usual 1-line documentation and exit | 
 |  */ | 
 | void usage() { | 
 |   fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); | 
 | } | 
 |  | 
 | /** | 
 |  * Delete the output file (if any) | 
 |  * We want to delete even if we didn't generate, because it might be stale. | 
 |  */ | 
 | int cleanup(const std::string &outfile) { | 
 |   const char *outstr = outfile.c_str(); | 
 |   if(outstr && *outstr) { | 
 |     int rc = std::remove(outstr); | 
 |     if(rc == 0) { | 
 |       fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); | 
 |       return 0; | 
 |     } else { | 
 |       if( errno == ENOENT ) { | 
 |         return 0; // File did not exist - no error. | 
 |       } else { | 
 |         perror("std::remove"); | 
 |         return 1; | 
 |       } | 
 |     } | 
 |   } | 
 |   return 0; | 
 | } | 
 |  | 
 | /** | 
 |  * Skip across any known whitespace. | 
 |  * @param p startpoint | 
 |  * @param e limit | 
 |  * @return first non-whitespace char | 
 |  */ | 
 | inline const char *skipws(const char *p, const char *e) { | 
 |   for(;p<e;p++) { | 
 |     switch(*p) { | 
 |     case kSPACE: | 
 |     case kTAB: | 
 |     case kLF: | 
 |     case kCR: | 
 |       break; | 
 |     default: | 
 |       return p; // non ws | 
 |     } | 
 |   } | 
 |   return p; | 
 | } | 
 |  | 
 | /** | 
 |  * Append a byte, hex encoded | 
 |  * @param outstr sstring to append to | 
 |  * @param byte the byte to append | 
 |  */ | 
 | void appendByte(std::string &outstr, | 
 |                 uint8_t byte) { | 
 |     char tmp2[5]; | 
 |     sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte)); | 
 |     outstr += tmp2; | 
 | } | 
 |  | 
 | /** | 
 |  * Append the bytes from 'linestr' into outstr, with escaping | 
 |  * @param outstr the output buffer | 
 |  * @param linestr the input buffer | 
 |  * @param pos in/out: the current char under consideration | 
 |  * @param chars the number of chars to consider | 
 |  * @return true on failure | 
 |  */ | 
 | bool appendUtf8(std::string &outstr, | 
 |                 const std::string &linestr, | 
 |                 size_t &pos, | 
 |                 size_t chars) { | 
 |   char tmp[9]; | 
 |   for(size_t i=0;i<chars;i++) { | 
 |     tmp[i] = linestr[++pos]; | 
 |   } | 
 |   tmp[chars] = 0; | 
 |   unsigned int c; | 
 |   sscanf(tmp, "%X", &c); | 
 |   UChar32 ch = c & 0x1FFFFF;  | 
 |  | 
 |   // now to append \\x%% etc | 
 |   uint8_t bytesNeeded = U8_LENGTH(ch); | 
 |   if(bytesNeeded == 0) { | 
 |     fprintf(stderr, "Illegal code point U+%X\n", ch); | 
 |     return true; | 
 |   } | 
 |   uint8_t bytes[4]; | 
 |   uint8_t *s = bytes; | 
 |   size_t i = 0; | 
 |   U8_APPEND_UNSAFE(s, i, ch); | 
 |   for(size_t t = 0; t<i; t++) { | 
 |     appendByte(outstr, s[t]); | 
 |   } | 
 |   return false; | 
 | } | 
 |  | 
 | /** | 
 |  * Fixup u8"x" | 
 |  * @param linestr string to mutate. Already escaped into \u format. | 
 |  * @param origpos beginning, points to 'u8"' | 
 |  * @param pos end, points to " | 
 |  * @return false for no-problem, true for failure! | 
 |  */ | 
 | bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { | 
 |   size_t pos = origpos + 3; | 
 |   std::string outstr; | 
 |   outstr += '\"'; // local encoding | 
 |   for(;pos<endpos;pos++) { | 
 |     char c = linestr[pos]; | 
 |     if(c == '\\') { | 
 |       char c2 = linestr[++pos]; | 
 |       switch(c2) { | 
 |       case '\'': | 
 |       case '"': | 
 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | 
 |         c2 = cp1047_to_8859(c2); | 
 | #endif | 
 |         appendByte(outstr, c2); | 
 |         break; | 
 |       case 'u': | 
 |         appendUtf8(outstr, linestr, pos, 4); | 
 |         break; | 
 |       case 'U': | 
 |         appendUtf8(outstr, linestr, pos, 8); | 
 |         break; | 
 |       } | 
 |     } else { | 
 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | 
 |       c = cp1047_to_8859(c); | 
 | #endif | 
 |       appendByte(outstr, c); | 
 |     } | 
 |   } | 
 |   outstr += ('\"'); | 
 |  | 
 |   linestr.replace(origpos, (endpos-origpos+1), outstr); | 
 |    | 
 |   return false; // OK | 
 | } | 
 |  | 
 | /** | 
 |  * fix the u"x"/u'x'/u8"x" string at the position | 
 |  * u8'x' is not supported, sorry. | 
 |  * @param linestr the input string | 
 |  * @param pos the position | 
 |  * @return false = no err, true = had err | 
 |  */ | 
 | bool fixAt(std::string &linestr, size_t pos) { | 
 |   size_t origpos = pos; | 
 |    | 
 |   if(linestr[pos] != 'u') { | 
 |     fprintf(stderr, "Not a 'u'?"); | 
 |     return true; | 
 |   } | 
 |  | 
 |   pos++; // past 'u' | 
 |  | 
 |   bool utf8 = false; | 
 |    | 
 |   if(linestr[pos] == '8') { // u8" | 
 |     utf8 = true; | 
 |     pos++; | 
 |   } | 
 |    | 
 |   char quote = linestr[pos]; | 
 |  | 
 |   if(quote != '\'' && quote != '\"') { | 
 |     fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); | 
 |     return true; | 
 |   } | 
 |  | 
 |   if(quote == '\'' && utf8) { | 
 |     fprintf(stderr, "Cannot do u8'...'\n"); | 
 |     return true; | 
 |   } | 
 |  | 
 |   pos ++; | 
 |  | 
 |   //printf("u%c…%c\n", quote, quote); | 
 |  | 
 |   for(; pos < linestr.size(); pos++) { | 
 |     if(linestr[pos] == quote) { | 
 |       if(utf8) { | 
 |         return fixu8(linestr, origpos, pos); // fix u8"..." | 
 |       } else { | 
 |         return false; // end of quote | 
 |       } | 
 |     } | 
 |     if(linestr[pos] == '\\') { | 
 |       pos++; | 
 |       if(linestr[pos] == quote) continue; // quoted quote | 
 |       if(linestr[pos] == 'u') continue; // for now ... unicode escape | 
 |       if(linestr[pos] == '\\') continue; | 
 |       // some other escape… ignore | 
 |     } else { | 
 |       size_t old_pos = pos; | 
 |       int32_t i = pos; | 
 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | 
 |       // mogrify 1-4 bytes from 1047 'back' to utf-8 | 
 |       char old_byte = linestr[pos]; | 
 |       linestr[pos] = cp1047_to_8859(linestr[pos]); | 
 |       // how many more? | 
 |       int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); | 
 |       for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { | 
 |         linestr[pos2] = cp1047_to_8859(linestr[pos2]); | 
 |         if(linestr[pos2] == 0x0A) { | 
 |           linestr[pos2] = 0x85; // NL is ambiguous here | 
 |         } | 
 |       } | 
 | #endif | 
 |        | 
 |       // Proceed to decode utf-8 | 
 |       const uint8_t *s = (const uint8_t*) (linestr.c_str()); | 
 |       int32_t length = linestr.size(); | 
 |       UChar32 c; | 
 |       if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { | 
 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | 
 |         linestr[pos] = old_byte; // put it back | 
 | #endif | 
 |         continue; // single code point not previously legal for \u escaping | 
 |       } | 
 |  | 
 |       // otherwise, convert it to \u / \U | 
 |       { | 
 |         U8_NEXT(s, i, length, c); | 
 |       } | 
 |       if(c<0) { | 
 |         fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); | 
 |         fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); | 
 |         return true; | 
 |       } | 
 |  | 
 |       size_t seqLen = (i-pos); | 
 |  | 
 |       //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); | 
 |  | 
 |       char newSeq[20]; | 
 |       if( c <= 0xFFFF) { | 
 |         sprintf(newSeq, "\\u%04X", c); | 
 |       } else { | 
 |         sprintf(newSeq, "\\U%08X", c); | 
 |       } | 
 |       linestr.replace(pos, seqLen, newSeq); | 
 |       pos += strlen(newSeq) - 1; | 
 |     } | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | /** | 
 |  * Fixup an entire line | 
 |  * false = no err | 
 |  * true = had err | 
 |  * @param no the line number (not used) | 
 |  * @param linestr the string to fix | 
 |  * @return true if any err, else false | 
 |  */ | 
 | bool fixLine(int /*no*/, std::string &linestr) { | 
 |   const char *line = linestr.c_str(); | 
 |   size_t len = linestr.size(); | 
 |  | 
 |   // no u' in the line? | 
 |   if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { | 
 |     return false; // Nothing to do. No u' or u" detected | 
 |   } | 
 |  | 
 |   // start from the end and find all u" cases | 
 |   size_t pos = len = linestr.size(); | 
 |   if(len>INT32_MAX/2) { | 
 |     return true; | 
 |   } | 
 |   while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { | 
 |     //printf("found doublequote at %d\n", pos); | 
 |     if(fixAt(linestr, pos)) return true; | 
 |     if(pos == 0) break; | 
 |     pos--; | 
 |   } | 
 |  | 
 |   // reset and find all u' cases | 
 |   pos = len = linestr.size(); | 
 |   while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { | 
 |     //printf("found singlequote at %d\n", pos); | 
 |     if(fixAt(linestr, pos)) return true; | 
 |     if(pos == 0) break; | 
 |     pos--; | 
 |   } | 
 |  | 
 |   // reset and find all u8" cases | 
 |   pos = len = linestr.size(); | 
 |   while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { | 
 |     if(fixAt(linestr, pos)) return true; | 
 |     if(pos == 0) break; | 
 |     pos--; | 
 |   } | 
 |  | 
 |   //fprintf(stderr, "%d - fixed\n", no); | 
 |   return false; | 
 | } | 
 |  | 
 | /** | 
 |  * Convert a whole file | 
 |  * @param infile | 
 |  * @param outfile | 
 |  * @return 1 on err, 0 otherwise | 
 |  */ | 
 | int convert(const std::string &infile, const std::string &outfile) { | 
 |   fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); | 
 |  | 
 |   std::ifstream inf; | 
 |    | 
 |   inf.open(infile.c_str(), std::ios::in); | 
 |  | 
 |   if(!inf.is_open()) { | 
 |     fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); | 
 |     cleanup(outfile); | 
 |     return 1; | 
 |   } | 
 |  | 
 |   std::ofstream outf; | 
 |  | 
 |   outf.open(outfile.c_str(), std::ios::out); | 
 |  | 
 |   if(!outf.is_open()) { | 
 |     fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); | 
 |     return 1; | 
 |   } | 
 |  | 
 |   // TODO: any platform variations of #line? | 
 |   outf << "#line 1 \"" << infile << "\"" << '\n'; | 
 |  | 
 |   int no = 0; | 
 |   std::string linestr; | 
 |   while( getline( inf, linestr)) { | 
 |     no++; | 
 |     if(fixLine(no, linestr)) { | 
 |       goto fail; | 
 |     } | 
 |     outf << linestr << '\n'; | 
 |   } | 
 |  | 
 |   if(inf.eof()) { | 
 |     return 0; | 
 |   } | 
 | fail: | 
 |   outf.close(); | 
 |   fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); | 
 |   cleanup(outfile); | 
 |   return 1; | 
 | } | 
 |  | 
 | /** | 
 |  * Main function | 
 |  */ | 
 | int main(int argc, const char *argv[]) { | 
 |   prog = argv[0]; | 
 |  | 
 |   if(argc != 3) { | 
 |     usage(); | 
 |     return 1; | 
 |   } | 
 |  | 
 |   std::string infile = argv[1]; | 
 |   std::string outfile = argv[2]; | 
 |  | 
 |   return convert(infile, outfile); | 
 | } |