// blob: ab4e3a618bfe844f913c41df92349c6968b2cb4c
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "src/sksl/lex/DFA.h"
#include "src/sksl/lex/LexUtil.h"
#include "src/sksl/lex/NFA.h"
#include "src/sksl/lex/NFAtoDFA.h"
#include "src/sksl/lex/RegexNode.h"
#include "src/sksl/lex/RegexParser.h"
#include "src/sksl/lex/TransitionTable.h"
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
/**
* Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex
* file is a text file with one token definition per line. Each line is of the form:
* <TOKEN_NAME> = <pattern>
* where <pattern> is either a regular expression (e.g. [0-9]) or a double-quoted literal string.
* Blank lines and lines beginning with "//" are ignored.
*/
// Text written at the very top of every generated file: the license notice followed by a banner
// identifying the file as sksllex output.
static constexpr char HEADER[] =
        // License notice.
        "/*\n"
        " * Copyright 2017 Google Inc.\n"
        " *\n"
        " * Use of this source code is governed by a BSD-style license that can be\n"
        " * found in the LICENSE file.\n"
        " */\n"
        // "Generated file" banner.
        "/*****************************************************************************************\n"
        " ******************** This file was generated by sksllex. Do not edit. *******************\n"
        " *****************************************************************************************/\n";
/**
 * Generates the lexer's header file at hPath.
 *
 * The emitted header declares, inside namespace SkSL, a struct named `token` (whose Kind enum holds
 * one TK_<name> enumerator per entry of `tokens`, in order, followed by TK_NONE) and a class named
 * `lexer` offering start(), next() and offset checkpointing. `dfa` is not consulted here.
 */
static void writeH(const DFA& dfa, const char* lexer, const char* token,
                   const std::vector<std::string>& tokens, const char* hPath) {
    std::ofstream header(hPath);
    SkASSERT(header.good());

    // Banner, include guard built from the lexer name, and the includes the declarations rely on.
    header << HEADER
           << "#ifndef SKSL_" << lexer << "\n"
           << "#define SKSL_" << lexer << "\n"
           << "#include <cstdint>\n"
           << "#include <string_view>\n"
           << "namespace SkSL {\n"
           << "\n";

    // Token struct: the Kind enumerators follow the order of `tokens`.
    header << "struct " << token << " {\n"
           << " enum class Kind {\n";
    for (const std::string& tokenName : tokens) {
        header << " TK_" << tokenName << ",\n";
    }
    header << " TK_NONE," << R"(
};
)" << token << "() {}";

    // Rest of the token struct, then the lexer class, then the namespace and include-guard closers.
    // The raw-string text below is emitted verbatim.
    header << token << R"((Kind kind, int32_t offset, int32_t length)
: fKind(kind)
, fOffset(offset)
, fLength(length) {}
Kind fKind = Kind::TK_NONE;
int32_t fOffset = -1;
int32_t fLength = -1;
};
class )" << lexer << R"( {
public:
void start(std::string_view text) {
fText = text;
fOffset = 0;
}
)" << token << R"( next();
struct Checkpoint {
int32_t fOffset;
};
Checkpoint getCheckpoint() const {
return {fOffset};
}
void rewindToCheckpoint(Checkpoint checkpoint) {
fOffset = checkpoint.fOffset;
}
private:
std::string_view fText;
int32_t fOffset;
};
} // namespace
#endif
)";
}
/**
 * Generates the lexer's implementation file at cppPath.
 *
 * The emitted file includes `include` and then, inside namespace SkSL, defines: a State alias sized
 * to the DFA, the kInvalidChar constant, the kMappings character table, the transition table
 * (written by WriteTransitionTable), the kAccepts table, and the body of `lexer`::next(), which
 * returns a `token`.
 *
 * Every reference to the token type in the generated code uses the `token` argument, so the output
 * is valid for any token struct name, not only "Token".
 */
static void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
                     const char* cppPath) {
    std::ofstream out(cppPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#include \"" << include << "\"\n";
    out << "\n";
    out << "namespace SkSL {\n";
    out << "\n";

    // The state count is the length of the longest transition row; it selects the narrowest
    // unsigned type for State and sizes the kAccepts table.
    size_t states = 0;
    for (const auto& row : dfa.fTransitions) {
        states = std::max(states, row.size());
    }
    out << "using State = " << (states <= 256 ? "uint8_t" : "uint16_t") << ";\n";

    // Find the first character mapped in our DFA. Leading characters whose mapping is zero are
    // left out of kMappings; the generated code subtracts startChar before indexing it.
    size_t startChar = 0;
    for (; startChar < dfa.fCharMappings.size(); ++startChar) {
        if (dfa.fCharMappings[startChar] != 0) {
            break;
        }
    }
    // Arbitrarily-chosen character which is greater than startChar, and should not appear in actual
    // input.
    // NOTE(review): the generated code uses kInvalidChar as an index into kMappings, so this
    // presumably relies on kMappings having more than 18 entries - confirm.
    SkASSERT(startChar < 18);
    out << "static constexpr uint8_t kInvalidChar = 18;";
    out << "static constexpr int8_t kMappings[" << dfa.fCharMappings.size() - startChar << "] = {\n"
           " ";
    const char* separator = "";
    for (size_t index = startChar; index < dfa.fCharMappings.size(); ++index) {
        out << separator << std::to_string(dfa.fCharMappings[index]);
        separator = ", ";
    }
    out << "\n};\n";

    WriteTransitionTable(out, dfa, states);

    // One kAccepts entry per state; states beyond the end of dfa.fAccepts get INVALID.
    out << "static const int8_t kAccepts[" << states << "] = {";
    for (size_t i = 0; i < states; ++i) {
        if (i < dfa.fAccepts.size()) {
            out << " " << dfa.fAccepts[i] << ",";
        } else {
            out << " " << INVALID << ",";
        }
    }
    out << " };\n";
    out << "\n";

    // Body of next(). The raw-string text is emitted verbatim; only the token type name, startChar
    // and the kMappings length are substituted in.
    out << token << " " << lexer << "::next() {";
    out << R"(
// note that we cheat here: normally a lexer needs to worry about the case
// where a token has a prefix which is not itself a valid token - for instance,
// maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid
// tokens. Our grammar doesn't have this property, so we can simplify the logic
// a bit.
int32_t startOffset = fOffset;
State state = 1;
for (;;) {
if (fOffset >= (int32_t)fText.length()) {
if (startOffset == (int32_t)fText.length() || kAccepts[state] == -1) {
return )" << token << "(" << token << R"(::Kind::TK_END_OF_FILE, startOffset, 0);
}
break;
}
uint8_t c = (uint8_t)(fText[fOffset] - )" << startChar << R"();
if (c >= )" << dfa.fCharMappings.size() - startChar << R"() {
c = kInvalidChar;
}
State newState = get_transition(kMappings[c], state);
if (!newState) {
break;
}
state = newState;
++fOffset;
}
)" << token << R"(::Kind kind = ()" << token << R"(::Kind) kAccepts[state];
return )" << token << R"((kind, startOffset, fOffset - startOffset);
}
} // namespace
)";
}
/**
 * Reads the token definitions in the .lex file at inPath, builds a DFA from them, and writes the
 * generated lexer header to hPath and implementation to cppPath.
 *
 * `lexer` and `token` are the names given to the generated lexer class and token struct. The token
 * list always starts with END_OF_FILE, followed by the names in the order they appear in the file.
 * Aborts via SkASSERT if the input file cannot be opened or a definition line is malformed.
 */
static void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
                    const char* cppPath) {
    NFA nfa;
    std::vector<std::string> tokens;
    tokens.push_back("END_OF_FILE");
    std::string line;
    std::ifstream in(inPath);
    // Fail loudly on an unreadable input, matching the checks on the output files; otherwise we
    // would silently generate a lexer that knows only END_OF_FILE.
    SkASSERT(in.good());
    while (std::getline(in, line)) {
        // Skip blank lines and "//" comment lines.
        if (line.length() == 0) {
            continue;
        }
        if (line.length() >= 2 && line[0] == '/' && line[1] == '/') {
            continue;
        }
        // A definition is three whitespace-separated words: <name> = <pattern>. Lines that do not
        // yield three words are skipped without complaint.
        std::istringstream split(line);
        std::string name, delimiter, pattern;
        if (split >> name >> delimiter >> pattern) {
            // The pattern must be the last thing on the line: nothing, not even whitespace, may
            // follow it.
            SkASSERT(split.eof());
            SkASSERT(name != "");
            SkASSERT(delimiter == "=");
            SkASSERT(pattern != "");
            tokens.push_back(name);
            if (pattern[0] == '"') {
                // Quoted literal: needs at least one character between the quotes. Build a
                // left-nested concatenation of single-character nodes.
                SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
                RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
                for (size_t i = 2; i < pattern.size() - 1; ++i) {
                    node = RegexNode(RegexNode::kConcat_Kind, node,
                                     RegexNode(RegexNode::kChar_Kind, pattern[i]));
                }
                nfa.addRegex(node);
            } else {
                // Anything else is handed to the regular-expression parser.
                nfa.addRegex(RegexParser().parse(pattern));
            }
        }
    }
    NFAtoDFA converter(&nfa);
    DFA dfa = converter.convert();
    writeH(dfa, lexer, token, tokens, hPath);
    // The temporary include-path string lives until the end of this full expression, so the
    // pointer stays valid for the duration of the call.
    writeCPP(dfa, lexer, token, (std::string("src/sksl/SkSL") + lexer + ".h").c_str(), cppPath);
}
/**
 * Entry point: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>
 *
 * Runs the generator when exactly five arguments are supplied and returns 0; otherwise prints the
 * usage line and returns 1.
 */
int main(int argc, const char** argv) {
    // Program name plus the five required arguments.
    constexpr int kExpectedArgCount = 6;
    if (argc == kExpectedArgCount) {
        process(argv[1], argv[2], argv[3], argv[4], argv[5]);
        return 0;
    }
    printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
    return 1;
}