diff options
Diffstat (limited to 'tools/preproc/string_parser.cpp')
-rw-r--r-- | tools/preproc/string_parser.cpp | 710 |
1 files changed, 355 insertions, 355 deletions
diff --git a/tools/preproc/string_parser.cpp b/tools/preproc/string_parser.cpp index dd5196a..b0a4875 100644 --- a/tools/preproc/string_parser.cpp +++ b/tools/preproc/string_parser.cpp @@ -1,355 +1,355 @@ -// Copyright(c) 2016 YamaArashi -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#include <cstdio> -#include <cstdarg> -#include <stdexcept> -#include "preproc.h" -#include "string_parser.h" -#include "char_util.h" -#include "utf8.h" - -// Reads a charmap char or escape sequence. -std::string StringParser::ReadCharOrEscape() -{ - std::string sequence; - - bool isEscape = (m_buffer[m_pos] == '\\'); - - if (isEscape) - { - m_pos++; - - if (m_buffer[m_pos] == '"') - { - sequence = g_charmap->Char('"'); - - if (sequence.length() == 0) - RaiseError("no mapping exists for double quote"); - - return sequence; - } - else if (m_buffer[m_pos] == '\\') - { - sequence = g_charmap->Char('\\'); - - if (sequence.length() == 0) - RaiseError("no mapping exists for backslash"); - - return sequence; - } - } - - unsigned char c = m_buffer[m_pos]; - - if (c == 0) - { - if (m_pos >= m_size) - RaiseError("unexpected EOF in UTF-8 string"); - else - RaiseError("unexpected null character in UTF-8 string"); - } - - if (IsAscii(c) && !IsAsciiPrintable(c)) - RaiseError("unexpected character U+%X in UTF-8 string", c); - - UnicodeChar unicodeChar = DecodeUtf8(&m_buffer[m_pos]); - m_pos += unicodeChar.encodingLength; - std::int32_t code = unicodeChar.code; - - if (code == -1) - RaiseError("invalid encoding in UTF-8 string"); - - if (isEscape && code >= 128) - RaiseError("escapes using non-ASCII characters are invalid"); - - sequence = isEscape ? g_charmap->Escape(code) : g_charmap->Char(code); - - if (sequence.length() == 0) - { - if (isEscape) - RaiseError("unknown escape '\\%c'", code); - else - RaiseError("unknown character U+%X", code); - } - - return sequence; -} - -// Reads a charmap constant, i.e. "{FOO}". -std::string StringParser::ReadBracketedConstants() -{ - std::string totalSequence; - - m_pos++; // Assume we're on the left curly bracket. - - while (m_buffer[m_pos] != '}') - { - SkipWhitespace(); - - if (IsIdentifierStartingChar(m_buffer[m_pos])) - { - long startPos = m_pos; - - m_pos++; - - while (IsIdentifierChar(m_buffer[m_pos])) - m_pos++; - - std::string sequence = g_charmap->Constant(std::string(&m_buffer[startPos], m_pos - startPos)); - - if (sequence.length() == 0) - { - m_buffer[m_pos] = 0; - RaiseError("unknown constant '%s'", &m_buffer[startPos]); - } - - totalSequence += sequence; - } - else if (IsAsciiDigit(m_buffer[m_pos])) - { - Integer integer = ReadInteger(); - - switch (integer.size) - { - case 1: - totalSequence += (unsigned char)integer.value; - break; - case 2: - totalSequence += (unsigned char)integer.value; - totalSequence += (unsigned char)(integer.value >> 8); - break; - case 4: - totalSequence += (unsigned char)integer.value; - totalSequence += (unsigned char)(integer.value >> 8); - totalSequence += (unsigned char)(integer.value >> 16); - totalSequence += (unsigned char)(integer.value >> 24); - break; - } - } - else if (m_buffer[m_pos] == 0) - { - if (m_pos >= m_size) - RaiseError("unexpected EOF after left curly bracket"); - else - RaiseError("unexpected null character within curly brackets"); - } - else - { - if (IsAsciiPrintable(m_buffer[m_pos])) - RaiseError("unexpected character '%c' within curly brackets", m_buffer[m_pos]); - else - RaiseError("unexpected character '\\x%02X' within curly brackets", m_buffer[m_pos]); - } - } - - m_pos++; // Go past the right curly bracket. - - return totalSequence; -} - -// Reads a charmap string. -int StringParser::ParseString(long srcPos, unsigned char* dest, int& destLength) -{ - m_pos = srcPos; - - if (m_buffer[m_pos] != '"') - RaiseError("expected UTF-8 string literal"); - - long start = m_pos; - - m_pos++; - - destLength = 0; - - while (m_buffer[m_pos] != '"') - { - std::string sequence = (m_buffer[m_pos] == '{') ? ReadBracketedConstants() : ReadCharOrEscape(); - - for (const char& c : sequence) - { - if (destLength == kMaxStringLength) - RaiseError("mapped string longer than %d bytes", kMaxStringLength); - - dest[destLength++] = c; - } - } - - m_pos++; // Go past the right quote. - - return m_pos - start; -} - -void StringParser::RaiseError(const char* format, ...) -{ - const int bufferSize = 1024; - char buffer[bufferSize]; - - std::va_list args; - va_start(args, format); - std::vsnprintf(buffer, bufferSize, format, args); - va_end(args); - - throw std::runtime_error(buffer); -} - -// Converts digit character to numerical value. -static int ConvertDigit(char c, int radix) -{ - int digit; - - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'A' && c <= 'F') - digit = 10 + c - 'A'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - return -1; - - return (digit < radix) ? digit : -1; -} - -void StringParser::SkipRestOfInteger(int radix) -{ - while (ConvertDigit(m_buffer[m_pos], radix) != -1) - m_pos++; -} - -StringParser::Integer StringParser::ReadDecimal() -{ - const int radix = 10; - std::uint64_t n = 0; - int digit; - std::uint64_t max = UINT32_MAX; - long startPos = m_pos; - - while ((digit = ConvertDigit(m_buffer[m_pos], radix)) != -1) - { - n = n * radix + digit; - - if (n >= max) - { - SkipRestOfInteger(radix); - - std::string intLiteral(m_buffer + startPos, m_pos - startPos); - RaiseError("integer literal \"%s\" is too large", intLiteral.c_str()); - } - - m_pos++; - } - - int size; - - if (m_buffer[m_pos] == 'H') - { - if (n >= 0x10000) - { - RaiseError("%lu is too large to be a halfword", (unsigned long)n); - } - - size = 2; - m_pos++; - } - else if (m_buffer[m_pos] == 'W') - { - size = 4; - m_pos++; - } - else - { - if (n >= 0x10000) - size = 4; - else if (n >= 0x100) - size = 2; - else - size = 1; - } - - return{ static_cast<std::uint32_t>(n), size }; -} - -StringParser::Integer StringParser::ReadHex() -{ - const int radix = 16; - std::uint64_t n = 0; - int digit; - std::uint64_t max = UINT32_MAX; - long startPos = m_pos; - - while ((digit = ConvertDigit(m_buffer[m_pos], radix)) != -1) - { - n = n * radix + digit; - - if (n >= max) - { - SkipRestOfInteger(radix); - - std::string intLiteral(m_buffer + startPos, m_pos - startPos); - RaiseError("integer literal \"%s\" is too large", intLiteral.c_str()); - } - - m_pos++; - } - - int length = m_pos - startPos; - int size = 0; - - switch (length) - { - case 2: - size = 1; - break; - case 4: - size = 2; - break; - case 8: - size = 4; - break; - default: - { - std::string intLiteral(m_buffer + startPos, m_pos - startPos); - RaiseError("hex integer literal \"0x%s\" doesn't have length of 2, 4, or 8 digits", intLiteral.c_str()); - } - } - - return{ static_cast<std::uint32_t>(n), size }; -} - -StringParser::Integer StringParser::ReadInteger() -{ - if (!IsAsciiDigit(m_buffer[m_pos])) - RaiseError("expected integer"); - - if (m_buffer[m_pos] == '0' && m_buffer[m_pos + 1] == 'x') - { - m_pos += 2; - return ReadHex(); - } - - return ReadDecimal(); -} - -// Skips tabs and spaces. -void StringParser::SkipWhitespace() -{ - while (m_buffer[m_pos] == '\t' || m_buffer[m_pos] == ' ') - m_pos++; -} +// Copyright(c) 2016 YamaArashi
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <cstdio>
+#include <cstdarg>
+#include <stdexcept>
+#include "preproc.h"
+#include "string_parser.h"
+#include "char_util.h"
+#include "utf8.h"
+
+// Reads a charmap char or escape sequence.
+std::string StringParser::ReadCharOrEscape()
+{
+ std::string sequence;
+
+ bool isEscape = (m_buffer[m_pos] == '\\');
+
+ if (isEscape)
+ {
+ m_pos++;
+
+ if (m_buffer[m_pos] == '"')
+ {
+ sequence = g_charmap->Char('"');
+
+ if (sequence.length() == 0)
+ RaiseError("no mapping exists for double quote");
+
+ return sequence;
+ }
+ else if (m_buffer[m_pos] == '\\')
+ {
+ sequence = g_charmap->Char('\\');
+
+ if (sequence.length() == 0)
+ RaiseError("no mapping exists for backslash");
+
+ return sequence;
+ }
+ }
+
+ unsigned char c = m_buffer[m_pos];
+
+ if (c == 0)
+ {
+ if (m_pos >= m_size)
+ RaiseError("unexpected EOF in UTF-8 string");
+ else
+ RaiseError("unexpected null character in UTF-8 string");
+ }
+
+ if (IsAscii(c) && !IsAsciiPrintable(c))
+ RaiseError("unexpected character U+%X in UTF-8 string", c);
+
+ UnicodeChar unicodeChar = DecodeUtf8(&m_buffer[m_pos]);
+ m_pos += unicodeChar.encodingLength;
+ std::int32_t code = unicodeChar.code;
+
+ if (code == -1)
+ RaiseError("invalid encoding in UTF-8 string");
+
+ if (isEscape && code >= 128)
+ RaiseError("escapes using non-ASCII characters are invalid");
+
+ sequence = isEscape ? g_charmap->Escape(code) : g_charmap->Char(code);
+
+ if (sequence.length() == 0)
+ {
+ if (isEscape)
+ RaiseError("unknown escape '\\%c'", code);
+ else
+ RaiseError("unknown character U+%X", code);
+ }
+
+ return sequence;
+}
+
+// Reads a charmap constant, i.e. "{FOO}".
+std::string StringParser::ReadBracketedConstants()
+{
+ std::string totalSequence;
+
+ m_pos++; // Assume we're on the left curly bracket.
+
+ while (m_buffer[m_pos] != '}')
+ {
+ SkipWhitespace();
+
+ if (IsIdentifierStartingChar(m_buffer[m_pos]))
+ {
+ long startPos = m_pos;
+
+ m_pos++;
+
+ while (IsIdentifierChar(m_buffer[m_pos]))
+ m_pos++;
+
+ std::string sequence = g_charmap->Constant(std::string(&m_buffer[startPos], m_pos - startPos));
+
+ if (sequence.length() == 0)
+ {
+ m_buffer[m_pos] = 0;
+ RaiseError("unknown constant '%s'", &m_buffer[startPos]);
+ }
+
+ totalSequence += sequence;
+ }
+ else if (IsAsciiDigit(m_buffer[m_pos]))
+ {
+ Integer integer = ReadInteger();
+
+ switch (integer.size)
+ {
+ case 1:
+ totalSequence += (unsigned char)integer.value;
+ break;
+ case 2:
+ totalSequence += (unsigned char)integer.value;
+ totalSequence += (unsigned char)(integer.value >> 8);
+ break;
+ case 4:
+ totalSequence += (unsigned char)integer.value;
+ totalSequence += (unsigned char)(integer.value >> 8);
+ totalSequence += (unsigned char)(integer.value >> 16);
+ totalSequence += (unsigned char)(integer.value >> 24);
+ break;
+ }
+ }
+ else if (m_buffer[m_pos] == 0)
+ {
+ if (m_pos >= m_size)
+ RaiseError("unexpected EOF after left curly bracket");
+ else
+ RaiseError("unexpected null character within curly brackets");
+ }
+ else
+ {
+ if (IsAsciiPrintable(m_buffer[m_pos]))
+ RaiseError("unexpected character '%c' within curly brackets", m_buffer[m_pos]);
+ else
+ RaiseError("unexpected character '\\x%02X' within curly brackets", m_buffer[m_pos]);
+ }
+ }
+
+ m_pos++; // Go past the right curly bracket.
+
+ return totalSequence;
+}
+
+// Reads a charmap string.
+int StringParser::ParseString(long srcPos, unsigned char* dest, int& destLength)
+{
+ m_pos = srcPos;
+
+ if (m_buffer[m_pos] != '"')
+ RaiseError("expected UTF-8 string literal");
+
+ long start = m_pos;
+
+ m_pos++;
+
+ destLength = 0;
+
+ while (m_buffer[m_pos] != '"')
+ {
+ std::string sequence = (m_buffer[m_pos] == '{') ? ReadBracketedConstants() : ReadCharOrEscape();
+
+ for (const char& c : sequence)
+ {
+ if (destLength == kMaxStringLength)
+ RaiseError("mapped string longer than %d bytes", kMaxStringLength);
+
+ dest[destLength++] = c;
+ }
+ }
+
+ m_pos++; // Go past the right quote.
+
+ return m_pos - start;
+}
+
+void StringParser::RaiseError(const char* format, ...)
+{
+ const int bufferSize = 1024;
+ char buffer[bufferSize];
+
+ std::va_list args;
+ va_start(args, format);
+ std::vsnprintf(buffer, bufferSize, format, args);
+ va_end(args);
+
+ throw std::runtime_error(buffer);
+}
+
+// Converts digit character to numerical value.
+static int ConvertDigit(char c, int radix)
+{
+ int digit;
+
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'A' && c <= 'F')
+ digit = 10 + c - 'A';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ return -1;
+
+ return (digit < radix) ? digit : -1;
+}
+
+void StringParser::SkipRestOfInteger(int radix)
+{
+ while (ConvertDigit(m_buffer[m_pos], radix) != -1)
+ m_pos++;
+}
+
+StringParser::Integer StringParser::ReadDecimal()
+{
+ const int radix = 10;
+ std::uint64_t n = 0;
+ int digit;
+ std::uint64_t max = UINT32_MAX;
+ long startPos = m_pos;
+
+ while ((digit = ConvertDigit(m_buffer[m_pos], radix)) != -1)
+ {
+ n = n * radix + digit;
+
+ if (n >= max)
+ {
+ SkipRestOfInteger(radix);
+
+ std::string intLiteral(m_buffer + startPos, m_pos - startPos);
+ RaiseError("integer literal \"%s\" is too large", intLiteral.c_str());
+ }
+
+ m_pos++;
+ }
+
+ int size;
+
+ if (m_buffer[m_pos] == 'H')
+ {
+ if (n >= 0x10000)
+ {
+ RaiseError("%lu is too large to be a halfword", (unsigned long)n);
+ }
+
+ size = 2;
+ m_pos++;
+ }
+ else if (m_buffer[m_pos] == 'W')
+ {
+ size = 4;
+ m_pos++;
+ }
+ else
+ {
+ if (n >= 0x10000)
+ size = 4;
+ else if (n >= 0x100)
+ size = 2;
+ else
+ size = 1;
+ }
+
+ return{ static_cast<std::uint32_t>(n), size };
+}
+
+StringParser::Integer StringParser::ReadHex()
+{
+ const int radix = 16;
+ std::uint64_t n = 0;
+ int digit;
+ std::uint64_t max = UINT32_MAX;
+ long startPos = m_pos;
+
+ while ((digit = ConvertDigit(m_buffer[m_pos], radix)) != -1)
+ {
+ n = n * radix + digit;
+
+ if (n >= max)
+ {
+ SkipRestOfInteger(radix);
+
+ std::string intLiteral(m_buffer + startPos, m_pos - startPos);
+ RaiseError("integer literal \"%s\" is too large", intLiteral.c_str());
+ }
+
+ m_pos++;
+ }
+
+ int length = m_pos - startPos;
+ int size = 0;
+
+ switch (length)
+ {
+ case 2:
+ size = 1;
+ break;
+ case 4:
+ size = 2;
+ break;
+ case 8:
+ size = 4;
+ break;
+ default:
+ {
+ std::string intLiteral(m_buffer + startPos, m_pos - startPos);
+ RaiseError("hex integer literal \"0x%s\" doesn't have length of 2, 4, or 8 digits", intLiteral.c_str());
+ }
+ }
+
+ return{ static_cast<std::uint32_t>(n), size };
+}
+
+StringParser::Integer StringParser::ReadInteger()
+{
+ if (!IsAsciiDigit(m_buffer[m_pos]))
+ RaiseError("expected integer");
+
+ if (m_buffer[m_pos] == '0' && m_buffer[m_pos + 1] == 'x')
+ {
+ m_pos += 2;
+ return ReadHex();
+ }
+
+ return ReadDecimal();
+}
+
+// Skips tabs and spaces.
+void StringParser::SkipWhitespace()
+{
+ while (m_buffer[m_pos] == '\t' || m_buffer[m_pos] == ' ')
+ m_pos++;
+}
|