/*****************************************************************
** The scanner() function does lexical analysis, i.e. it splits **
** the source text into numbers, identifiers, string constants, **
** keywords and special characters, and returns a vector of     **
** tokens ready for parsing.                                     **
*****************************************************************/
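/* For example, the source text "put x;" is turned into three tokens:
   a keyword token (code 11, from the keyword table below), an
   identifier token (code 2) and a character token (code 28 for ';').
   Every token also carries the line and column where it was found. */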
func scanner (code) {
var tokens = [],
      /* Vector whose elements are later translated in place into
         tokens and returned; we will call it the "token stream". */
      lexem = "", /* lexem accumulates the characters of any token that
         consists of more than one character, e.g. an identifier.
         (We will refer to it as the "accumulation stream".) */
      /* Note on streams: we never write to the token stream while the
         accumulation stream is non-empty. */
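      /* For instance, while scanning the number 123 the accumulation
         stream grows as "N", "N1", "N12", "N123"; only the finished
         lexeme "N123" is appended to the token stream. */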
      /* Next come two lookup tables that encode special characters and
         keywords as token codes (see the example after the declarations). */
character = final {':' : 5,
'<' : 13,
'>' : 14,
'(' : 15,
')' : 16,
'{' : 17,
'}' : 18,
'[' : 19,
']' : 20,
'+' : 21,
'-' : 22,
'*' : 23,
'/' : 24,
'&' : 25,
'|' : 26,
'!' : 27,
';' : 28,
'=' : 29,
' ' : 30}, // For encoding special characters.
keyword = final {"program" : 4,
"constant" : 6,
"integer" : 7,
"real" : 8,
"for" : 10,
"cond" : 9,
"else" : 12,
"put" : 11}, // For encoding keywords.
      char0, /* Counter for the loop that follows. */
      lx = 0,
      lnn = 1, // For retaining the line number of a token.
      line_start = 0; // Index of the first character of the current line.
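  /* The two tables above are used as follows: a single character c is
     encoded as character {c} (e.g. character {';'} is 28), and an
     identifier spelling s is a keyword when s is in keyword, in which
     case keyword {s} (e.g. keyword {"put"} is 11) becomes the token
     code.  Token codes 1, 2, 3 and 31 do not appear in the tables;
     they are assigned directly below to string constants, identifiers,
     numbers and invalid characters, respectively. */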
for (char0 = 0; char0 < #code;) { // Build up tokens[].
    if (char0 != 0 && code[char0 - 1] == '\n') {
      lnn++; line_start = char0; // A new source line begins here.
    }
    if (code[char0] == '/') { // The slash might start a comment.
      var nofl = 1; // Tells us whether we should keep the slash.
      char0++; // Look at the next character.
      if (char0 < #code && code[char0] == '*') {
        nofl = 0; // Don't put the slash into tokens[].
        for (; char0 < #code; char0++) // It is a comment; skip it.
          if (code[char0] == '*' && char0 + 1 < #code
              && code[char0 + 1] == '/') { // The comment ends here.
            char0++; // Skip the '*'.
            char0++; // Skip the '/'.
            break; // Comment is over.
          }
      }
      // Preserve the slash if it did not start a comment.
      if (nofl) tokens = ins (tokens, '/', #tokens);
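      // Only "/* ... */"-style comments are recognized, and they do not
      // nest: the first "*/" always closes the comment.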
} else if (code[char0] <= '9' && code[char0] >= '0') {
var realfl = 0; // For support of reals.
lexem = "N";
for (;; char0++) {
        for (; char0 < #code && code[char0] <= '9' && code[char0] >= '0';
             char0++)
          lexem = lexem @ code[char0];
        if (char0 < #code && code[char0] == '.' && ! realfl) {
          realfl++; // To avoid meaningless constructs such as 2.3.56.
          lexem = lexem @ code[char0]; // Keep the decimal point.
          continue; // For scanning reals.
        }
break;
}
tokens = ins (tokens, lexem, #tokens);
lexem = "";
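      /* E.g. the input 42 is accumulated as the lexeme "N42" and later
         becomes a number token with code 3 whose value is "42". */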
} else if (code [char0] <= 'z' &&
code [char0] >= 'a' || code[char0] <= 'Z' &&
code [char0] >= 'A') {
lexem = "I";
      for (; char0 < #code
             && (code [char0] <= 'z' && code [char0] >= 'a'
                 || code [char0] <= 'Z' && code [char0] >= 'A'
                 || code [char0] <= '9' && code [char0] >= '0');
           char0++)
lexem = lexem @ tolower (code [char0]);
tokens = ins (tokens, lexem, #tokens);
lexem = "";
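      /* Letters are converted to lower case, so Program and PROGRAM both
         accumulate as the lexeme "Iprogram"; whether that spelling is a
         keyword or a plain identifier is decided during translation
         below. */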
} else if (char0 < #code && (code [char0] == ' ' ||
code [char0] == '\n' ||
code [char0] == '\t' ||
code [char0] == '\f' ||
code [char0] == '\r' ||
code [char0] == '\v')) {
for (; char0 < #code && (code [char0] == ' ' ||
code [char0] == '\n' ||
code [char0] == '\t' ||
code [char0] == '\f' ||
code [char0] == '\r' ||
code [char0] == '\v');
char0++)
        if (char0 != 0 && code [char0 - 1] == '\n')
          { lnn++; line_start = char0; }
    } else if (code[char0] == '"') {
char0++;
lexem = "S";
for (; char0 < #code && code[char0] != '"'; char0++)
lexem = lexem @ code[char0];
char0++;
tokens = ins (tokens, lexem, #tokens);
lexem = "";
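      /* E.g. "abc" is accumulated as the lexeme "Sabc" and later becomes
         a string token with code 1 whose value is "abc".  Note that no
         escape sequences are recognized inside string constants. */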
} else {
tokens = ins (tokens, code[char0], #tokens);
char0++;
}
    // Translate what we have just scanned into a token.
    if (lx < #tokens &&
        type (tokens[lx]) != char) { // A multi-character lexeme, not a char?
if (tokens[lx][0] == 'S') { // We have a string constant.
var temp = token (1); // Token with code 1.
temp.type0 = tokens[lx][0]; // Add the type specifier.
temp.value = del (tokens[lx], 0, 1); // Put the rest into value.
tokens[lx] = temp; // Put the result into tokens.
} else if (tokens[lx][0] == 'I') { // Identifier.
var temp;
        if (del (tokens[lx], 0, 1) in keyword) {
          // Translate it into a keyword token if it is a keyword.
          temp = token (keyword {del (tokens[lx], 0, 1)});
          // Look the spelling (without the "I" prefix) up in the keyword table.
          temp.type0 = 'K';
          // Give it type "K".
          temp.value = tokens[lx];
          // Keep its spelling for the syntax error manager.
} else {
temp = token (2); // Token - code 2.
temp.type0 = 'I';
temp.value = tokens[lx];
// Same procedure here as for string constants.
}
tokens[lx] = temp;
} else if (tokens[lx][0] == 'N') { // Number.
var temp = token (3); // Token - code 3.
temp.type0 = tokens[lx][0];
temp.value = del (tokens[lx], 0, 1);
tokens[lx] = temp;
} else fatal_error ("Internal scanner error.", tokens, tokens[lx]);
// Scanner malfunction.
} else if (lx < #tokens) { // Single character.
if (!(tokens[lx] in character)) {
        // A bad character for which we have no encoding; the parser
        // would reject it anyway, so report it here.
        put (file_name, ":", lnn, ":", char0 - line_start, ": ");
error ("Lexical error: Invalid character -- no encoding available.",
sprint(tokens[lx]));
/* Generate an invalid token that the parser recognizes, but does
not use in any rule. */
tokens [lx] = token (31);
lx++;
continue;
} else {
var temp = token(character {tokens[lx]});
temp.type0 = 'C';
temp.value = tokens[lx];
tokens[lx] = temp;
}
}
if (lx < #tokens) {
tokens[lx].line_num = lnn;
      tokens[lx].char_num = char0 - line_start;
lx++;
}
}
return tokens; // Make the result available to the client.
}
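
/* A minimal sketch of how scanner() might be driven, assuming the rest
   of this program defines token(), error(), fatal_error() and file_name
   as referenced above, and that the source file has already been read
   into a string:

     var ts = scanner ("cond (x > 5) put x;");
     // ts[0] is the keyword token for "cond" (code 9),
     // ts[1] the character token for '(' (code 15),
     // ts[2] an identifier token (code 2), and so on.
*/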