/*****************************************************************
** The scanner() function does lexical analysis, i.e. it splits **
** the source text into numbers, identifiers, string constants, **
** keywords and special characters, and returns a vector of     **
** tokens ready for parsing.                                     **
*****************************************************************/
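/* For example, the source text "put x;" is turned into three tokens:
   a keyword token (code 11, from the keyword table below), an
   identifier token (code 2) and a character token (code 28 for ';').
   Every token also carries the line and column where it was found. */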
func scanner (code) {
var tokens = [],
      /* Vector whose elements are later translated in place into
         tokens and returned; we will call it the "token stream". */
      lexem = "", /* lexem accumulates the characters of any token that
         consists of more than one character, e.g. an identifier.
         (We will refer to it as the "accumulation stream".) */
      /* Note on streams: we never write to the token stream while the
         accumulation stream is non-empty. */
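      /* For instance, while scanning the number 123 the accumulation
         stream grows as "N", "N1", "N12", "N123"; only the finished
         lexeme "N123" is appended to the token stream. */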
      /* Next come two lookup tables that encode special characters and
         keywords as token codes (see the example after the declarations). */
character = final {':' : 5,
'<' : 13,
'>' : 14,
'(' : 15,
')' : 16,
'{' : 17,
'}' : 18,
'[' : 19,
']' : 20,
'+' : 21,
'-' : 22,
'*' : 23,
'/' : 24,
'&' : 25,
'|' : 26,
'!' : 27,
';' : 28,
'=' : 29,
' ' : 30}, // For encoding special characters.
keyword = final {"program" : 4,
"constant" : 6,
"integer" : 7,
"real" : 8,
"for" : 10,
"cond" : 9,
"else" : 12,
"put" : 11}, // For encoding keywords.
      char0, /* Counter for the loop that follows. */
      lx = 0,
      lnn = 1, // For retaining the line number of a token.
      line_start = 0; // Index of the first character of the current line.
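  /* The two tables above are used as follows: a single character c is
     encoded as character {c} (e.g. character {';'} is 28), and an
     identifier spelling s is a keyword when s is in keyword, in which
     case keyword {s} (e.g. keyword {"put"} is 11) becomes the token
     code.  Token codes 1, 2, 3 and 31 do not appear in the tables;
     they are assigned directly below to string constants, identifiers,
     numbers and invalid characters, respectively. */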
for (char0 = 0; char0 < #code;) { // Build up tokens[].
    if (char0 != 0 && code[char0 - 1] == '\n') {
      lnn++; line_start = char0; // A new source line begins here.
    }
    if (code[char0] == '/') { // The slash might start a comment.
      var nofl = 1; // Tells us whether we should keep the slash.
      char0++; // Look at the next character.
      if (char0 < #code && code[char0] == '*') {
        nofl = 0; // Don't put the slash into tokens[].
        for (; char0 < #code; char0++) // It is a comment; skip it.
          if (code[char0] == '*' && char0 + 1 < #code
              && code[char0 + 1] == '/') { // The comment ends here.
            char0++; // Skip the '*'.
            char0++; // Skip the '/'.
            break; // Comment is over.
          }
      }
      // Preserve the slash if it did not start a comment.
      if (nofl) tokens = ins (tokens, '/', #tokens);
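      // Only "/* ... */"-style comments are recognized, and they do not
      // nest: the first "*/" always closes the comment.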
} else if (code[char0] <= '9' && code[char0] >= '0') {
var realfl = 0; // For support of reals.
lexem = "N";
for (;; char0++) {
        for (; char0 < #code && code[char0] <= '9' && code[char0] >= '0';
             char0++)
          lexem = lexem @ code[char0];
        if (char0 < #code && code[char0] == '.' && ! realfl) {
          realfl++; // To avoid meaningless constructs such as 2.3.56.
          lexem = lexem @ code[char0]; // Keep the decimal point.
          continue; // For scanning reals.
        }
break;
}
tokens = ins (tokens, lexem, #tokens);
lexem = "";
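      /* E.g. the input 42 is accumulated as the lexeme "N42" and later
         becomes a number token with code 3 whose value is "42". */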
} else if (code [char0] <= 'z' &&
code [char0] >= 'a' || code[char0] <= 'Z' &&
code [char0] >= 'A') {
lexem = "I";
      for (; char0 < #code
             && (code [char0] <= 'z' && code [char0] >= 'a'
                 || code [char0] <= 'Z' && code [char0] >= 'A'
                 || code [char0] <= '9' && code [char0] >= '0');
           char0++)
lexem = lexem @ tolower (code [char0]);
tokens = ins (tokens, lexem, #tokens);
lexem = "";
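      /* Letters are converted to lower case, so Program and PROGRAM both
         accumulate as the lexeme "Iprogram"; whether that spelling is a
         keyword or a plain identifier is decided during translation
         below. */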
} else if (char0 < #code && (code [char0] == ' ' ||
code [char0] == '\n' ||
code [char0] == '\t' ||
code [char0] == '\f' ||
code [char0] == '\r' ||
code [char0] == '\v')) {
for (; char0 < #code && (code [char0] == ' ' ||
code [char0] == '\n' ||
code [char0] == '\t' ||
code [char0] == '\f' ||
code [char0] == '\r' ||
code [char0] == '\v');
char0++)
        if (char0 != 0 && code [char0 - 1] == '\n')
          { lnn++; line_start = char0; }
    } else if (code[char0] == '"') {
char0++;
lexem = "S";
for (; char0 < #code && code[char0] != '"'; char0++)
lexem = lexem @ code[char0];
char0++;
tokens = ins (tokens, lexem, #tokens);
lexem = "";
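      /* E.g. "abc" is accumulated as the lexeme "Sabc" and later becomes
         a string token with code 1 whose value is "abc".  Note that no
         escape sequences are recognized inside string constants. */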
} else {
tokens = ins (tokens, code[char0], #tokens);
char0++;
}
    // Translate what we have just scanned into a token.
    if (lx < #tokens &&
        type (tokens[lx]) != char) { // A multi-character lexeme, not a char?
if (tokens[lx][0] == 'S') { // We have a string constant.
var temp = token (1); // Token with code 1.
temp.type0 = tokens[lx][0]; // Add the type specifier.
temp.value = del (tokens[lx], 0, 1); // Put the rest into value.
tokens[lx] = temp; // Put the result into tokens.
} else if (tokens[lx][0] == 'I') { // Identifier.
var temp;
        if (del (tokens[lx], 0, 1) in keyword) {
          // Translate it into a keyword token if it is a keyword.
          temp = token (keyword {del (tokens[lx], 0, 1)});
          // Look the spelling (without the "I" prefix) up in the keyword table.
          temp.type0 = 'K';
          // Give it type "K".
          temp.value = tokens[lx];
          // Keep its spelling for the syntax error manager.
} else {
temp = token (2); // Token - code 2.
temp.type0 = 'I';
temp.value = tokens[lx];
// Same procedure here as for string constants.
}
tokens[lx] = temp;
} else if (tokens[lx][0] == 'N') { // Number.
var temp = token (3); // Token - code 3.
temp.type0 = tokens[lx][0];
temp.value = del (tokens[lx], 0, 1);
tokens[lx] = temp;
} else fatal_error ("Internal scanner error.", tokens, tokens[lx]);
// Scanner malfunction.
} else if (lx < #tokens) { // Single character.
if (!(tokens[lx] in character)) {
        // A bad character for which we have no encoding; the parser
        // would reject it anyway, so report it here.
        put (file_name, ":", lnn, ":", char0 - line_start, ": ");
error ("Lexical error: Invalid character -- no encoding available.",
sprint(tokens[lx]));
/* Generate an invalid token that the parser recognizes, but does
not use in any rule. */
tokens [lx] = token (31);
lx++;
continue;
} else {
var temp = token(character {tokens[lx]});
temp.type0 = 'C';
temp.value = tokens[lx];
tokens[lx] = temp;
}
}
if (lx < #tokens) {
tokens[lx].line_num = lnn;
      tokens[lx].char_num = char0 - line_start;
lx++;
}
}
return tokens; // Make the result available to the client.
}
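
/* A minimal sketch of how scanner() might be driven, assuming the rest
   of this program defines token(), error(), fatal_error() and file_name
   as referenced above, and that the source file has already been read
   into a string:

     var ts = scanner ("cond (x > 5) put x;");
     // ts[0] is the keyword token for "cond" (code 9),
     // ts[1] the character token for '(' (code 15),
     // ts[2] an identifier token (code 2), and so on.
*/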