HelenOS sources

root/uspace/app/sbi/src/lex.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. lclass_print
  2. lem_print
  3. lem_print_coords
  4. lex_init
  5. lex_next
  6. lex_get_current
  7. lex_peek_prev
  8. lex_touch
  9. lex_read_try
  10. lex_word
  11. lex_char
  12. lex_number
  13. lex_string
  14. lex_char_string_core
  15. lex_skip_comment
  16. lex_skip_ws
  17. is_wstart
  18. is_wcont
  19. is_digit
  20. digit_value

/*
 * Copyright (c) 2011 Jiri Svoboda
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @file Lexer (lexical analyzer).
 *
 * Consumes a text file and produces a sequence of lexical elements (lems).
 */

#include <stdio.h>
#include <stdlib.h>
#include "bigint.h"
#include "cspan.h"
#include "mytypes.h"
#include "input.h"
#include "os/os.h"
#include "strtab.h"

#include "lex.h"

#define TAB_WIDTH 8

typedef enum {
        cs_chr,
        cs_str
} chr_str_t;

static void lex_touch(lex_t *lex);
static bool_t lex_read_try(lex_t *lex);

static void lex_skip_comment(lex_t *lex);
static void lex_skip_ws(lex_t *lex);
static bool_t is_wstart(char c);
static bool_t is_wcont(char c);
static bool_t is_digit(char c);
static void lex_word(lex_t *lex);
static void lex_char(lex_t *lex);
static void lex_number(lex_t *lex);
static void lex_string(lex_t *lex);
static void lex_char_string_core(lex_t *lex, chr_str_t cs);
static int digit_value(char c);

/* Note: This imposes an implementation limit on identifier length. */
#define IBUF_SIZE 128
static char ident_buf[IBUF_SIZE + 1];

/* XXX This imposes an implementation limit on string literal length. */
#define SLBUF_SIZE 128
static char strlit_buf[SLBUF_SIZE + 1];

/** Lclass-string pair */
struct lc_name {
        lclass_t lclass;
        const char *name;
};

/** Keyword names. Used both for printing and recognition. */
static struct lc_name keywords[] = {
        { lc_and,       "and" },
        { lc_as,        "as" },
        { lc_bool,      "bool" },
        { lc_break,     "break" },
        { lc_builtin,   "builtin" },
        { lc_char,      "char" },
        { lc_class,     "class" },
        { lc_deleg,     "deleg" },
        { lc_do,        "do" },
        { lc_elif,      "elif" },
        { lc_else,      "else" },
        { lc_end,       "end" },
        { lc_enum,      "enum" },
        { lc_except,    "except" },
        { lc_false,     "false" },
        { lc_finally,   "finally" },
        { lc_for,       "for" },
        { lc_fun,       "fun" },
        { lc_get,       "get" },
        { lc_if,        "if" },
        { lc_in,        "in" },
        { lc_int,       "int" },
        { lc_interface, "interface" },
        { lc_is,        "is" },
        { lc_new,       "new" },
        { lc_not,       "not" },
        { lc_nil,       "nil" },
        { lc_or,        "or" },
        { lc_override,  "override" },
        { lc_packed,    "packed" },
        { lc_private,   "private" },
        { lc_prop,      "prop" },
        { lc_protected, "protected" },
        { lc_public,    "public" },
        { lc_raise,     "raise" },
        { lc_resource,  "resource" },
        { lc_return,    "return" },
        { lc_self,      "self" },
        { lc_set,       "set" },
        { lc_static,    "static" },
        { lc_string,    "string" },
        { lc_struct,    "struct" },
        { lc_switch,    "switch" },
        { lc_then,      "then" },
        { lc_this,      "this" },
        { lc_true,      "true" },
        { lc_var,       "var" },
        { lc_with,      "with" },
        { lc_when,      "when" },
        { lc_while,     "while" },
        { lc_yield,     "yield" },

        { 0,            NULL }
};

/** Other simple lclasses. Only used for printing. */
static struct lc_name simple_lc[] = {
        { lc_invalid,   "INVALID" },
        { lc_eof,       "EOF" },

        /* Operators */
        { lc_period,    "." },
        { lc_slash,     "/" },
        { lc_lparen,    "(" },
        { lc_rparen,    ")" },
        { lc_lsbr,      "[" },
        { lc_rsbr,      "]" },
        { lc_equal,     "==" },
        { lc_notequal,  "!=" },
        { lc_lt,        "<" },
        { lc_gt,        ">" },
        { lc_lt_equal,  "<=" },
        { lc_gt_equal,  ">=" },
        { lc_assign,    "=" },
        { lc_plus,      "+" },
        { lc_minus,     "-" },
        { lc_mult,      "*" },
        { lc_increase,  "+=" },

        /* Punctuators */
        { lc_comma,     "," },
        { lc_colon,     ":" },
        { lc_scolon,    ";" },

        { 0,            NULL },
};

/** Print lclass value.
 *
 * Prints lclass (lexical element class) value in human-readable form
 * (for debugging).
 *
 * @param lclass        Lclass value for display.
 */
void lclass_print(lclass_t lclass)
{
        struct lc_name *dp;

        dp = keywords;
        while (dp->name != NULL) {
                if (dp->lclass == lclass) {
                        printf("%s", dp->name);
                        return;
                }
                ++dp;
        }

        dp = simple_lc;
        while (dp->name != NULL) {
                if (dp->lclass == lclass) {
                        printf("%s", dp->name);
                        return;
                }
                ++dp;
        }

        switch (lclass) {
        case lc_ident:
                printf("ident");
                break;
        case lc_lit_int:
                printf("int_literal");
                break;
        case lc_lit_string:
                printf("string_literal");
                break;
        default:
                printf("<unknown?>");
                break;
        }
}

/** Print lexical element.
 *
 * Prints lexical element in human-readable form (for debugging).
 *
 * @param lem           Lexical element for display.
 */
void lem_print(lem_t *lem)
{
        lclass_print(lem->lclass);

        switch (lem->lclass) {
        case lc_ident:
                printf("('%s')", strtab_get_str(lem->u.ident.sid));
                break;
        case lc_lit_int:
                printf("(");
                bigint_print(&lem->u.lit_int.value);
                printf(")");
                break;
        case lc_lit_string:
                printf("(\"%s\")", lem->u.lit_string.value);
        default:
                break;
        }
}

/** Print lem coordinates.
 *
 * Print the coordinates (line number, column number) of a lexical element.
 *
 * @param lem           Lexical element for coordinate printing.
 */
void lem_print_coords(lem_t *lem)
{
        cspan_print(lem->cspan);
}

/** Initialize lexer instance.
 *
 * @param lex           Lexer object to initialize.
 * @param input         Input to associate with lexer.
 */
void lex_init(lex_t *lex, struct input *input)
{
        errno_t rc;

        lex->input = input;

        rc = input_get_line(lex->input, &lex->inbuf);
        if (rc != EOK) {
                printf("Error reading input.\n");
                exit(1);
        }

        lex->ibp = lex->inbuf;
        lex->col_adj = 0;
        lex->prev_valid = b_false;
        lex->current_valid = b_true;
}

/** Advance to next lexical element.
 *
 * The new element is read in lazily then it is actually accessed.
 *
 * @param lex           Lexer object.
 */
void lex_next(lex_t *lex)
{
        /* Make sure the current lem has already been read in. */
        lex_touch(lex);

        /* Force a new lem to be read on next access. */
        lex->current_valid = b_false;
}

/** Get current lem.
 *
 * The returned pointer is invalidated by next call to lex_next()
 *
 * @param lex           Lexer object.
 * @return              Pointer to current lem. Owned by @a lex and only valid
 *                      until next call to lex_xxx().
 */
lem_t *lex_get_current(lex_t *lex)
{
        lex_touch(lex);
        return &lex->current;
}

/** Get previous lem if valid.
 *
 * The returned pointer is invalidated by next call to lex_next()
 *
 * @param lex           Lexer object.
 * @return              Pointer to previous lem. Owned by @a lex and only valid
 *                      until next call to lex_xxx().
 */
lem_t *lex_peek_prev(lex_t *lex)
{
        if (lex->current_valid == b_false) {
                /*
                 * This means the head is advanced but next lem was not read.
                 * Thus the previous lem is still in @a current.
                 */
                return &lex->current;
        }

        if (lex->prev_valid != b_true) {
                /* Looks like we are still at the first lem. */
                return NULL;
        }

        /*
         * Current lem has been read in. Thus the previous lem was moved to
         * @a previous.
         */
        return &lex->prev;
}

/** Read in the current lexical element (unless already read in).
 *
 * @param lex           Lexer object.
 */
static void lex_touch(lex_t *lex)
{
        bool_t got_lem;

        if (lex->current_valid == b_true)
                return;

        /* Copy previous lem */
        lex->prev = lex->current;
        lex->prev_valid = b_true;

        do {
                got_lem = lex_read_try(lex);
        } while (got_lem == b_false);

        lex->current_valid = b_true;
}

/** Try reading next lexical element.
 *
 * Attemps to read the next lexical element. In some cases (such as a comment)
 * this function will need to give it another try and returns @c b_false
 * in such case.
 *
 * @param lex           Lexer object.
 * @return              @c b_true on success or @c b_false if it needs
 *                      restarting. On success the lem is stored to
 *                      the current lem in @a lex.
 */
static bool_t lex_read_try(lex_t *lex)
{
        char *bp, *lsp;
        int line0, col0;

        lex_skip_ws(lex);

        /*
         * Record lem coordinates. Line number we already have. For column
         * number we start with position in the input buffer. This works
         * for all characters except tab. Thus we keep track of tabs
         * separately using col_adj.
         */
        line0 = input_get_line_no(lex->input);
        col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);

        lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);

        lsp = lex->ibp;
        bp = lex->ibp;

        if (bp[0] == '\0') {
                /* End of input */
                lex->current.lclass = lc_eof;
                goto finish;
        }

        if (is_wstart(bp[0])) {
                lex_word(lex);
                goto finish;
        }

        if (bp[0] == '\'') {
                lex_char(lex);
                goto finish;
        }

        if (is_digit(bp[0])) {
                lex_number(lex);
                goto finish;
        }

        if (bp[0] == '"') {
                lex_string(lex);
                goto finish;
        }

        if (bp[0] == '-' && bp[1] == '-') {
                lex_skip_comment(lex);

                /* Compute ending column number */
                lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;

                /* Try again */
                return b_false;
        }

        switch (bp[0]) {
        case ',':
                lex->current.lclass = lc_comma;
                ++bp;
                break;
        case ':':
                lex->current.lclass = lc_colon;
                ++bp;
                break;
        case ';':
                lex->current.lclass = lc_scolon;
                ++bp;
                break;

        case '.':
                lex->current.lclass = lc_period;
                ++bp;
                break;
        case '/':
                lex->current.lclass = lc_slash;
                ++bp;
                break;
        case '(':
                lex->current.lclass = lc_lparen;
                ++bp;
                break;
        case ')':
                lex->current.lclass = lc_rparen;
                ++bp;
                break;
        case '[':
                lex->current.lclass = lc_lsbr;
                ++bp;
                break;
        case ']':
                lex->current.lclass = lc_rsbr;
                ++bp;
                break;

        case '=':
                if (bp[1] == '=') {
                        lex->current.lclass = lc_equal;
                        bp += 2;
                        break;
                }
                lex->current.lclass = lc_assign;
                ++bp;
                break;

        case '!':
                if (bp[1] == '=') {
                        lex->current.lclass = lc_notequal;
                        bp += 2;
                        break;
                }
                goto invalid;

        case '+':
                if (bp[1] == '=') {
                        lex->current.lclass = lc_increase;
                        bp += 2;
                        break;
                }
                lex->current.lclass = lc_plus;
                ++bp;
                break;

        case '-':
                lex->current.lclass = lc_minus;
                ++bp;
                break;

        case '*':
                lex->current.lclass = lc_mult;
                ++bp;
                break;

        case '<':
                if (bp[1] == '=') {
                        lex->current.lclass = lc_lt_equal;
                        bp += 2;
                        break;
                }
                lex->current.lclass = lc_lt;
                ++bp;
                break;

        case '>':
                if (bp[1] == '=') {
                        lex->current.lclass = lc_gt_equal;
                        bp += 2;
                        break;
                }
                lex->current.lclass = lc_gt;
                ++bp;
                break;

        default:
                goto invalid;
        }

        lex->ibp = bp;

finish:
        /* Compute ending column number */
        lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
        return b_true;

invalid:
        lex->current.lclass = lc_invalid;
        ++bp;
        lex->ibp = bp;

        return b_true;
}

/** Lex a word (identifier or keyword).
 *
 * Read in a word. This may later turn out to be a keyword or a regular
 * identifier. It is stored in the current lem in @a lex.
 *
 * @param lex           Lexer object.
 */
static void lex_word(lex_t *lex)
{
        struct lc_name *dp;
        char *bp;
        int idx;

        bp = lex->ibp;
        ident_buf[0] = bp[0];
        idx = 1;

        while (is_wcont(bp[idx])) {
                if (idx >= IBUF_SIZE) {
                        printf("Error: Identifier too long.\n");
                        exit(1);
                }

                ident_buf[idx] = bp[idx];
                ++idx;
        }

        lex->ibp = bp + idx;

        ident_buf[idx] = '\0';

        dp = keywords;
        while (dp->name != NULL) {
                if (os_str_cmp(ident_buf, dp->name) == 0) {
                        /* Match */
                        lex->current.lclass = dp->lclass;
                        return;
                }
                ++dp;
        }

        /* No matching keyword -- it must be an identifier. */
        lex->current.lclass = lc_ident;
        lex->current.u.ident.sid = strtab_get_sid(ident_buf);
}

/** Lex a character literal.
 *
 * Reads in a character literal and stores it in the current lem in @a lex.
 *
 * @param lex           Lexer object.
 */
static void lex_char(lex_t *lex)
{
        size_t len;
        int char_val;

        lex_char_string_core(lex, cs_chr);

        len = os_str_length(strlit_buf);
        if (len != 1) {
                printf("Character literal should contain one character, "
                    "but contains %u characters instead.\n", (unsigned) len);
                exit(1);
        }

        os_str_get_char(strlit_buf, 0, &char_val);
        lex->current.lclass = lc_lit_char;
        bigint_init(&lex->current.u.lit_char.value, char_val);
}

/** Lex a numeric literal.
 *
 * Reads in a numeric literal and stores it in the current lem in @a lex.
 *
 * @param lex           Lexer object.
 */
static void lex_number(lex_t *lex)
{
        char *bp;
        bigint_t value;
        bigint_t dgval;
        bigint_t base;
        bigint_t tprod;

        bp = lex->ibp;

        bigint_init(&value, 0);
        bigint_init(&base, 10);

        while (is_digit(*bp)) {
                bigint_mul(&value, &base, &tprod);
                bigint_init(&dgval, digit_value(*bp));

                bigint_destroy(&value);
                bigint_add(&tprod, &dgval, &value);
                bigint_destroy(&tprod);
                bigint_destroy(&dgval);

                ++bp;
        }

        bigint_destroy(&base);

        lex->ibp = bp;

        lex->current.lclass = lc_lit_int;
        bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
}

/** Lex a string literal.
 *
 * Reads in a string literal and stores it in the current lem in @a lex.
 *
 * @param lex           Lexer object.
 */
static void lex_string(lex_t *lex)
{
        lex_char_string_core(lex, cs_str);

        lex->current.lclass = lc_lit_string;
        lex->current.u.lit_string.value = os_str_dup(strlit_buf);
}

static void lex_char_string_core(lex_t *lex, chr_str_t cs)
{
        char *bp;
        int sidx, didx;
        char term;
        const char *descr, *cap_descr;
        char spchar;

        /* Make compiler happy */
        term = '\0';
        descr = NULL;
        cap_descr = NULL;

        switch (cs) {
        case cs_chr:
                term = '\'';
                descr = "character";
                cap_descr = "Character";
                break;
        case cs_str:
                term = '"';
                descr = "string";
                cap_descr = "String";
                break;
        }

        bp = lex->ibp + 1;
        sidx = didx = 0;

        while (bp[sidx] != term) {
                if (didx >= SLBUF_SIZE) {
                        printf("Error: %s literal too long.\n", cap_descr);
                        exit(1);
                }

                if (bp[sidx] == '\0') {
                        printf("Error: Unterminated %s literal.\n", descr);
                        exit(1);
                }

                if (bp[sidx] == '\\') {
                        switch (bp[sidx + 1]) {
                        case '\\':
                                spchar = '\\';
                                break;
                        case '\'':
                                spchar = '\'';
                                break;
                        case '"':
                                spchar = '"';
                                break;
                        case 'n':
                                spchar = '\n';
                                break;
                        case 't':
                                spchar = '\t';
                                break;
                        default:
                                printf("Error: Unknown character escape sequence.\n");
                                exit(1);
                        }

                        strlit_buf[didx] = spchar;
                        ++didx;
                        sidx += 2;
                } else {
                        strlit_buf[didx] = bp[sidx];
                        ++sidx;
                        ++didx;
                }
        }

        lex->ibp = bp + sidx + 1;

        strlit_buf[didx] = '\0';
}

/** Lex a single-line comment.
 *
 * This does not produce any lem. The comment is just skipped.
 *
 * @param lex           Lexer object.
 */
static void lex_skip_comment(lex_t *lex)
{
        char *bp;

        bp = lex->ibp + 2;

        while (*bp != '\n' && *bp != '\0') {
                ++bp;
        }

        lex->ibp = bp;
}

/** Skip whitespace characters.
 *
 * This does not produce any lem. The whitespace is just skipped.
 *
 * @param lex           Lexer object.
 */
static void lex_skip_ws(lex_t *lex)
{
        char *bp;
        errno_t rc;

        bp = lex->ibp;

        while (b_true) {
                while (*bp == ' ' || *bp == '\t') {
                        if (*bp == '\t') {
                                /* XXX This is too simplifed. */
                                lex->col_adj += (TAB_WIDTH - 1);
                        }
                        ++bp;
                }

                if (*bp != '\n')
                        break;

                /* Read next line */
                rc = input_get_line(lex->input, &lex->inbuf);
                if (rc != EOK) {
                        printf("Error reading input.\n");
                        exit(1);
                }

                bp = lex->inbuf;
                lex->col_adj = 0;
        }

        lex->ibp = bp;
}

/** Determine if character can start a word.
 *
 * @param c     Character.
 * @return      @c b_true if @a c can start a word, @c b_false otherwise.
 */
static bool_t is_wstart(char c)
{
        return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
            (c == '_');
}

/** Determine if character can continue a word.
 *
 * @param c     Character.
 * @return      @c b_true if @a c can start continue word, @c b_false
 *              otherwise.
 */
static bool_t is_wcont(char c)
{
        return is_digit(c) || is_wstart(c);
}

/** Determine if character is a numeric digit.
 *
 * @param c     Character.
 * @return      @c b_true if @a c is a numeric digit, @c b_false otherwise.
 */
static bool_t is_digit(char c)
{
        return ((c >= '0') && (c <= '9'));
}

/** Determine numeric value of digit character.
 *
 * @param c     Character, must be a valid decimal digit.
 * @return      Value of the digit (0-9).
 */
static int digit_value(char c)
{
        return (c - '0');
}

/* [<][>][^][v][top][bottom][index][help] */
HelenOS homepage, sources at GitHub