readme-thing/lexer.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lexer.h"

// Scanning helpers private to this file.  isLetter() and isDigit() are
// defined further down without the `static` keyword; these earlier
// declarations are what give them internal linkage.
static void readChar(Lexer* l);
static char* readIdentifier(Lexer* l);
static char* readNumber(Lexer* l);
static int isLetter(char c);
static int isDigit(char c);

// Token constructors private to this file.  Each returns a heap-allocated
// Token built from the lexer's current line/column.
static Token* newTickToken(Lexer* l);
static Token* newToken(Lexer* l, TokenType tt);
static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);

Lexer*
NewLexer(const char* filename)
{
    FILE* fp;
    fp = fopen(filename, "r");

    if (fp == NULL)
    {
        printf("Can't open the file for some reason\n");
        return NULL;
    }

    fseek(fp, 0, SEEK_END);
    int fileSize = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    printf("fileSize: %d\n", fileSize);

    Lexer* state = malloc(sizeof(Lexer));
    state->rawFile = malloc((sizeof(char) * fileSize) + 1);
    state->rawLen = fileSize;
    state->readPosition = 0;

    size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);
    fclose(fp);

    if (read != fileSize)
    {
        printf("something borked.  only read %d bytes of %d\n", (int)read, fileSize);

        free(state->rawFile);
        free(state);

        return NULL;
    }

    state->rawFile[fileSize] = '\0';
    state->line = 1;

    readChar(state);
    return state;
}

// FreeLexer - release a Lexer created by NewLexer, including its file buffer.
//
// l: lexer to destroy; NULL is accepted and ignored so callers can pass the
//    result of a failed NewLexer() straight through.
void
FreeLexer(Lexer* l)
{
    if (l == NULL)
    {
        return;
    }

    free(l->rawFile);
    free(l);
}

// NextToken - produce the token that starts at the lexer's current character
// and advance past it.
//
// l: lexer to read from.
//
// Returns a heap-allocated Token (release it with FreeToken).  At end of
// input a TT_EOF token is returned on every call.  Returns NULL if a token
// constructor reports failure by returning NULL.
Token*
NextToken(Lexer* l)
{
    // Carriage returns never produce a token.  Skip them with a loop so a
    // long run of '\r' cannot grow the call stack.
    while (l->ch == '\r')
    {
        readChar(l);
    }

    Token* tok;
    switch (l->ch) {
        case '#':
            tok = newToken(l, TT_HASH);
            break;
        case '*':
            tok = newToken(l, TT_ASTERISK);
            break;
        case '_':
            tok = newToken(l, TT_UNDERSCORE);
            break;
        case '-':
            tok = newToken(l, TT_DASH);
            break;
        case '.':
            tok = newToken(l, TT_PERIOD);
            break;
        case '>':
            tok = newToken(l, TT_GT);
            break;
        case '`':
            // Yields either a single backtick or a triple-backtick token.
            tok = newTickToken(l);
            break;
        case '\0':
            tok = newToken(l, TT_EOF);
            break;
        case '\n':
            // Build the token first so it carries the line/column of the
            // newline itself, then move the lexer to the next line.
            tok = newToken(l, TT_NEWLINE);
            l->line++;
            l->column = 0;
            break;
        case ' ':
        case '\t':
            tok = newToken(l, TT_WHITESPACE);
            break;
        default:
            if (isLetter(l->ch))
            {
                // readIdentifier() leaves the lexer on the first character
                // after the word, so return without the trailing readChar().
                int start = l->column;
                char* literal = readIdentifier(l);
                tok = newIdentToken(l, literal, TT_WORD);
                if (tok == NULL)
                {
                    free(literal);
                    return NULL;
                }
                tok->column = start;
                return tok;
            }
            else if (isDigit(l->ch))
            {
                // Same as above: the number reader already consumed input.
                int start = l->column;
                char* literal = readNumber(l);
                tok = newIdentToken(l, literal, TT_NUMBER);
                if (tok == NULL)
                {
                    free(literal);
                    return NULL;
                }
                tok->column = start;
                return tok;
            }
            else
            {
                tok = newToken(l, TT_ILLEGAL);
            }
            break;
    }

    // Consume the final character of the token just produced.
    readChar(l);
    return tok;
}

// readNumber - consume a run of decimal digits starting at the current
// character and return a copy of it.
//
// l: lexer to read from; on return it is positioned on the first character
//    that is not a digit.
//
// Returns a newly allocated, NUL-terminated string owned by the caller, or
// NULL if memory allocation fails (the digits are still consumed).
static
char*
readNumber(Lexer* l)
{
    int start = l->position;
    while (isDigit(l->ch))
    {
        readChar(l);
    }

    size_t len = (size_t)(l->position - start);
    char* out = malloc(len + 1);
    if (out == NULL)
    {
        return NULL;
    }

    memcpy(out, &l->rawFile[start], len);
    out[len] = '\0';
    return out;
}

// readIdentifier - consume a run of letter characters (as defined by
// isLetter) starting at the current character and return a copy of it.
//
// l: lexer to read from; on return it is positioned on the first character
//    that is not a letter.
//
// Returns a newly allocated, NUL-terminated string owned by the caller, or
// NULL if memory allocation fails (the characters are still consumed).
static
char*
readIdentifier(Lexer* l)
{
    int start = l->position;
    while (isLetter(l->ch))
    {
        readChar(l);
    }

    size_t len = (size_t)(l->position - start);
    char* out = malloc(len + 1);
    if (out == NULL)
    {
        return NULL;
    }

    memcpy(out, &l->rawFile[start], len);
    out[len] = '\0';
    return out;
}

// readChar - step the lexer forward by one character.
//
// Bumps the column counter, loads the byte at readPosition into l->ch (or 0
// once readPosition has reached rawLen), records that offset in l->position
// and then moves readPosition one past it.
static
void
readChar(Lexer* l)
{
    l->column += 1;

    // Past the end of the buffer the current character reads as NUL.
    l->ch = (l->readPosition < l->rawLen) ? l->rawFile[l->readPosition] : 0;

    // position names the byte just loaded; readPosition the one after it.
    l->position = l->readPosition++;
}

// isLetter - report whether ch may appear inside a word token.
//
// Returns 1 for ASCII 'a'-'z', 'A'-'Z' and '_', otherwise 0.
int
isLetter(char ch)
{
    if (ch == '_')
    {
        return 1;
    }
    if (ch >= 'a' && ch <= 'z')
    {
        return 1;
    }
    return (ch >= 'A' && ch <= 'Z');
}

// isDigit - report whether ch is an ASCII decimal digit.
//
// Returns 1 for '0'-'9', otherwise 0.
int
isDigit(char ch)
{
    if (ch < '0')
    {
        return 0;
    }
    return (ch <= '9');
}

// FreeToken - release a Token together with its literal string.
//
// t: token to destroy; NULL is accepted and ignored.  The literal is passed
//    to free(), so it must be heap-allocated (or NULL).
void
FreeToken(Token* t)
{
    if (t == NULL)
    {
        return;
    }

    free(t->literal);
    free(t);
}

// newTickToken - build the token for a backtick at the current position.
//
// If the current character begins a run of three backticks, a
// TT_TRIPLEBACKTICK token is returned and two of the three characters are
// consumed here (NextToken consumes the third).  Otherwise a single
// TT_BACKTICK token is returned and nothing is consumed.
//
// l: lexer whose current character is a backtick.
//
// Returns a heap-allocated Token, or NULL if memory allocation fails.
static
Token*
newTickToken(Lexer* l)
{

    printf("backticks @ %d:%d\n", l->line, l->column);

    // Fewer than three bytes remain, so this cannot be a triple backtick.
    if (l->position+3 > l->rawLen) {
        printf("premature EOF parsing ticks\n");
        return newToken(l, TT_BACKTICK);
    }

    // Check the current character and the two that follow it.
    int i;
    for(i = 0; i < 3; i++) {
        char c = l->rawFile[l->position+i];

        if (c != '`') {
            // Report the offset of the byte actually inspected, and print it
            // as an unsigned byte so high-bit values are not sign-extended.
            printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", l->position+i, (unsigned)(unsigned char)c, c);
            return newToken(l, TT_BACKTICK);
        }
    }

    // FreeToken() passes the literal to free(), so it must live on the heap
    // rather than point at a string constant.
    Token* tok = malloc(sizeof *tok);
    char* ticks = malloc(4);
    if (tok == NULL || ticks == NULL) {
        free(ticks);
        free(tok);
        return NULL;
    }
    memcpy(ticks, "```", 4);

    tok->line = l->line;
    tok->column = l->column;
    tok->literal = ticks;
    tok->type = TT_TRIPLEBACKTICK;
    tok->length = 3;
    readChar(l);
    readChar(l);
    return tok;
}

// newToken - build a one-character token from the lexer's current character.
//
// l:  lexer supplying the character, line and column.
// tt: type to assign to the token.
//
// Returns a heap-allocated Token whose literal is a heap-allocated
// one-character string, or NULL if memory allocation fails.  The lexer is
// not advanced.
static
Token*
newToken(Lexer* l,
         TokenType tt)
{
    Token* tok = malloc(sizeof *tok);
    char* text = malloc(2);  // the character plus its terminating '\0'
    if (tok == NULL || text == NULL)
    {
        free(text);
        free(tok);
        return NULL;
    }

    text[0] = l->ch;
    text[1] = '\0';

    tok->type = tt;
    tok->literal = text;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = 1;
    return tok;
}

// newIdentToken - wrap an already-extracted literal in a token.
//
// l:       lexer supplying the line and column recorded in the token.
// literal: heap-allocated, NUL-terminated text; on success the token takes
//          ownership of it.
// tt:      type to assign to the token.
//
// Returns a heap-allocated Token, or NULL if literal is NULL or memory
// allocation fails.  On failure the literal is NOT freed; it remains the
// caller's responsibility.
static
Token*
newIdentToken(Lexer* l,
              char* literal,
              TokenType tt)
{
    if (literal == NULL)
    {
        return NULL;
    }

    Token* tok = malloc(sizeof *tok);
    if (tok == NULL)
    {
        return NULL;
    }

    tok->type = tt;
    tok->literal = literal;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = strlen(literal);
    return tok;
}
Initial commit 2021-07-14 08:15:46 -07:00			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`

			`#include "lexer.h"`

			`static void readChar(Lexer* l);`
			`static char* readIdentifier(Lexer* l);`
			`static char* readNumber(Lexer* l);`
			`static int isLetter(char c);`
			`static int isDigit(char c);`

Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`static Token* newTickToken(Lexer* l);`
Initial commit 2021-07-14 08:15:46 -07:00			`static Token* newToken(Lexer* l, TokenType tt);`
			`static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);`

			`Lexer*`
Fix basic parse issues Fixed parsing the tokens so it actually worked. Only headers are currently implemented, but header nodes are now properly parsed and everything else is properly ignored. - Get the input filename from the command line - Added in a bunch of checks to avoid segfaults - Added some more debug info in places 2023-10-15 15:30:59 -07:00			`NewLexer(const char* filename)`
Initial commit 2021-07-14 08:15:46 -07:00			`{`
			`FILE* fp;`
			`fp = fopen(filename, "r");`

			`if (fp == NULL)`
			`{`
			`printf("Can't open the file for some reason\n");`
			`return NULL;`
			`}`

			`fseek(fp, 0, SEEK_END);`
			`int fileSize = ftell(fp);`
			`fseek(fp, 0, SEEK_SET);`
			`printf("fileSize: %d\n", fileSize);`

			`Lexer* state = malloc(sizeof(Lexer));`
			`state->rawFile = malloc((sizeof(char) * fileSize) + 1);`
			`state->rawLen = fileSize;`
Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`state->readPosition = 0;`
Initial commit 2021-07-14 08:15:46 -07:00
			`size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);`
Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`fclose(fp);`

Initial commit 2021-07-14 08:15:46 -07:00			`if (read != fileSize)`
			`{`
			`printf("something borked. only read %d bytes of %d\n", (int)read, fileSize);`

			`free(state->rawFile);`
			`free(state);`

			`return NULL;`
			`}`

			`state->rawFile[fileSize] = '\0';`
			`state->line = 1;`

			`readChar(state);`
			`return state;`
			`}`

			`void`
			`FreeLexer(Lexer* l)`
			`{`
			`free(l->rawFile);`
			`free(l);`
			`}`

			`Token*`
			`NextToken(Lexer* l)`
			`{`
			`Token* tok;`
			`switch (l->ch) {`
			`case '#':`
			`tok = newToken(l, TT_HASH);`
			`break;`
			`case '*':`
			`tok = newToken(l, TT_ASTERISK);`
			`break;`
			`case '_':`
			`tok = newToken(l, TT_UNDERSCORE);`
			`break;`
			`case '-':`
			`tok = newToken(l, TT_DASH);`
			`break;`
			`case '.':`
			`tok = newToken(l, TT_PERIOD);`
			`break;`
Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`case '>':`
			`tok = newToken(l, TT_GT);`
			`break;`
Initial commit 2021-07-14 08:15:46 -07:00			case '`':
Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`tok = newTickToken(l);`
Initial commit 2021-07-14 08:15:46 -07:00			`break;`
			`case '\0':`
			`tok = newToken(l, TT_EOF);`
			`break;`
			`case '\n':`
			`tok = newToken(l, TT_NEWLINE);`
			`l->line++;`
			`l->column = 0;`
			`break;`
			`case ' ':`
			`case '\t':`
			`tok = newToken(l, TT_WHITESPACE);`
			`break;`
			`case '\r':`
			`readChar(l);`
			`return NextToken(l); // lets GOOOOO`
			`default:`
			`if (isLetter(l->ch))`
			`{`
			`int start = l->column;`
			`char* literal = readIdentifier(l);`
			`tok = newIdentToken(l, literal, TT_WORD);`
			`tok->column = start;`
			`return tok;`
			`}`
			`else if (isDigit(l->ch))`
			`{`
			`int start = l->column;`
			`char* literal = readNumber(l);`
			`tok = newIdentToken(l, literal, TT_NUMBER);`
			`tok->column = start;`
			`return tok;`
			`}`
			`else`
			`{`
			`tok = newToken(l, TT_ILLEGAL);`
			`}`
			`//printf("Invalid token: %X\n", l->ch);`
			`//return NULL;`
			`}`

			`readChar(l);`
			`return tok;`
			`}`

			`static`
			`char*`
			`readNumber(Lexer* l)`
			`{`
			`int position = l->position;`
			`while (isDigit(l->ch))`
			`{`
			`readChar(l);`
			`}`

			`int len = (l->position - position);`
			`char* out = malloc(sizeof(char) * len + 1);`
			`memcpy(out, &l->rawFile[position], len);`
			`out[len] = '\0';`
			`return out;`
			`}`

			`static`
			`char*`
			`readIdentifier(Lexer* l)`
			`{`
			`int position = l->position;`
			`while (isLetter(l->ch))`
			`{`
			`readChar(l);`
			`}`

			`int len = (l->position - position);`
			`char* out = malloc(sizeof(char) * len + 1);`
			`memcpy(out, &l->rawFile[position], len);`
			`out[len] = '\0';`
			`return out;`
			`}`

			`static`
			`void`
			`readChar(Lexer* l)`
			`{`
			`l->column++;`
			`if (l->readPosition >= l->rawLen)`
			`{`
			`l->ch = 0;`
			`}`
			`else`
			`{`
			`l->ch = l->rawFile[l->readPosition];`
			`}`

			`l->position = l->readPosition;`
			`l->readPosition++;`
			`}`

			`int`
			`isLetter(char ch)`
			`{`
			`return (('a' <= ch && ch <= 'z') \|\| ('A' <= ch && ch <= 'Z') \|\| ch == '_');`
			`}`

			`int`
			`isDigit(char ch)`
			`{`
			`return ('0' <= ch && ch <= '9');`
			`}`

			`void`
			`FreeToken(Token* t)`
			`{`
			`free(t->literal);`
			`free(t);`
			`}`

Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`static`
			`Token*`
			`newTickToken(Lexer* l)`
			`{`

			`printf("backticks @ %d:%d\n", l->line, l->column);`

			`if (l->position+3 > l->rawLen) {`
			`printf("premature EOF parsing ticks\n");`
			`return newToken(l, TT_BACKTICK);`
			`}`

			`// peek up to two more characters`
			`int i;`
			`for(i = 0; i < 3; i++) {`

			if (l->rawFile[l->position+i] != '`') {
			`printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", l->readPosition+i, l->rawFile[l->position+i], l->rawFile[l->position+i]);`
			`return newToken(l, TT_BACKTICK);`
			`}`
			`}`

			`Token* tok = malloc(sizeof(Token));`
			`tok->line = l->line;`
			`tok->column = l->column;`
			tok->literal = "```";
			`tok->type = TT_TRIPLEBACKTICK;`
			`tok->length = 3;`
			`readChar(l);`
			`readChar(l);`
			`return tok;`
			`}`

Initial commit 2021-07-14 08:15:46 -07:00			`static`
			`Token*`
			`newToken(Lexer* l,`
			`TokenType tt)`
			`{`
			`Token* tok = malloc(sizeof(Token));`
			`char* nc = malloc(sizeof(char)+1);`
			`*nc = l->ch;`
			`nc[1] = '\0';`
			`tok->type = tt;`
			`tok->literal = nc;`
			`tok->line = l->line;`
			`tok->column = l->column;`
Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`tok->length = 1;`
Initial commit 2021-07-14 08:15:46 -07:00			`return tok;`
			`}`

			`static`
			`Token*`
			`newIdentToken(Lexer* l,`
			`char* literal,`
			`TokenType tt)`
			`{`
			`Token* tok = malloc(sizeof(Token));`
			`tok->type = tt;`
			`tok->literal = literal;`
			`tok->line = l->line;`
			`tok->column = l->column;`
Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). 2023-10-15 17:55:12 -07:00			`tok->length = strlen(literal);`
Initial commit 2021-07-14 08:15:46 -07:00			`return tok;`
			`}`