#include #include #include #include "lexer.h" static char peekChar(Lexer* l); static void readChar(Lexer* l); static char* readIdentifier(Lexer* l); static char* readNumber(Lexer* l); static int isLetter(char c); static int isDigit(char c); static Token* newTickToken(Lexer* l); static Token* newToken(Lexer* l, TokenType tt); static Token* newIdentToken(Lexer* l, char* literal, TokenType tt); static Token* newWhitespaceToken(Lexer* l); Lexer* NewLexer(const char* filename) { FILE* fp; fp = fopen(filename, "r"); if (fp == NULL) { printf("Can't open the file for some reason\n"); return NULL; } fseek(fp, 0, SEEK_END); int fileSize = ftell(fp); fseek(fp, 0, SEEK_SET); printf("fileSize: %d\n", fileSize); Lexer* state = malloc(sizeof(Lexer)); state->rawFile = malloc((sizeof(char) * fileSize) + 1); state->rawLen = fileSize; state->readPosition = 0; size_t read = fread(state->rawFile, sizeof(char), fileSize, fp); fclose(fp); if (read != fileSize) { printf("something borked. only read %d bytes of %d\n", (int)read, fileSize); free(state->rawFile); free(state); return NULL; } state->rawFile[fileSize] = '\0'; state->line = 1; state->column = 0; readChar(state); return state; } void FreeLexer(Lexer* l) { free(l->rawFile); free(l); } Token* NextToken(Lexer* l) { Token* tok; switch (l->ch) { case '#': tok = newToken(l, TT_HASH); break; case '*': tok = newToken(l, TT_ASTERISK); break; case '_': tok = newToken(l, TT_UNDERSCORE); break; case '-': tok = newToken(l, TT_DASH); break; case '.': tok = newToken(l, TT_PERIOD); break; case '>': tok = newToken(l, TT_GT); break; case '`': tok = newTickToken(l); break; case '\0': tok = newToken(l, TT_EOF); break; case '\n': tok = newToken(l, TT_NEWLINE); l->line++; l->column = 0; break; case ' ': case '\t': tok = newWhitespaceToken(l); break; case '\r': readChar(l); return NextToken(l); // lets GOOOOO default: if (isLetter(l->ch)) { int start = l->column; char* literal = readIdentifier(l); tok = newIdentToken(l, literal, TT_WORD); tok->column = start; return tok; } else if (isDigit(l->ch)) { int start = l->column; char* literal = readNumber(l); tok = newIdentToken(l, literal, TT_NUMBER); tok->column = start; return tok; } else { tok = newToken(l, TT_ILLEGAL); } //printf("Invalid token: %X\n", l->ch); //return NULL; } readChar(l); return tok; } static Token* newWhitespaceToken(Lexer* l) { Token* tok = malloc(sizeof(Token)); tok->line = l->line; tok->column = l->column; tok->type = TT_WHITESPACE; int position = l->position; // grab the char so we can use this funciton for both // spaces and tabs. char ch = l->ch; while (peekChar(l) == ch){ readChar(l); } int count = l->position - position+1; tok->literal = malloc(sizeof(char)*count+1); for (int i = 0; i < count; i++) { tok->literal[i] = ch; } tok->literal[count] = '\0'; tok->length = count; tok->next = NULL; return tok; } static char* readNumber(Lexer* l) { int position = l->position; while (isDigit(l->ch)) { readChar(l); } int len = (l->position - position); char* out = malloc(sizeof(char) * len + 1); memcpy(out, &l->rawFile[position], len); out[len] = '\0'; return out; } static char* readIdentifier(Lexer* l) { int position = l->position; while (isLetter(l->ch)) { readChar(l); } int len = (l->position - position); char* out = malloc(sizeof(char) * len + 1); memcpy(out, &l->rawFile[position], len); out[len] = '\0'; return out; } static char peekChar(Lexer* l) { if (l->readPosition >= l->rawLen) { return '\0'; } return l->rawFile[l->readPosition]; } static void readChar(Lexer* l) { l->column++; if (l->readPosition >= l->rawLen) { l->ch = 0; } else { l->ch = l->rawFile[l->readPosition]; } l->position = l->readPosition; l->readPosition++; } int isLetter(char ch) { return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ch == '_'); } int isDigit(char ch) { return ('0' <= ch && ch <= '9'); } static Token* newTickToken(Lexer* l) { if (l->position+3 > l->rawLen) { printf("premature EOF parsing ticks\n"); return newToken(l, TT_BACKTICK); } // peek up to two more characters int i; for(i = 0; i < 3; i++) { if (l->rawFile[l->position+i] != '`') { return newToken(l, TT_BACKTICK); } } Token* tok = malloc(sizeof(Token)); tok->line = l->line; tok->column = l->column; tok->literal = "```"; tok->type = TT_TRIPLEBACKTICK; tok->length = 3; tok->next = NULL; readChar(l); readChar(l); return tok; } static Token* newToken(Lexer* l, TokenType tt) { Token* tok = malloc(sizeof(Token)); char* nc = malloc(sizeof(char)+1); *nc = l->ch; nc[1] = '\0'; tok->type = tt; tok->literal = nc; tok->line = l->line; tok->column = l->column; tok->length = 1; tok->next = NULL; return tok; } static Token* newIdentToken(Lexer* l, char* literal, TokenType tt) { Token* tok = malloc(sizeof(Token)); tok->type = tt; tok->literal = literal; tok->line = l->line; tok->column = l->column; tok->length = strlen(literal); tok->next = NULL; return tok; }