2021-07-14 08:15:46 -07:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "lexer.h"
|
|
|
|
|
|
|
|
static void readChar(Lexer* l);
|
|
|
|
static char* readIdentifier(Lexer* l);
|
|
|
|
static char* readNumber(Lexer* l);
|
|
|
|
static int isLetter(char c);
|
|
|
|
static int isDigit(char c);
|
|
|
|
|
2023-10-15 17:55:12 -07:00
|
|
|
static Token* newTickToken(Lexer* l);
|
2021-07-14 08:15:46 -07:00
|
|
|
static Token* newToken(Lexer* l, TokenType tt);
|
|
|
|
static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
|
|
|
|
|
|
|
|
Lexer*
|
2023-10-15 15:30:59 -07:00
|
|
|
NewLexer(const char* filename)
|
2021-07-14 08:15:46 -07:00
|
|
|
{
|
|
|
|
FILE* fp;
|
|
|
|
fp = fopen(filename, "r");
|
|
|
|
|
|
|
|
if (fp == NULL)
|
|
|
|
{
|
|
|
|
printf("Can't open the file for some reason\n");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
fseek(fp, 0, SEEK_END);
|
|
|
|
int fileSize = ftell(fp);
|
|
|
|
fseek(fp, 0, SEEK_SET);
|
|
|
|
printf("fileSize: %d\n", fileSize);
|
|
|
|
|
|
|
|
Lexer* state = malloc(sizeof(Lexer));
|
|
|
|
state->rawFile = malloc((sizeof(char) * fileSize) + 1);
|
|
|
|
state->rawLen = fileSize;
|
2023-10-15 17:55:12 -07:00
|
|
|
state->readPosition = 0;
|
2021-07-14 08:15:46 -07:00
|
|
|
|
|
|
|
size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);
|
2023-10-15 17:55:12 -07:00
|
|
|
fclose(fp);
|
|
|
|
|
2021-07-14 08:15:46 -07:00
|
|
|
if (read != fileSize)
|
|
|
|
{
|
|
|
|
printf("something borked. only read %d bytes of %d\n", (int)read, fileSize);
|
|
|
|
|
|
|
|
free(state->rawFile);
|
|
|
|
free(state);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
state->rawFile[fileSize] = '\0';
|
|
|
|
state->line = 1;
|
|
|
|
|
|
|
|
readChar(state);
|
|
|
|
return state;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
FreeLexer(Lexer* l)
|
|
|
|
{
|
|
|
|
free(l->rawFile);
|
|
|
|
free(l);
|
|
|
|
}
|
|
|
|
|
|
|
|
Token*
|
|
|
|
NextToken(Lexer* l)
|
|
|
|
{
|
|
|
|
Token* tok;
|
|
|
|
switch (l->ch) {
|
|
|
|
case '#':
|
|
|
|
tok = newToken(l, TT_HASH);
|
|
|
|
break;
|
|
|
|
case '*':
|
|
|
|
tok = newToken(l, TT_ASTERISK);
|
|
|
|
break;
|
|
|
|
case '_':
|
|
|
|
tok = newToken(l, TT_UNDERSCORE);
|
|
|
|
break;
|
|
|
|
case '-':
|
|
|
|
tok = newToken(l, TT_DASH);
|
|
|
|
break;
|
|
|
|
case '.':
|
|
|
|
tok = newToken(l, TT_PERIOD);
|
|
|
|
break;
|
2023-10-15 17:55:12 -07:00
|
|
|
case '>':
|
|
|
|
tok = newToken(l, TT_GT);
|
|
|
|
break;
|
2021-07-14 08:15:46 -07:00
|
|
|
case '`':
|
2023-10-15 17:55:12 -07:00
|
|
|
tok = newTickToken(l);
|
2021-07-14 08:15:46 -07:00
|
|
|
break;
|
|
|
|
case '\0':
|
|
|
|
tok = newToken(l, TT_EOF);
|
|
|
|
break;
|
|
|
|
case '\n':
|
|
|
|
tok = newToken(l, TT_NEWLINE);
|
|
|
|
l->line++;
|
|
|
|
l->column = 0;
|
|
|
|
break;
|
|
|
|
case ' ':
|
|
|
|
case '\t':
|
|
|
|
tok = newToken(l, TT_WHITESPACE);
|
|
|
|
break;
|
|
|
|
case '\r':
|
|
|
|
readChar(l);
|
|
|
|
return NextToken(l); // lets GOOOOO
|
|
|
|
default:
|
|
|
|
if (isLetter(l->ch))
|
|
|
|
{
|
|
|
|
int start = l->column;
|
|
|
|
char* literal = readIdentifier(l);
|
|
|
|
tok = newIdentToken(l, literal, TT_WORD);
|
|
|
|
tok->column = start;
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
else if (isDigit(l->ch))
|
|
|
|
{
|
|
|
|
int start = l->column;
|
|
|
|
char* literal = readNumber(l);
|
|
|
|
tok = newIdentToken(l, literal, TT_NUMBER);
|
|
|
|
tok->column = start;
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
tok = newToken(l, TT_ILLEGAL);
|
|
|
|
}
|
|
|
|
//printf("Invalid token: %X\n", l->ch);
|
|
|
|
//return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
readChar(l);
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
static
|
|
|
|
char*
|
|
|
|
readNumber(Lexer* l)
|
|
|
|
{
|
|
|
|
int position = l->position;
|
|
|
|
while (isDigit(l->ch))
|
|
|
|
{
|
|
|
|
readChar(l);
|
|
|
|
}
|
|
|
|
|
|
|
|
int len = (l->position - position);
|
|
|
|
char* out = malloc(sizeof(char) * len + 1);
|
|
|
|
memcpy(out, &l->rawFile[position], len);
|
|
|
|
out[len] = '\0';
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
static
|
|
|
|
char*
|
|
|
|
readIdentifier(Lexer* l)
|
|
|
|
{
|
|
|
|
int position = l->position;
|
|
|
|
while (isLetter(l->ch))
|
|
|
|
{
|
|
|
|
readChar(l);
|
|
|
|
}
|
|
|
|
|
|
|
|
int len = (l->position - position);
|
|
|
|
char* out = malloc(sizeof(char) * len + 1);
|
|
|
|
memcpy(out, &l->rawFile[position], len);
|
|
|
|
out[len] = '\0';
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
static
|
|
|
|
void
|
|
|
|
readChar(Lexer* l)
|
|
|
|
{
|
|
|
|
l->column++;
|
|
|
|
if (l->readPosition >= l->rawLen)
|
|
|
|
{
|
|
|
|
l->ch = 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
l->ch = l->rawFile[l->readPosition];
|
|
|
|
}
|
|
|
|
|
|
|
|
l->position = l->readPosition;
|
|
|
|
l->readPosition++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * isLetter - report whether ch may appear in a word.
 *
 * Accepts ASCII 'a'-'z', 'A'-'Z' and the underscore.
 * Returns 1 when ch is accepted, 0 otherwise.
 *
 * Marked static here as well, matching its forward declaration and the
 * other file-local helpers.
 */
static
int
isLetter(char ch)
{
    return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ch == '_');
}
|
|
|
|
|
|
|
|
/*
 * isDigit - report whether ch is an ASCII decimal digit ('0'-'9').
 *
 * Returns 1 when it is, 0 otherwise.
 *
 * Marked static here as well, matching its forward declaration and the
 * other file-local helpers.
 */
static
int
isDigit(char ch)
{
    return ('0' <= ch && ch <= '9');
}
|
|
|
|
|
|
|
|
void
|
|
|
|
FreeToken(Token* t)
|
|
|
|
{
|
|
|
|
free(t->literal);
|
|
|
|
free(t);
|
|
|
|
}
|
|
|
|
|
2023-10-15 17:55:12 -07:00
|
|
|
static
|
|
|
|
Token*
|
|
|
|
newTickToken(Lexer* l)
|
|
|
|
{
|
|
|
|
|
|
|
|
printf("backticks @ %d:%d\n", l->line, l->column);
|
|
|
|
|
|
|
|
if (l->position+3 > l->rawLen) {
|
|
|
|
printf("premature EOF parsing ticks\n");
|
|
|
|
return newToken(l, TT_BACKTICK);
|
|
|
|
}
|
|
|
|
|
|
|
|
// peek up to two more characters
|
|
|
|
int i;
|
|
|
|
for(i = 0; i < 3; i++) {
|
|
|
|
|
|
|
|
if (l->rawFile[l->position+i] != '`') {
|
|
|
|
printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", l->readPosition+i, l->rawFile[l->position+i], l->rawFile[l->position+i]);
|
|
|
|
return newToken(l, TT_BACKTICK);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Token* tok = malloc(sizeof(Token));
|
|
|
|
tok->line = l->line;
|
|
|
|
tok->column = l->column;
|
|
|
|
tok->literal = "```";
|
|
|
|
tok->type = TT_TRIPLEBACKTICK;
|
|
|
|
tok->length = 3;
|
|
|
|
readChar(l);
|
|
|
|
readChar(l);
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
2021-07-14 08:15:46 -07:00
|
|
|
static
|
|
|
|
Token*
|
|
|
|
newToken(Lexer* l,
|
|
|
|
TokenType tt)
|
|
|
|
{
|
|
|
|
Token* tok = malloc(sizeof(Token));
|
|
|
|
char* nc = malloc(sizeof(char)+1);
|
|
|
|
*nc = l->ch;
|
|
|
|
nc[1] = '\0';
|
|
|
|
tok->type = tt;
|
|
|
|
tok->literal = nc;
|
|
|
|
tok->line = l->line;
|
|
|
|
tok->column = l->column;
|
2023-10-15 17:55:12 -07:00
|
|
|
tok->length = 1;
|
2021-07-14 08:15:46 -07:00
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
static
|
|
|
|
Token*
|
|
|
|
newIdentToken(Lexer* l,
|
|
|
|
char* literal,
|
|
|
|
TokenType tt)
|
|
|
|
{
|
|
|
|
Token* tok = malloc(sizeof(Token));
|
|
|
|
tok->type = tt;
|
|
|
|
tok->literal = literal;
|
|
|
|
tok->line = l->line;
|
|
|
|
tok->column = l->column;
|
2023-10-15 17:55:12 -07:00
|
|
|
tok->length = strlen(literal);
|
2021-07-14 08:15:46 -07:00
|
|
|
return tok;
|
|
|
|
}
|