/*
 * readme-thing/lexer.c
 *
 * Exported from a web repository view (originally 271 lines, 5.6 KiB, C).
 * Revision timestamp: 2021-07-14 08:15:46 -07:00
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lexer.h"
/*
 * Forward declarations for the file-local helpers defined further down.
 * Parameter names match the definitions.
 */
static void readChar(Lexer* l);
static char* readIdentifier(Lexer* l);
static char* readNumber(Lexer* l);
static int isLetter(char ch);
static int isDigit(char ch);
static Token* newTickToken(Lexer* l);
static Token* newToken(Lexer* l, TokenType tt);
static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
Lexer*
NewLexer(const char* filename)
2021-07-14 08:15:46 -07:00
{
FILE* fp;
fp = fopen(filename, "r");
if (fp == NULL)
{
printf("Can't open the file for some reason\n");
return NULL;
}
fseek(fp, 0, SEEK_END);
int fileSize = ftell(fp);
fseek(fp, 0, SEEK_SET);
printf("fileSize: %d\n", fileSize);
Lexer* state = malloc(sizeof(Lexer));
state->rawFile = malloc((sizeof(char) * fileSize) + 1);
state->rawLen = fileSize;
state->readPosition = 0;
2021-07-14 08:15:46 -07:00
size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);
fclose(fp);
2021-07-14 08:15:46 -07:00
if (read != fileSize)
{
printf("something borked. only read %d bytes of %d\n", (int)read, fileSize);
free(state->rawFile);
free(state);
return NULL;
}
state->rawFile[fileSize] = '\0';
state->line = 1;
readChar(state);
return state;
}
/*
 * FreeLexer - release a lexer and the file buffer it holds.
 *
 * l: lexer to destroy; NULL is accepted and ignored, mirroring free(), so
 *    the NULL that NewLexer() returns on failure can be passed safely.
 */
void
FreeLexer(Lexer* l)
{
    if (l == NULL)
    {
        return;
    }
    free(l->rawFile);
    free(l);
}
/*
 * NextToken - build the token at the lexer's current character and advance.
 *
 * Single characters map to their own token types ('#', '*', '_', '-', '.',
 * '>', space/tab, newline); a NUL character, which readChar() also reports
 * at end of input, yields TT_EOF. A backtick is delegated to newTickToken().
 * A run accepted by isLetter() becomes one TT_WORD and a run of digits one
 * TT_NUMBER; for those the token's column is the column where the run
 * started. A leading '_' is always TT_UNDERSCORE, but readIdentifier() keeps
 * consuming '_' inside a word. Carriage returns are skipped silently. Any
 * other character yields TT_ILLEGAL.
 *
 * A newline bumps the lexer's line counter and resets its column.
 *
 * Returns the token produced by the constructor helpers; the result is
 * passed through untouched if it is NULL.
 */
Token*
NextToken(Lexer* l)
{
    /*
     * Skip carriage returns with a loop rather than by recursing, so a long
     * run of '\r' in the input cannot grow the call stack.
     */
    while (l->ch == '\r')
    {
        readChar(l);
    }

    Token* tok;
    switch (l->ch) {
    case '#':
        tok = newToken(l, TT_HASH);
        break;
    case '*':
        tok = newToken(l, TT_ASTERISK);
        break;
    case '_':
        tok = newToken(l, TT_UNDERSCORE);
        break;
    case '-':
        tok = newToken(l, TT_DASH);
        break;
    case '.':
        tok = newToken(l, TT_PERIOD);
        break;
    case '>':
        tok = newToken(l, TT_GT);
        break;
    case '`':
        tok = newTickToken(l);
        break;
    case '\0':
        tok = newToken(l, TT_EOF);
        break;
    case '\n':
        /* Build the token first so it records the newline's own position. */
        tok = newToken(l, TT_NEWLINE);
        l->line++;
        l->column = 0;
        break;
    case ' ':
    case '\t':
        tok = newToken(l, TT_WHITESPACE);
        break;
    default:
    {
        int isWord = isLetter(l->ch);
        if (isWord || isDigit(l->ch))
        {
            /* The reader leaves the lexer on the first character after the
             * run, so return without the trailing readChar() below. */
            int start = l->column;
            char* literal = isWord ? readIdentifier(l) : readNumber(l);
            tok = newIdentToken(l, literal, isWord ? TT_WORD : TT_NUMBER);
            if (tok != NULL)
            {
                tok->column = start;
            }
            return tok;
        }
        tok = newToken(l, TT_ILLEGAL);
        break;
    }
    }

    readChar(l);
    return tok;
}
/*
 * readNumber - consume a run of decimal digits starting at the current
 * character and return it as a new NUL-terminated string.
 *
 * On return the lexer's current character is the first non-digit.
 * Returns a malloc'd copy of the digits owned by the caller, or NULL if the
 * allocation fails (the digits have still been consumed in that case).
 */
static
char*
readNumber(Lexer* l)
{
    int start = l->position;
    while (isDigit(l->ch))
    {
        readChar(l);
    }

    size_t len = (size_t)(l->position - start);
    char* out = malloc(len + 1);
    if (out == NULL)
    {
        return NULL;
    }
    memcpy(out, &l->rawFile[start], len);
    out[len] = '\0';
    return out;
}
/*
 * readIdentifier - consume a run of characters accepted by isLetter()
 * starting at the current character and return it as a new NUL-terminated
 * string.
 *
 * On return the lexer's current character is the first one isLetter()
 * rejects. Returns a malloc'd copy of the run owned by the caller, or NULL
 * if the allocation fails (the run has still been consumed in that case).
 */
static
char*
readIdentifier(Lexer* l)
{
    int start = l->position;
    while (isLetter(l->ch))
    {
        readChar(l);
    }

    size_t len = (size_t)(l->position - start);
    char* out = malloc(len + 1);
    if (out == NULL)
    {
        return NULL;
    }
    memcpy(out, &l->rawFile[start], len);
    out[len] = '\0';
    return out;
}
/*
 * readChar - step the lexer forward by one character.
 *
 * Bumps the column, loads the byte at readPosition into ch (or 0 once
 * readPosition has reached rawLen), records that index as position and then
 * moves readPosition one past it.
 */
static
void
readChar(Lexer* l)
{
    l->column += 1;
    l->ch = (l->readPosition < l->rawLen) ? l->rawFile[l->readPosition] : 0;
    /* position takes the index just read; readPosition moves one beyond it. */
    l->position = l->readPosition++;
}
/*
 * isLetter - report whether ch may appear in a word.
 *
 * Returns 1 for 'a'..'z', 'A'..'Z' and '_', otherwise 0.
 */
int
isLetter(char ch)
{
    if (ch == '_')
    {
        return 1;
    }
    if (ch >= 'a' && ch <= 'z')
    {
        return 1;
    }
    return (ch >= 'A' && ch <= 'Z');
}
/*
 * isDigit - report whether ch is a decimal digit.
 *
 * Returns 1 for '0'..'9', otherwise 0.
 */
int
isDigit(char ch)
{
    if (ch < '0')
    {
        return 0;
    }
    return (ch <= '9');
}
/*
 * FreeToken - release a token and its literal text.
 *
 * t: token to destroy; NULL is accepted and ignored, mirroring free().
 *
 * The literal is handed to free(), so it must be a heap allocation (or NULL).
 */
void
FreeToken(Token* t)
{
    if (t == NULL)
    {
        return;
    }
    free(t->literal);
    free(t);
}
/*
 * newTickToken - build the token for a backtick at the current position.
 *
 * If the current character and the two after it are all backticks, returns
 * a TT_TRIPLEBACKTICK token and consumes two of them (the caller consumes
 * the third). Otherwise returns a single TT_BACKTICK token from newToken()
 * and consumes nothing.
 *
 * The literal is heap-allocated because FreeToken() passes it to free();
 * pointing it at a string literal would make that free() undefined
 * behaviour. Returns NULL if an allocation fails.
 */
static
Token*
newTickToken(Lexer* l)
{
    printf("backticks @ %d:%d\n", l->line, l->column);
    /* Fewer than three bytes left: this cannot be a fence. */
    if (l->position + 3 > l->rawLen) {
        printf("premature EOF parsing ticks\n");
        return newToken(l, TT_BACKTICK);
    }

    /* Check the current character plus the next two. */
    for (int i = 0; i < 3; i++) {
        char c = l->rawFile[l->position + i];
        if (c != '`') {
            /* %X takes an unsigned value; avoid sign-extending bytes >= 0x80. */
            printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", l->readPosition+i, (unsigned char)c, c);
            return newToken(l, TT_BACKTICK);
        }
    }

    static const char fence[] = "```";
    Token* tok = malloc(sizeof *tok);
    char* literal = malloc(sizeof fence);
    if (tok == NULL || literal == NULL) {
        free(literal);
        free(tok);
        return NULL;
    }
    memcpy(literal, fence, sizeof fence);

    tok->line = l->line;
    tok->column = l->column;
    tok->literal = literal;
    tok->type = TT_TRIPLEBACKTICK;
    tok->length = 3;

    /* Step over two backticks; NextToken() steps over the last one. */
    readChar(l);
    readChar(l);
    return tok;
}
/* Revision timestamp: 2021-07-14 08:15:46 -07:00 */
/*
 * newToken - build a one-character token from the lexer's current character.
 *
 * l:  lexer supplying the character, line and column.
 * tt: type to stamp on the token.
 *
 * The literal is a freshly allocated one-character string and the length is
 * always 1. Does not advance the lexer. Returns NULL if either allocation
 * fails; nothing is leaked in that case.
 */
static
Token*
newToken(Lexer* l,
         TokenType tt)
{
    Token* tok = malloc(sizeof *tok);
    char* text = malloc(2);
    if (tok == NULL || text == NULL)
    {
        free(text);
        free(tok);
        return NULL;
    }

    text[0] = l->ch;
    text[1] = '\0';

    tok->type = tt;
    tok->literal = text;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = 1;
    return tok;
}
/*
 * newIdentToken - wrap an already-allocated literal in a token.
 *
 * l:       lexer supplying the line and column recorded on the token.
 * literal: heap-allocated, NUL-terminated text; the token takes ownership.
 * tt:      type to stamp on the token.
 *
 * The token's length is strlen(literal). Returns NULL if literal is NULL or
 * the token cannot be allocated; in the latter case literal is freed here,
 * since no token exists to own it.
 */
static
Token*
newIdentToken(Lexer* l,
              char* literal,
              TokenType tt)
{
    if (literal == NULL)
    {
        return NULL;
    }

    Token* tok = malloc(sizeof *tok);
    if (tok == NULL)
    {
        free(literal);
        return NULL;
    }

    tok->type = tt;
    tok->literal = literal;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = strlen(literal);
    return tok;
}