/*
 * readme-thing/lexer.c
 *
 * 306 lines
 * 6.4 KiB
 * C
 */

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lexer.h"
// Forward declarations for the helpers defined later in this file.

// Cursor helpers: look at the next unread byte / advance by one byte.
static char peekChar(Lexer* l);
static void readChar(Lexer* l);
// Scanners: consume a run of letters / digits and return a heap-allocated,
// NUL-terminated copy of the consumed text.
static char* readIdentifier(Lexer* l);
static char* readNumber(Lexer* l);
// Character classification (ASCII letters plus '_', and ASCII digits).
static int isLetter(char c);
static int isDigit(char c);
// Token constructors. Each returns a freshly malloc'd Token stamped with the
// lexer's current line and column.
static Token* newTickToken(Lexer* l);
static Token* newToken(Lexer* l, TokenType tt);
static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
static Token* newWhitespaceToken(Lexer* l);
// NewLexer loads the entire file at `filename` into memory and returns a
// heap-allocated Lexer positioned on the first character of the file.
//
// Returns NULL (after printing a message to stdout) when the file cannot be
// opened, its size cannot be determined or does not fit in an int, memory
// cannot be allocated, or the file cannot be read completely.
//
// The returned lexer owns its buffer; release it with FreeLexer().
Lexer*
NewLexer(const char* filename)
{
    // Binary mode: in text mode some platforms translate "\r\n" to "\n", so
    // fread() would return fewer bytes than ftell() reported and the length
    // check below would reject every CRLF file. NextToken() already skips
    // '\r', so the untranslated bytes lex fine.
    FILE* fp = fopen(filename, "rb");
    if (fp == NULL)
    {
        printf("Can't open the file for some reason\n");
        return NULL;
    }

    // Measure the file by seeking to its end. ftell() reports failure as -1L.
    long endOffset = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        endOffset = ftell(fp);
    }
    // ">= INT_MAX" keeps fileSize + 1 (for the terminator) representable.
    if (endOffset < 0 || endOffset >= INT_MAX || fseek(fp, 0, SEEK_SET) != 0)
    {
        printf("Can't determine the size of the file\n");
        fclose(fp);
        return NULL;
    }
    int fileSize = (int)endOffset;
    printf("fileSize: %d\n", fileSize);

    Lexer* state = malloc(sizeof *state);
    if (state == NULL)
    {
        printf("out of memory allocating the lexer\n");
        fclose(fp);
        return NULL;
    }

    // One extra byte for the NUL terminator appended after the file contents.
    state->rawFile = malloc((size_t)fileSize + 1);
    if (state->rawFile == NULL)
    {
        printf("out of memory allocating the lexer\n");
        free(state);
        fclose(fp);
        return NULL;
    }

    size_t bytesRead = fread(state->rawFile, sizeof(char), (size_t)fileSize, fp);
    fclose(fp);
    if (bytesRead != (size_t)fileSize)
    {
        printf("something borked. only read %d bytes of %d\n", (int)bytesRead, fileSize);
        free(state->rawFile);
        free(state);
        return NULL;
    }
    state->rawFile[fileSize] = '\0';

    state->rawLen = fileSize;
    state->readPosition = 0;
    state->line = 1;
    state->column = 0;

    // Prime the lexer: loads the first byte into ch and sets column to 1.
    readChar(state);
    return state;
}
// FreeLexer releases a lexer created by NewLexer(), including the file buffer
// it owns. Passing NULL is a no-op, mirroring free(), so callers can clean up
// unconditionally even when NewLexer() failed.
void
FreeLexer(Lexer* l)
{
    if (l == NULL)
    {
        return;
    }
    free(l->rawFile);
    free(l);
}
// NextToken produces the token that starts at the lexer's current character
// and advances the lexer past it.
//
// Carriage returns are skipped and never produce a token. A run of identical
// spaces or tabs becomes one TT_WHITESPACE token, a run of letters/underscores
// becomes TT_WORD, a run of digits becomes TT_NUMBER, and any other byte
// without a dedicated case becomes TT_ILLEGAL. At end of input every call
// yields TT_EOF.
//
// Because '_' has its own case ahead of the default branch, a leading
// underscore is emitted as TT_UNDERSCORE rather than starting a word.
//
// Returns the new token, or NULL if a helper returned NULL; the lexer's
// position after a NULL return is unspecified.
Token*
NextToken(Lexer* l)
{
    // Skip carriage returns with a loop instead of one recursive call per
    // '\r', so stack depth does not depend on the input.
    while (l->ch == '\r')
    {
        readChar(l);
    }

    Token* tok;
    switch (l->ch) {
    case '#':
        tok = newToken(l, TT_HASH);
        break;
    case '*':
        tok = newToken(l, TT_ASTERISK);
        break;
    case '_':
        tok = newToken(l, TT_UNDERSCORE);
        break;
    case '-':
        tok = newToken(l, TT_DASH);
        break;
    case '.':
        tok = newToken(l, TT_PERIOD);
        break;
    case '>':
        tok = newToken(l, TT_GT);
        break;
    case '`':
        tok = newTickToken(l);
        break;
    case '\0':
        tok = newToken(l, TT_EOF);
        break;
    case '\n':
        // Build the token first so it carries the line it ends, then start
        // the next line; the readChar() below moves column from 0 to 1.
        tok = newToken(l, TT_NEWLINE);
        l->line++;
        l->column = 0;
        break;
    case ' ':
    case '\t':
        tok = newWhitespaceToken(l);
        break;
    default:
        if (isLetter(l->ch) || isDigit(l->ch))
        {
            int isWord = isLetter(l->ch);
            int start = l->column;
            char* literal = isWord ? readIdentifier(l) : readNumber(l);
            if (literal == NULL)
            {
                return NULL;
            }
            tok = newIdentToken(l, literal, isWord ? TT_WORD : TT_NUMBER);
            if (tok != NULL)
            {
                // The scanner already advanced past the run, so the token was
                // stamped with the end column; report where it started.
                tok->column = start;
            }
            // The scanners leave the lexer on the first unconsumed character,
            // so skip the trailing readChar() below.
            return tok;
        }
        tok = newToken(l, TT_ILLEGAL);
        break;
    }

    // Every other constructor leaves the lexer on the token's last character.
    readChar(l);
    return tok;
}
// newWhitespaceToken builds a TT_WHITESPACE token covering the run of
// identical whitespace characters that starts at the current character.
//
// The run is made of whatever l->ch is on entry, so the same function serves
// both spaces and tabs; a space followed by a tab yields two separate tokens.
// On return the lexer sits on the LAST character of the run.
//
// Returns a heap-allocated token with a heap-allocated literal, or NULL if
// memory cannot be allocated.
static
Token*
newWhitespaceToken(Lexer* l)
{
    Token* tok = malloc(sizeof *tok);
    if (tok == NULL)
    {
        return NULL;
    }
    tok->line = l->line;
    tok->column = l->column;
    tok->type = TT_WHITESPACE;

    int start = l->position;
    char ch = l->ch;
    // Peek before consuming so the lexer stops on the last character of the
    // run rather than one past it.
    while (peekChar(l) == ch)
    {
        readChar(l);
    }
    int count = l->position - start + 1;

    tok->literal = malloc((size_t)count + 1);
    if (tok->literal == NULL)
    {
        free(tok);
        return NULL;
    }
    memset(tok->literal, ch, (size_t)count);
    tok->literal[count] = '\0';
    tok->length = count;
    tok->next = NULL;
    return tok;
}
// readNumber consumes the run of ASCII digits starting at the current
// character and returns it as a NUL-terminated, heap-allocated string.
//
// On return the lexer sits on the first character after the run. The caller
// owns the returned string. Returns NULL if memory cannot be allocated; the
// run has still been consumed in that case.
static
char*
readNumber(Lexer* l)
{
    int start = l->position;
    while (isDigit(l->ch))
    {
        readChar(l);
    }
    int len = l->position - start;

    char* out = malloc((size_t)len + 1);
    if (out == NULL)
    {
        return NULL;
    }
    memcpy(out, &l->rawFile[start], (size_t)len);
    out[len] = '\0';
    return out;
}
// readIdentifier consumes the run of letters and underscores (as defined by
// isLetter) starting at the current character and returns it as a
// NUL-terminated, heap-allocated string.
//
// On return the lexer sits on the first character after the run. The caller
// owns the returned string. Returns NULL if memory cannot be allocated; the
// run has still been consumed in that case.
static
char*
readIdentifier(Lexer* l)
{
    int start = l->position;
    while (isLetter(l->ch))
    {
        readChar(l);
    }
    int len = l->position - start;

    char* out = malloc((size_t)len + 1);
    if (out == NULL)
    {
        return NULL;
    }
    memcpy(out, &l->rawFile[start], (size_t)len);
    out[len] = '\0';
    return out;
}
// peekChar returns the next unread byte without consuming it, or '\0' when
// the read cursor has reached the end of the buffer.
static
char
peekChar(Lexer* l)
{
    return (l->readPosition < l->rawLen) ? l->rawFile[l->readPosition] : '\0';
}
// readChar advances the lexer by one byte: ch becomes the byte at the read
// cursor (0 once the cursor is at or past the end of the buffer), position
// records where that byte came from, and both the read cursor and the column
// counter move forward by one. The column is bumped even at end of input.
static
void
readChar(Lexer* l)
{
    const int exhausted = (l->readPosition >= l->rawLen);

    l->ch = exhausted ? 0 : l->rawFile[l->readPosition];
    l->position = l->readPosition;
    l->readPosition += 1;
    l->column += 1;
}
// isLetter reports whether ch can be part of a word: an ASCII letter of
// either case, or an underscore. Returns 1 if so, 0 otherwise.
int
isLetter(char ch)
{
    if (ch == '_')
    {
        return 1;
    }
    if (ch >= 'a' && ch <= 'z')
    {
        return 1;
    }
    return (ch >= 'A' && ch <= 'Z');
}
// isDigit reports whether ch is an ASCII decimal digit.
// Returns 1 if so, 0 otherwise.
int
isDigit(char ch)
{
    if (ch < '0')
    {
        return 0;
    }
    if (ch > '9')
    {
        return 0;
    }
    return 1;
}
// newTickToken is called with the lexer on a backtick and decides between a
// single TT_BACKTICK and a TT_TRIPLEBACKTICK fence.
//
// If the current character and the two after it are all backticks, it returns
// a TT_TRIPLEBACKTICK token and advances the lexer by two, leaving it on the
// last backtick of the fence. Otherwise it returns a one-character
// TT_BACKTICK token and does not advance.
//
// The fence literal is heap-allocated like every other token literal in this
// file, so a consumer can release all literals the same way. Returns NULL if
// memory for the fence token cannot be allocated, in which case the lexer has
// not been advanced.
static
Token*
newTickToken(Lexer* l)
{
    static const char fence[] = "```";
    const int fenceLen = (int)(sizeof fence - 1);

    // Fewer than fenceLen bytes remain, so this cannot be a fence.
    // NOTE(review): this message is also printed for an ordinary single
    // backtick that happens to sit within the last two bytes of the file.
    if (l->position + fenceLen > l->rawLen) {
        printf("premature EOF parsing ticks\n");
        return newToken(l, TT_BACKTICK);
    }

    // Inspect the current character and the next two directly in the buffer.
    for (int i = 0; i < fenceLen; i++) {
        if (l->rawFile[l->position + i] != '`') {
            return newToken(l, TT_BACKTICK);
        }
    }

    Token* tok = malloc(sizeof *tok);
    if (tok == NULL) {
        return NULL;
    }
    tok->literal = malloc(sizeof fence);
    if (tok->literal == NULL) {
        free(tok);
        return NULL;
    }
    memcpy(tok->literal, fence, sizeof fence);

    tok->line = l->line;
    tok->column = l->column;
    tok->type = TT_TRIPLEBACKTICK;
    tok->length = fenceLen;
    tok->next = NULL;

    // Consume two of the three backticks; the caller consumes the third.
    readChar(l);
    readChar(l);
    return tok;
}
// newToken builds a single-character token of type `tt` from the lexer's
// current character, stamped with the lexer's current line and column.
// The lexer is not advanced.
//
// Returns a heap-allocated token whose literal is a heap-allocated
// one-character string, or NULL if memory cannot be allocated.
static
Token*
newToken(Lexer* l,
         TokenType tt)
{
    Token* tok = malloc(sizeof *tok);
    if (tok == NULL)
    {
        return NULL;
    }

    // One byte for the character, one for the terminator.
    char* text = malloc(2);
    if (text == NULL)
    {
        free(tok);
        return NULL;
    }
    text[0] = l->ch;
    text[1] = '\0';

    tok->type = tt;
    tok->literal = text;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = 1;
    tok->next = NULL;
    return tok;
}
// newIdentToken wraps an already-scanned `literal` in a token of type `tt`,
// stamped with the lexer's current line and column.
//
// Ownership: this function takes ownership of `literal`, which must be a
// heap-allocated, NUL-terminated string (or NULL). On success the token holds
// it; on failure it is freed here so the caller never has to.
//
// Returns the new token, or NULL if `literal` is NULL or memory cannot be
// allocated.
static
Token*
newIdentToken(Lexer* l,
              char* literal,
              TokenType tt)
{
    if (literal == NULL)
    {
        return NULL;
    }

    Token* tok = malloc(sizeof *tok);
    if (tok == NULL)
    {
        free(literal);
        return NULL;
    }

    tok->type = tt;
    tok->literal = literal;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = strlen(literal);
    tok->next = NULL;
    return tok;
}