Add codeblocks; Fix some stuff

- Parse code blocks that are wrapped in sets of three backticks.
- Added a TT_TRIPLEBACKTICK token.
- Added a length field to the Token struct.
- Added an error node.
- Cleaned up some file read code.
- Fixed the header raw text (don't reuse the same buffer for each node).
This commit is contained in:
Zorchenhimer 2023-10-15 20:55:12 -04:00
parent ca6bdca7ab
commit d855df380b
7 changed files with 180 additions and 66 deletions

View File

@ -1,6 +1,7 @@
CC=gcc CC=gcc
CFLAGS=-Wall -pedantic -Werror -std=c99 CFLAGS=-Wall -pedantic -Werror -std=c99 -g -O0
#-fsanitize=address
OBJ=main.o lexer.o token.o node.o OBJ=main.o lexer.o token.o node.o

51
lexer.c
View File

@ -10,6 +10,7 @@ static char* readNumber(Lexer* l);
static int isLetter(char c); static int isLetter(char c);
static int isDigit(char c); static int isDigit(char c);
static Token* newTickToken(Lexer* l);
static Token* newToken(Lexer* l, TokenType tt); static Token* newToken(Lexer* l, TokenType tt);
static Token* newIdentToken(Lexer* l, char* literal, TokenType tt); static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
@ -33,8 +34,11 @@ NewLexer(const char* filename)
Lexer* state = malloc(sizeof(Lexer)); Lexer* state = malloc(sizeof(Lexer));
state->rawFile = malloc((sizeof(char) * fileSize) + 1); state->rawFile = malloc((sizeof(char) * fileSize) + 1);
state->rawLen = fileSize; state->rawLen = fileSize;
state->readPosition = 0;
size_t read = fread(state->rawFile, sizeof(char), fileSize, fp); size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);
fclose(fp);
if (read != fileSize) if (read != fileSize)
{ {
printf("something borked. only read %d bytes of %d\n", (int)read, fileSize); printf("something borked. only read %d bytes of %d\n", (int)read, fileSize);
@ -44,7 +48,6 @@ NewLexer(const char* filename)
return NULL; return NULL;
} }
fclose(fp);
state->rawFile[fileSize] = '\0'; state->rawFile[fileSize] = '\0';
state->line = 1; state->line = 1;
@ -80,8 +83,11 @@ NextToken(Lexer* l)
case '.': case '.':
tok = newToken(l, TT_PERIOD); tok = newToken(l, TT_PERIOD);
break; break;
case '>':
tok = newToken(l, TT_GT);
break;
case '`': case '`':
tok = newToken(l, TT_BACKTICK); tok = newTickToken(l);
break; break;
case '\0': case '\0':
tok = newToken(l, TT_EOF); tok = newToken(l, TT_EOF);
@ -161,7 +167,6 @@ readIdentifier(Lexer* l)
return out; return out;
} }
static static
void void
readChar(Lexer* l) readChar(Lexer* l)
@ -180,11 +185,6 @@ readChar(Lexer* l)
l->readPosition++; l->readPosition++;
} }
void
Parse(Lexer* l)
{
}
int int
isLetter(char ch) isLetter(char ch)
{ {
@ -204,6 +204,39 @@ FreeToken(Token* t)
free(t); free(t);
} }
/*
 * Build the token for a backtick sitting at the lexer's current position.
 *
 * If the current character and the two that follow it are all backticks, a
 * TT_TRIPLEBACKTICK token of length 3 is returned and the lexer is advanced
 * by two characters (this function does not consume the third one itself).
 * Otherwise the result of newToken(l, TT_BACKTICK) is returned and the
 * lexer is not advanced here.
 *
 * Returns NULL if memory for the triple-backtick token cannot be allocated.
 */
static
Token*
newTickToken(Lexer* l)
{
    printf("backticks @ %d:%d\n", l->line, l->column);

    // Not enough input left for three characters starting at position.
    if (l->position+3 > l->rawLen) {
        printf("premature EOF parsing ticks\n");
        return newToken(l, TT_BACKTICK);
    }

    // Inspect the current character and peek at up to two more.
    int i;
    for(i = 0; i < 3; i++) {
        char c = l->rawFile[l->position+i];
        if (c != '`') {
            // Report the offset of the byte actually inspected, and print it
            // as an unsigned value so bytes >= 0x80 are not sign-extended.
            printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", (int)(l->position+i), (unsigned)(unsigned char)c, c);
            return newToken(l, TT_BACKTICK);
        }
    }

    static const char ticks[] = "```";

    Token* tok = malloc(sizeof *tok);
    // Heap copy of the literal rather than a pointer to a string constant.
    // NOTE(review): assumes token literals are heap-owned and released by
    // FreeToken like those of other tokens - confirm against FreeToken.
    char* literal = malloc(sizeof ticks);
    if (tok == NULL || literal == NULL) {
        free(literal);
        free(tok);
        return NULL;
    }
    memcpy(literal, ticks, sizeof ticks);

    tok->line = l->line;
    tok->column = l->column;
    tok->literal = literal;
    tok->type = TT_TRIPLEBACKTICK;
    tok->length = 3;

    // Consume the second and third backtick.
    readChar(l);
    readChar(l);
    return tok;
}
static static
Token* Token*
newToken(Lexer* l, newToken(Lexer* l,
@ -217,6 +250,7 @@ newToken(Lexer* l,
tok->literal = nc; tok->literal = nc;
tok->line = l->line; tok->line = l->line;
tok->column = l->column; tok->column = l->column;
tok->length = 1;
return tok; return tok;
} }
@ -231,5 +265,6 @@ newIdentToken(Lexer* l,
tok->literal = literal; tok->literal = literal;
tok->line = l->line; tok->line = l->line;
tok->column = l->column; tok->column = l->column;
tok->length = strlen(literal);
return tok; return tok;
} }

51
main.c
View File

@ -7,35 +7,6 @@
#include "lexer.h" #include "lexer.h"
#include "node.h" #include "node.h"
/*
* RawText ""
* LineNumber 0
* NodeType NT_Root
* ChildNodes
* RawText "# Header1"
* LineNumber 1
* NodeType NT_Header1
* ChildNodes
* {"Some text."}
*
* RawText "## Header2"
*/
/*
* NodeType NT_Root
* ChildNodes
* RawText "## Header2"
* ChildNodes
* paragraph
* ChildNodes
* {*bold text*}
* {_underlined text_}
* paragraph
*
*
*/
//Node* ParseLine(char *buffer);
void writeTokenFile(TokenList* tl); void writeTokenFile(TokenList* tl);
int int
@ -87,10 +58,24 @@ main(int argc, const char** argv)
case NT_Header2: case NT_Header2:
case NT_Header3: case NT_Header3:
case NT_Header4: case NT_Header4:
{ {
HeaderNode* hnode = (HeaderNode*)node; HeaderNode* hnode = (HeaderNode*)node;
printf("{HeaderNode type:%s text:%s}\n", NodeTypeString(hnode->type), hnode->rawText); printf("{HeaderNode type:%s text:%s}\n", NodeTypeString(hnode->type), hnode->rawText);
} }
break;
case NT_BlockCode:
{
CodeBlockNode* cnode = (CodeBlockNode*)node;
printf("{CodeBlockNode text:%s}\n", cnode->rawText);
}
break;
case NT_Error:
{
ErrorNode* enode = (ErrorNode*)node;
printf("{ErrorNode error:%s}\n", enode->error);
}
break; break;
default: default:

109
node.c
View File

@ -7,7 +7,8 @@
static char stringBuff[STRING_BUFF_SIZE]; static char stringBuff[STRING_BUFF_SIZE];
HeaderNode* parseHeader(TokenList** list); Node* parseHeader(TokenList** list);
Node* parseCodeBlock(TokenList** list);
NodeList* NodeList*
ParseNodes(TokenList* list) ParseNodes(TokenList* list)
@ -32,13 +33,18 @@ ParseNodes(TokenList* list)
break; break;
case TT_HASH: case TT_HASH:
// start of header // start of header
currentNode = (Node*)parseHeader(&currentToken); currentNode = parseHeader(&currentToken);
break;
case TT_TRIPLEBACKTICK:
currentNode = parseCodeBlock(&currentToken);
break; break;
case TT_EOF: case TT_EOF:
printf("EOF found\n"); printf("EOF found\n");
return nl; return nl;
default:
default: // paragraph start?
break; break;
} }
@ -65,7 +71,7 @@ ParseNodes(TokenList* list)
return nl; return nl;
} }
HeaderNode* Node*
parseHeader(TokenList** list) parseHeader(TokenList** list)
{ {
TokenList* l = *list; TokenList* l = *list;
@ -96,23 +102,19 @@ parseHeader(TokenList** list)
return NULL; return NULL;
} }
stringBuff[0] = '\0'; TokenList* end = l;
while (1) int len = 0;
{ // find header text size
int bufSize = strlen(stringBuff); while (end->token->type != TT_NEWLINE && end->token->type != TT_EOF) {
int litSize = strlen(l->token->literal); len += end->token->length;
if (bufSize + litSize + 1 > STRING_BUFF_SIZE) end = end->next;
{ }
printf("Buffer not big enough!");
return NULL;
}
strncat(stringBuff, l->token->literal, strlen(l->token->literal));
if (l->next == NULL || l->next->token->type == TT_NEWLINE) char* strbuff = malloc(len+1);
{ strbuff[0] = '\0';
break;
}
while(l != end) {
strncat(strbuff, l->token->literal, l->token->length);
l = l->next; l = l->next;
} }
@ -136,9 +138,72 @@ parseHeader(TokenList** list)
} }
retval->next = NULL; retval->next = NULL;
retval->rawText = stringBuff; retval->rawText = strbuff;
return retval; return (Node*)retval;
}
Node*
parseCodeBlock(TokenList** list)
{
TokenList* l = *list;
// find closing ticks
int tlen = 0; // number of tokens
int clen = 0; // number of characters
l = l->next; // skip past the opening triple backtick
// skip the first newline
if (l->token->type == TT_NEWLINE) {
l = l->next;
}
while (l->next != NULL && l->next->token->type != TT_TRIPLEBACKTICK) {
if (l->next->token->type == TT_EOF) {
printf("premature EOF");
ErrorNode* err = malloc(sizeof(ErrorNode));
err->type = NT_Error;
err->next = NULL;
err->error = "premature EOF searching for closing triple backtick";
return (Node*)err;
}
tlen++;
clen += l->token->length;
l = l->next;
}
l = *list;
printf("codeblock token length: %d\n", tlen);
printf("codeblock char length: %d\n", clen);
printf("malloc(%ld)\n", sizeof(char)*clen+1);
char* strbuff = malloc(sizeof(char)*clen+1);
strbuff[0] = '\0';
int i;
l = l->next; // skip past the opening triple backtick
// skip the first newline
if (l->token->type == TT_NEWLINE) {
l = l->next;
}
for(i = 0; i < tlen; i++) {
strncat(strbuff, l->token->literal, l->token->length);
l = l->next;
}
// skip past closing triple backtick
*list = l->next;
printf("malloc(%ld)\n", sizeof(CodeBlockNode));
CodeBlockNode* ret = malloc(sizeof(CodeBlockNode));
ret->type = NT_BlockCode;
ret->rawText = strbuff;
ret->next = NULL;
return (Node*)ret;
} }
char* char*
@ -169,6 +234,8 @@ NodeTypeString(NodeType t)
return "NT_Bold"; return "NT_Bold";
case NT_Underline: case NT_Underline:
return "NT_Underline"; return "NT_Underline";
case NT_Error:
return "NT_Error";
default: default:
snprintf(stringBuff, 1000, "unknown NodeType: %d", t); snprintf(stringBuff, 1000, "unknown NodeType: %d", t);

27
node.h
View File

@ -6,18 +6,29 @@
#define NODE_H #define NODE_H
typedef enum { typedef enum {
// Stand-alone elements
// cannot contain text modifiers
NT_Header1, NT_Header1,
NT_Header2, NT_Header2,
NT_Header3, NT_Header3,
NT_Header4, NT_Header4,
NT_BlockCode,
// Container elements
// can contain text modifiers
NT_Paragraph, NT_Paragraph,
NT_UnorderedList, NT_UnorderedList,
NT_OrderedList, NT_OrderedList,
NT_InlineCode,
NT_BlockCode,
NT_BlockQuote, NT_BlockQuote,
// Contained elements (cannot be bare)
// text modifiers
NT_InlineCode,
NT_Bold, NT_Bold,
NT_Underline, NT_Underline,
// something went wrong
NT_Error,
} NodeType; } NodeType;
struct NodeList; struct NodeList;
@ -40,6 +51,18 @@ typedef struct {
char* rawText; char* rawText;
} HeaderNode; } HeaderNode;
/*
 * Node for a fenced (triple-backtick) code block.
 * Instances are passed around as Node* and cast back by the consumer, so the
 * field order must not be changed.
 * NOTE(review): presumably type/next mirror the leading fields of Node - confirm.
 */
typedef struct {
NodeType type;      // NT_BlockCode for nodes built by parseCodeBlock
struct Node* next;  // next node; parseCodeBlock initialises it to NULL
char* rawText;      // text of the block; parseCodeBlock stores a malloc'd buffer here
} CodeBlockNode;
/*
 * Node reporting that parsing went wrong.
 * Instances are passed around as Node* and cast back by the consumer, so the
 * field order must not be changed.
 * NOTE(review): presumably type/next mirror the leading fields of Node - confirm.
 */
typedef struct {
NodeType type;      // NT_Error
struct Node* next;  // next node; parseCodeBlock initialises it to NULL
char* error;        // human-readable message; parseCodeBlock points this at a string literal
} ErrorNode;
/* /*
typedef struct { typedef struct {
NodeType type; NodeType type;

View File

@ -63,6 +63,8 @@ TokenTypeString(TokenType tt)
return "TT_PERIOD"; return "TT_PERIOD";
case TT_BACKTICK: case TT_BACKTICK:
return "TT_BACKTICK"; return "TT_BACKTICK";
case TT_TRIPLEBACKTICK:
return "TT_TRIPLEBACKTICK";
case TT_WHITESPACE: case TT_WHITESPACE:
return "TT_WHITESPACE"; return "TT_WHITESPACE";
case TT_NEWLINE: case TT_NEWLINE:

View File

@ -11,6 +11,7 @@ typedef enum {
TT_DASH, TT_DASH,
TT_PERIOD, TT_PERIOD,
TT_BACKTICK, TT_BACKTICK,
TT_TRIPLEBACKTICK,
TT_WHITESPACE, TT_WHITESPACE,
TT_NEWLINE, TT_NEWLINE,
TT_WORD, TT_WORD,
@ -22,7 +23,7 @@ typedef struct Token {
char* literal; char* literal;
int line; int line;
int column; int column;
char* printBuff; int length;
} Token; } Token;
typedef struct TokenList { typedef struct TokenList {