Add codeblocks; Fix some stuff

- Parse code blocks that are wrapped in sets of three backticks.
- Added a TT_TRIPLEBACKTICK token.
- Added a length field to the Token struct.
- Added an error node.
- Cleaned up some file read code.
- Fixed the header raw text (don't reuse the same buffer for each node).
This commit is contained in:
Zorchenhimer 2023-10-15 20:55:12 -04:00
parent ca6bdca7ab
commit d855df380b
7 changed files with 180 additions and 66 deletions

View File

@ -1,6 +1,7 @@
CC=gcc
CFLAGS=-Wall -pedantic -Werror -std=c99
CFLAGS=-Wall -pedantic -Werror -std=c99 -g -O0
#-fsanitize=address
OBJ=main.o lexer.o token.o node.o

51
lexer.c
View File

@ -10,6 +10,7 @@ static char* readNumber(Lexer* l);
static int isLetter(char c);
static int isDigit(char c);
static Token* newTickToken(Lexer* l);
static Token* newToken(Lexer* l, TokenType tt);
static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
@ -33,8 +34,11 @@ NewLexer(const char* filename)
Lexer* state = malloc(sizeof(Lexer));
state->rawFile = malloc((sizeof(char) * fileSize) + 1);
state->rawLen = fileSize;
state->readPosition = 0;
size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);
fclose(fp);
if (read != fileSize)
{
printf("something borked. only read %d bytes of %d\n", (int)read, fileSize);
@ -44,7 +48,6 @@ NewLexer(const char* filename)
return NULL;
}
fclose(fp);
state->rawFile[fileSize] = '\0';
state->line = 1;
@ -80,8 +83,11 @@ NextToken(Lexer* l)
case '.':
tok = newToken(l, TT_PERIOD);
break;
case '>':
tok = newToken(l, TT_GT);
break;
case '`':
tok = newToken(l, TT_BACKTICK);
tok = newTickToken(l);
break;
case '\0':
tok = newToken(l, TT_EOF);
@ -161,7 +167,6 @@ readIdentifier(Lexer* l)
return out;
}
static
void
readChar(Lexer* l)
@ -180,11 +185,6 @@ readChar(Lexer* l)
l->readPosition++;
}
void
Parse(Lexer* l)
{
}
int
isLetter(char ch)
{
@ -204,6 +204,39 @@ FreeToken(Token* t)
free(t);
}
/*
 * Builds the token for a backtick found at the lexer's current position.
 *
 * When the current character and the two that follow it are all backticks,
 * a TT_TRIPLEBACKTICK token of length 3 is returned and the lexer is moved
 * forward over two of them with readChar(). In every other case (not enough
 * input left, or one of the three characters is something else) the result
 * of newToken(l, TT_BACKTICK) is returned and the lexer is not advanced here.
 *
 * Returns NULL if memory for the triple-backtick token cannot be allocated.
 */
static
Token*
newTickToken(Lexer* l)
{
    printf("backticks @ %d:%d\n", l->line, l->column);

    // Three characters starting at `position` must fit inside the buffer.
    if (l->position+3 > l->rawLen) {
        printf("premature EOF parsing ticks\n");
        return newToken(l, TT_BACKTICK);
    }

    // Check the current character plus the next two.
    for (int i = 0; i < 3; i++) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended
        // when printed with %02X.
        unsigned char c = (unsigned char)l->rawFile[l->position+i];
        if (c != '`') {
            // Report the offset that was actually inspected.
            printf("next char isn't a backtick @ %d: 0x%02X '%c'\n",
                    (int)(l->position+i), (unsigned)c, c);
            return newToken(l, TT_BACKTICK);
        }
    }

    // Give the token its own heap copy of the literal instead of pointing it
    // at a string constant.
    // NOTE(review): assumes token literals are heap-owned like the ones
    // produced by readIdentifier() -- confirm against FreeToken().
    Token* tok = malloc(sizeof *tok);
    char* lit = malloc(4);
    if (tok == NULL || lit == NULL) {
        printf("out of memory allocating triple backtick token\n");
        free(lit);
        free(tok);
        return NULL;
    }
    memcpy(lit, "```", 4);

    tok->line = l->line;
    tok->column = l->column;
    tok->literal = lit;
    tok->type = TT_TRIPLEBACKTICK;
    tok->length = 3;

    // Consume two of the three backticks here.
    // NOTE(review): presumably the caller consumes the remaining one, as it
    // does for single-character tokens -- confirm in NextToken().
    readChar(l);
    readChar(l);
    return tok;
}
static
Token*
newToken(Lexer* l,
@ -217,6 +250,7 @@ newToken(Lexer* l,
tok->literal = nc;
tok->line = l->line;
tok->column = l->column;
tok->length = 1;
return tok;
}
@ -231,5 +265,6 @@ newIdentToken(Lexer* l,
tok->literal = literal;
tok->line = l->line;
tok->column = l->column;
tok->length = strlen(literal);
return tok;
}

43
main.c
View File

@ -7,35 +7,6 @@
#include "lexer.h"
#include "node.h"
/*
* RawText ""
* LineNumber 0
* NodeType NT_Root
* ChildNodes
* RawText "# Header1"
* LineNumber 1
* NodeType NT_Header1
* ChildNodes
* {"Some text."}
*
* RawText "## Header2"
*/
/*
* NodeType NT_Root
* ChildNodes
* RawText "## Header2"
* ChildNodes
* paragraph
* ChildNodes
* {*bold text*}
* {_underlined text_}
* paragraph
*
*
*/
//Node* ParseLine(char *buffer);
void writeTokenFile(TokenList* tl);
int
@ -93,6 +64,20 @@ main(int argc, const char** argv)
}
break;
case NT_BlockCode:
{
CodeBlockNode* cnode = (CodeBlockNode*)node;
printf("{CodeBlockNode text:%s}\n", cnode->rawText);
}
break;
case NT_Error:
{
ErrorNode* enode = (ErrorNode*)node;
printf("{ErrorNode error:%s}\n", enode->error);
}
break;
default:
printf("%s\n", NodeTypeString(node->type));
}

109
node.c
View File

@ -7,7 +7,8 @@
static char stringBuff[STRING_BUFF_SIZE];
HeaderNode* parseHeader(TokenList** list);
Node* parseHeader(TokenList** list);
Node* parseCodeBlock(TokenList** list);
NodeList*
ParseNodes(TokenList* list)
@ -32,13 +33,18 @@ ParseNodes(TokenList* list)
break;
case TT_HASH:
// start of header
currentNode = (Node*)parseHeader(&currentToken);
currentNode = parseHeader(&currentToken);
break;
case TT_TRIPLEBACKTICK:
currentNode = parseCodeBlock(&currentToken);
break;
case TT_EOF:
printf("EOF found\n");
return nl;
default:
default: // paragraph start?
break;
}
@ -65,7 +71,7 @@ ParseNodes(TokenList* list)
return nl;
}
HeaderNode*
Node*
parseHeader(TokenList** list)
{
TokenList* l = *list;
@ -96,23 +102,19 @@ parseHeader(TokenList** list)
return NULL;
}
stringBuff[0] = '\0';
while (1)
{
int bufSize = strlen(stringBuff);
int litSize = strlen(l->token->literal);
if (bufSize + litSize + 1 > STRING_BUFF_SIZE)
{
printf("Buffer not big enough!");
return NULL;
}
strncat(stringBuff, l->token->literal, strlen(l->token->literal));
if (l->next == NULL || l->next->token->type == TT_NEWLINE)
{
break;
TokenList* end = l;
int len = 0;
// find header text size
while (end->token->type != TT_NEWLINE && end->token->type != TT_EOF) {
len += end->token->length;
end = end->next;
}
char* strbuff = malloc(len+1);
strbuff[0] = '\0';
while(l != end) {
strncat(strbuff, l->token->literal, l->token->length);
l = l->next;
}
@ -136,9 +138,72 @@ parseHeader(TokenList** list)
}
retval->next = NULL;
retval->rawText = stringBuff;
retval->rawText = strbuff;
return retval;
return (Node*)retval;
}
Node*
parseCodeBlock(TokenList** list)
{
TokenList* l = *list;
// find closing ticks
int tlen = 0; // number of tokens
int clen = 0; // number of characters
l = l->next; // skip past the opening triple backtick
// skip the first newline
if (l->token->type == TT_NEWLINE) {
l = l->next;
}
while (l->next != NULL && l->next->token->type != TT_TRIPLEBACKTICK) {
if (l->next->token->type == TT_EOF) {
printf("premature EOF");
ErrorNode* err = malloc(sizeof(ErrorNode));
err->type = NT_Error;
err->next = NULL;
err->error = "premature EOF searching for closing triple backtick";
return (Node*)err;
}
tlen++;
clen += l->token->length;
l = l->next;
}
l = *list;
printf("codeblock token length: %d\n", tlen);
printf("codeblock char length: %d\n", clen);
printf("malloc(%ld)\n", sizeof(char)*clen+1);
char* strbuff = malloc(sizeof(char)*clen+1);
strbuff[0] = '\0';
int i;
l = l->next; // skip past the opening triple backtick
// skip the first newline
if (l->token->type == TT_NEWLINE) {
l = l->next;
}
for(i = 0; i < tlen; i++) {
strncat(strbuff, l->token->literal, l->token->length);
l = l->next;
}
// skip past closing triple backtick
*list = l->next;
printf("malloc(%ld)\n", sizeof(CodeBlockNode));
CodeBlockNode* ret = malloc(sizeof(CodeBlockNode));
ret->type = NT_BlockCode;
ret->rawText = strbuff;
ret->next = NULL;
return (Node*)ret;
}
char*
@ -169,6 +234,8 @@ NodeTypeString(NodeType t)
return "NT_Bold";
case NT_Underline:
return "NT_Underline";
case NT_Error:
return "NT_Error";
default:
snprintf(stringBuff, 1000, "unknown NodeType: %d", t);

27
node.h
View File

@ -6,18 +6,29 @@
#define NODE_H
typedef enum {
// Stand-alone elements
// cannot contain text modifiers
NT_Header1,
NT_Header2,
NT_Header3,
NT_Header4,
NT_BlockCode,
// Container elements
// can contain text modifiers
NT_Paragraph,
NT_UnorderedList,
NT_OrderedList,
NT_InlineCode,
NT_BlockCode,
NT_BlockQuote,
// Contained elements (cannot be bare)
// text modifiers
NT_InlineCode,
NT_Bold,
NT_Underline,
// something went wrong
NT_Error,
} NodeType;
struct NodeList;
@ -40,6 +51,18 @@ typedef struct {
char* rawText;
} HeaderNode;
// Node for a fenced code block (type NT_BlockCode).
// Pointers to this struct are cast to Node*, so the field order must not be
// changed; presumably `type` and `next` mirror the start of Node -- confirm.
typedef struct {
// Node kind; parseCodeBlock sets this to NT_BlockCode.
NodeType type;
// Link to the following node; NULL when freshly created.
struct Node* next;
// Heap-allocated, NUL-terminated text found between the fences.
char* rawText;
} CodeBlockNode;
// Node reporting that parsing went wrong (type NT_Error).
// Pointers to this struct are cast to Node*, so the field order must not be
// changed; presumably `type` and `next` mirror the start of Node -- confirm.
typedef struct {
// Node kind; set to NT_Error where the node is created.
NodeType type;
// Link to the following node; NULL when freshly created.
struct Node* next;
// Human-readable description of the failure. parseCodeBlock stores a
// string constant here, so it must not be freed or modified.
char* error;
} ErrorNode;
/*
typedef struct {
NodeType type;

View File

@ -63,6 +63,8 @@ TokenTypeString(TokenType tt)
return "TT_PERIOD";
case TT_BACKTICK:
return "TT_BACKTICK";
case TT_TRIPLEBACKTICK:
return "TT_TRIPLEBACKTICK";
case TT_WHITESPACE:
return "TT_WHITESPACE";
case TT_NEWLINE:

View File

@ -11,6 +11,7 @@ typedef enum {
TT_DASH,
TT_PERIOD,
TT_BACKTICK,
TT_TRIPLEBACKTICK,
TT_WHITESPACE,
TT_NEWLINE,
TT_WORD,
@ -22,7 +23,7 @@ typedef struct Token {
char* literal;
int line;
int column;
char* printBuff;
int length;
} Token;
typedef struct TokenList {