From d855df380b5cec0dc0e84e8cba2235f73958b756 Mon Sep 17 00:00:00 2001 From: Zorchenhimer Date: Sun, 15 Oct 2023 20:55:12 -0400 Subject: [PATCH] Add codeblocks; Fix some stuff - Parse code blocks that are wrapped sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node). --- Makefile | 3 +- lexer.c | 51 ++++++++++++++++++++++---- main.c | 51 +++++++++----------------- node.c | 109 ++++++++++++++++++++++++++++++++++++++++++++----------- node.h | 27 +++++++++++++- token.c | 2 + token.h | 3 +- 7 files changed, 180 insertions(+), 66 deletions(-) diff --git a/Makefile b/Makefile index 7cca081..bacddda 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ CC=gcc -CFLAGS=-Wall -pedantic -Werror -std=c99 +CFLAGS=-Wall -pedantic -Werror -std=c99 -g -O0 +#-fsanitize=address OBJ=main.o lexer.o token.o node.o diff --git a/lexer.c b/lexer.c index 03757f4..86344d0 100644 --- a/lexer.c +++ b/lexer.c @@ -10,6 +10,7 @@ static char* readNumber(Lexer* l); static int isLetter(char c); static int isDigit(char c); +static Token* newTickToken(Lexer* l); static Token* newToken(Lexer* l, TokenType tt); static Token* newIdentToken(Lexer* l, char* literal, TokenType tt); @@ -33,8 +34,11 @@ NewLexer(const char* filename) Lexer* state = malloc(sizeof(Lexer)); state->rawFile = malloc((sizeof(char) * fileSize) + 1); state->rawLen = fileSize; + state->readPosition = 0; size_t read = fread(state->rawFile, sizeof(char), fileSize, fp); + fclose(fp); + if (read != fileSize) { printf("something borked. 
only read %d bytes of %d\n", (int)read, fileSize); @@ -44,7 +48,6 @@ NewLexer(const char* filename) return NULL; } - fclose(fp); state->rawFile[fileSize] = '\0'; state->line = 1; @@ -80,8 +83,11 @@ NextToken(Lexer* l) case '.': tok = newToken(l, TT_PERIOD); break; + case '>': + tok = newToken(l, TT_GT); + break; case '`': - tok = newToken(l, TT_BACKTICK); + tok = newTickToken(l); break; case '\0': tok = newToken(l, TT_EOF); @@ -161,7 +167,6 @@ readIdentifier(Lexer* l) return out; } - static void readChar(Lexer* l) @@ -180,11 +185,6 @@ readChar(Lexer* l) l->readPosition++; } -void -Parse(Lexer* l) -{ -} - int isLetter(char ch) { @@ -204,6 +204,39 @@ FreeToken(Token* t) free(t); } +static +Token* +newTickToken(Lexer* l) +{ + + printf("backticks @ %d:%d\n", l->line, l->column); + + if (l->position+3 > l->rawLen) { + printf("premature EOF parsing ticks\n"); + return newToken(l, TT_BACKTICK); + } + + // peek up to two more characters + int i; + for(i = 0; i < 3; i++) { + + if (l->rawFile[l->position+i] != '`') { + printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", l->readPosition+i, l->rawFile[l->position+i], l->rawFile[l->position+i]); + return newToken(l, TT_BACKTICK); + } + } + + Token* tok = malloc(sizeof(Token)); + tok->line = l->line; + tok->column = l->column; + tok->literal = "```"; + tok->type = TT_TRIPLEBACKTICK; + tok->length = 3; + readChar(l); + readChar(l); + return tok; +} + static Token* newToken(Lexer* l, @@ -217,6 +250,7 @@ newToken(Lexer* l, tok->literal = nc; tok->line = l->line; tok->column = l->column; + tok->length = 1; return tok; } @@ -231,5 +265,6 @@ newIdentToken(Lexer* l, tok->literal = literal; tok->line = l->line; tok->column = l->column; + tok->length = strlen(literal); return tok; } diff --git a/main.c b/main.c index 9eefe79..de27c68 100644 --- a/main.c +++ b/main.c @@ -7,35 +7,6 @@ #include "lexer.h" #include "node.h" -/* - * RawText "" - * LineNumber 0 - * NodeType NT_Root - * ChildNodes - * RawText "# Header1" - * LineNumber 1 - 
* NodeType NT_Header1 - * ChildNodes - * {"Some text."} - * - * RawText "## Header2" - */ - -/* - * NodeType NT_Root - * ChildNodes - * RawText "## Header2" - * ChildNodes - * paragraph - * ChildNodes - * {*bold text*} - * {_underlined text_} - * paragraph - * - * - */ -//Node* ParseLine(char *buffer); - void writeTokenFile(TokenList* tl); int @@ -87,10 +58,24 @@ main(int argc, const char** argv) case NT_Header2: case NT_Header3: case NT_Header4: - { - HeaderNode* hnode = (HeaderNode*)node; - printf("{HeaderNode type:%s text:%s}\n", NodeTypeString(hnode->type), hnode->rawText); - } + { + HeaderNode* hnode = (HeaderNode*)node; + printf("{HeaderNode type:%s text:%s}\n", NodeTypeString(hnode->type), hnode->rawText); + } + break; + + case NT_BlockCode: + { + CodeBlockNode* cnode = (CodeBlockNode*)node; + printf("{CodeBlockNode text:%s}\n", cnode->rawText); + } + break; + + case NT_Error: + { + ErrorNode* enode = (ErrorNode*)node; + printf("{ErrorNode error:%s}\n", enode->error); + } break; default: diff --git a/node.c b/node.c index 295b975..d60a1e9 100644 --- a/node.c +++ b/node.c @@ -7,7 +7,8 @@ static char stringBuff[STRING_BUFF_SIZE]; -HeaderNode* parseHeader(TokenList** list); +Node* parseHeader(TokenList** list); +Node* parseCodeBlock(TokenList** list); NodeList* ParseNodes(TokenList* list) @@ -32,13 +33,18 @@ ParseNodes(TokenList* list) break; case TT_HASH: // start of header - currentNode = (Node*)parseHeader(&currentToken); + currentNode = parseHeader(&currentToken); + break; + + case TT_TRIPLEBACKTICK: + currentNode = parseCodeBlock(&currentToken); break; case TT_EOF: printf("EOF found\n"); return nl; - default: + + default: // paragraph start? 
break; } @@ -65,7 +71,7 @@ ParseNodes(TokenList* list) return nl; } -HeaderNode* +Node* parseHeader(TokenList** list) { TokenList* l = *list; @@ -96,23 +102,19 @@ parseHeader(TokenList** list) return NULL; } - stringBuff[0] = '\0'; - while (1) - { - int bufSize = strlen(stringBuff); - int litSize = strlen(l->token->literal); - if (bufSize + litSize + 1 > STRING_BUFF_SIZE) - { - printf("Buffer not big enough!"); - return NULL; - } - strncat(stringBuff, l->token->literal, strlen(l->token->literal)); + TokenList* end = l; + int len = 0; + // find header text size + while (end->token->type != TT_NEWLINE && end->token->type != TT_EOF) { + len += end->token->length; + end = end->next; + } - if (l->next == NULL || l->next->token->type == TT_NEWLINE) - { - break; - } + char* strbuff = malloc(len+1); + strbuff[0] = '\0'; + while(l != end) { + strncat(strbuff, l->token->literal, l->token->length); l = l->next; } @@ -136,9 +138,72 @@ parseHeader(TokenList** list) } retval->next = NULL; - retval->rawText = stringBuff; + retval->rawText = strbuff; - return retval; + return (Node*)retval; +} + +Node* +parseCodeBlock(TokenList** list) +{ + TokenList* l = *list; + // find closing ticks + int tlen = 0; // number of tokens + int clen = 0; // number of characters + l = l->next; // skip past the opening triple backtick + + // skip the first newline + if (l->token->type == TT_NEWLINE) { + l = l->next; + } + + while (l->next != NULL && l->next->token->type != TT_TRIPLEBACKTICK) { + if (l->next->token->type == TT_EOF) { + printf("premature EOF"); + + ErrorNode* err = malloc(sizeof(ErrorNode)); + err->type = NT_Error; + err->next = NULL; + err->error = "premature EOF searching for closing triple backtick"; + + return (Node*)err; + } + + tlen++; + clen += l->token->length; + l = l->next; + } + + l = *list; + + printf("codeblock token length: %d\n", tlen); + printf("codeblock char length: %d\n", clen); + + printf("malloc(%ld)\n", sizeof(char)*clen+1); + char* strbuff = 
malloc(sizeof(char)*clen+1); + strbuff[0] = '\0'; + int i; + l = l->next; // skip past the opening triple backtick + + // skip the first newline + if (l->token->type == TT_NEWLINE) { + l = l->next; + } + + for(i = 0; i < tlen; i++) { + strncat(strbuff, l->token->literal, l->token->length); + l = l->next; + } + + // skip past closing triple backtick + *list = l->next; + + printf("malloc(%ld)\n", sizeof(CodeBlockNode)); + CodeBlockNode* ret = malloc(sizeof(CodeBlockNode)); + ret->type = NT_BlockCode; + ret->rawText = strbuff; + ret->next = NULL; + return (Node*)ret; } char* @@ -169,6 +234,8 @@ NodeTypeString(NodeType t) return "NT_Bold"; case NT_Underline: return "NT_Underline"; + case NT_Error: + return "NT_Error"; default: snprintf(stringBuff, 1000, "unknown NodeType: %d", t); diff --git a/node.h b/node.h index acf3097..6e1c502 100644 --- a/node.h +++ b/node.h @@ -6,18 +6,29 @@ #define NODE_H typedef enum { + // Stand-alone elements + // cannot contain text modifiers NT_Header1, NT_Header2, NT_Header3, NT_Header4, + NT_BlockCode, + + // Container elements + // can contain text modifiers NT_Paragraph, NT_UnorderedList, NT_OrderedList, - NT_InlineCode, - NT_BlockCode, NT_BlockQuote, + + // Contained elements (cannot be bare) + // text modifiers + NT_InlineCode, NT_Bold, NT_Underline, + + // something went wrong + NT_Error, } NodeType; struct NodeList; @@ -40,6 +51,18 @@ typedef struct { char* rawText; } HeaderNode; +typedef struct { + NodeType type; + struct Node* next; + char* rawText; +} CodeBlockNode; + +typedef struct { + NodeType type; + struct Node* next; + char* error; +} ErrorNode; + /* typedef struct { NodeType type; diff --git a/token.c b/token.c index ca753bb..01c96a7 100644 --- a/token.c +++ b/token.c @@ -63,6 +63,8 @@ TokenTypeString(TokenType tt) return "TT_PERIOD"; case TT_BACKTICK: return "TT_BACKTICK"; + case TT_TRIPLEBACKTICK: + return "TT_TRIPLEBACKTICK"; case TT_WHITESPACE: return "TT_WHITESPACE"; case TT_NEWLINE: diff --git a/token.h b/token.h 
index fef4d0e..b5bb0d7 100644 --- a/token.h +++ b/token.h @@ -11,6 +11,7 @@ typedef enum { TT_DASH, TT_PERIOD, TT_BACKTICK, + TT_TRIPLEBACKTICK, TT_WHITESPACE, TT_NEWLINE, TT_WORD, @@ -22,7 +23,7 @@ typedef struct Token { char* literal; int line; int column; - char* printBuff; + int length; } Token; typedef struct TokenList {