Add codeblocks; Fix some stuff

- Parse code blocks that are wrapped in sets of three backticks. - Added a TT_TRIPLEBACKTICK token. - Added a length field to the Token struct. - Added an error node. - Cleaned up some file read code. - Fixed the header raw text (don't reuse the same buffer for each node).
2023-10-15 20:55:12 -04:00 · 2023-10-15 20:55:12 -04:00 · d855df380b
parent ca6bdca7ab
commit d855df380b
7 changed files with 180 additions and 66 deletions
--- a/3
+++ b/3
@ -1,6 +1,7 @@
 CC=gcc
-CFLAGS=-Wall -pedantic -Werror -std=c99
+CFLAGS=-Wall -pedantic -Werror -std=c99 -g -O0
 #-fsanitize=address
 OBJ=main.o lexer.o token.o node.o
--- a/lexer.c
+++ b/lexer.c
@ -10,6 +10,7 @@ static char* readNumber(Lexer* l);
 static int isLetter(char c);
 static int isDigit(char c);
 static Token* newTickToken(Lexer* l);
 static Token* newToken(Lexer* l, TokenType tt);
 static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
@ -33,8 +34,11 @@ NewLexer(const char* filename)
    Lexer* state = malloc(sizeof(Lexer));
    state->rawFile = malloc((sizeof(char) * fileSize) + 1);
    state->rawLen = fileSize;
    state->readPosition = 0;
    size_t read = fread(state->rawFile, sizeof(char), fileSize, fp);
    fclose(fp);
    if (read != fileSize)
    {
        printf("something borked.  only read %d bytes of %d\n", (int)read, fileSize);
@ -44,7 +48,6 @@ NewLexer(const char* filename)
        return NULL;
    }
    fclose(fp);
    state->rawFile[fileSize] = '\0';
    state->line = 1;
@ -80,8 +83,11 @@ NextToken(Lexer* l)
        case '.':
            tok = newToken(l, TT_PERIOD);
            break;
        case '>':
            tok = newToken(l, TT_GT);
            break;
        case '`':
-            tok = newToken(l, TT_BACKTICK);
+            tok = newTickToken(l);
            break;
        case '\0':
            tok = newToken(l, TT_EOF);
@ -161,7 +167,6 @@ readIdentifier(Lexer* l)
    return out;
 }
 static
 void
 readChar(Lexer* l)
@ -180,11 +185,6 @@ readChar(Lexer* l)
    l->readPosition++;
 }
 /* Parse takes a lexer but has an empty body: it currently does nothing. */
 void
 Parse(Lexer* l)
 {
 }
 int
 isLetter(char ch)
 {
@ -204,6 +204,39 @@ FreeToken(Token* t)
    free(t);
 }
 /*
  * newTickToken - build the token for a backtick at the lexer's current position.
  *
  * If the current character and the two after it are all backticks, returns a
  * TT_TRIPLEBACKTICK token (length 3) and calls readChar twice to consume the
  * two trailing backticks (presumably the caller consumes the first one --
  * confirm in NextToken). Otherwise falls back to newToken(l, TT_BACKTICK).
  *
  * Returns NULL if memory for the token cannot be allocated.
  */
 static
 Token*
 newTickToken(Lexer* l)
 {
    printf("backticks @ %d:%d\n", l->line, l->column);
    // Fewer than three characters left in the buffer: cannot be a fence.
    if (l->position+3 > l->rawLen) {
        printf("premature EOF parsing ticks\n");
        return newToken(l, TT_BACKTICK);
    }
    // Check the current character plus the next two.
    for (int i = 0; i < 3; i++) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended by %02X.
        unsigned char c = (unsigned char)l->rawFile[l->position+i];
        if (c != '`') {
            // Report the offset that was actually inspected (position, not readPosition).
            printf("next char isn't a backtick @ %d: 0x%02X '%c'\n", (int)(l->position+i), c, c);
            return newToken(l, TT_BACKTICK);
        }
    }
    Token* tok = malloc(sizeof *tok);
    // Heap-allocate the literal rather than pointing at a string constant, so it
    // can be released the same way as other token literals.
    // NOTE(review): assumes FreeToken frees tok->literal -- confirm.
    char* lit = malloc(4);
    if (tok == NULL || lit == NULL) {
        free(lit);
        free(tok);
        return NULL;
    }
    lit[0] = '`';
    lit[1] = '`';
    lit[2] = '`';
    lit[3] = '\0';
    tok->line = l->line;
    tok->column = l->column;
    tok->literal = lit;
    tok->type = TT_TRIPLEBACKTICK;
    tok->length = 3;
    // Consume the second and third backtick.
    readChar(l);
    readChar(l);
    return tok;
 }
 static
 Token*
 newToken(Lexer* l,
@ -217,6 +250,7 @@ newToken(Lexer* l,
    tok->literal = nc;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = 1;
    return tok;
 }
@ -231,5 +265,6 @@ newIdentToken(Lexer* l,
    tok->literal = literal;
    tok->line = l->line;
    tok->column = l->column;
    tok->length = strlen(literal);
    return tok;
 }
--- a/main.c
+++ b/main.c
@ -7,35 +7,6 @@
 #include "lexer.h"
 #include "node.h"
 /*
 *  RawText ""
 *  LineNumber 0
 *  NodeType NT_Root
 *  ChildNodes
 *      RawText "# Header1"
 *      LineNumber 1
 *      NodeType NT_Header1
 *      ChildNodes
 *          {"Some text."}
 *
 *      RawText "## Header2"
 */
 /*
 * NodeType NT_Root
 * ChildNodes
 *      RawText "## Header2"
 *      ChildNodes
 *          paragraph
 *          ChildNodes
 *              {*bold text*}
 *              {_underlined text_}
 *          paragraph
 *
 *
 */
 //Node* ParseLine(char *buffer);
 void writeTokenFile(TokenList* tl);
 int
@ -87,10 +58,24 @@ main(int argc, const char** argv)
            case NT_Header2:
            case NT_Header3:
            case NT_Header4:
-            {
+                {
-                HeaderNode* hnode = (HeaderNode*)node;
+                    HeaderNode* hnode = (HeaderNode*)node;
-                printf("{HeaderNode type:%s text:%s}\n", NodeTypeString(hnode->type), hnode->rawText);
+                    printf("{HeaderNode type:%s text:%s}\n", NodeTypeString(hnode->type), hnode->rawText);
-            }
+                }
                break;
            case NT_BlockCode:
                {
                    CodeBlockNode* cnode = (CodeBlockNode*)node;
                    printf("{CodeBlockNode text:%s}\n", cnode->rawText);
                }
                break;
            case NT_Error:
                {
                    ErrorNode* enode = (ErrorNode*)node;
                    printf("{ErrorNode error:%s}\n", enode->error);
                }
                break;
            default:
--- a/node.c
+++ b/node.c
@ -7,7 +7,8 @@
 static char stringBuff[STRING_BUFF_SIZE];
-HeaderNode* parseHeader(TokenList** list);
+Node* parseHeader(TokenList** list);
 Node* parseCodeBlock(TokenList** list);
 NodeList*
 ParseNodes(TokenList* list)
@ -32,13 +33,18 @@ ParseNodes(TokenList* list)
                break;
            case TT_HASH:
                // start of header
-                currentNode = (Node*)parseHeader(&currentToken);
+                currentNode = parseHeader(&currentToken);
                break;
            case TT_TRIPLEBACKTICK:
                currentNode = parseCodeBlock(&currentToken);
                break;
            case TT_EOF:
                printf("EOF found\n");
                return nl;
-            default:
+
            default:    // paragraph start?
                break;
        }
@ -65,7 +71,7 @@ ParseNodes(TokenList* list)
    return nl;
 }
-HeaderNode*
+Node*
 parseHeader(TokenList** list)
 {
    TokenList* l = *list;
@ -96,23 +102,19 @@ parseHeader(TokenList** list)
        return NULL;
    }
-    stringBuff[0] = '\0';
+    TokenList* end = l;
-    while (1)
+    int len = 0;
-    {
+    // find header text size
-        int bufSize = strlen(stringBuff);
+    while (end->token->type != TT_NEWLINE && end->token->type != TT_EOF) {
-        int litSize = strlen(l->token->literal);
+        len += end->token->length;
-        if (bufSize + litSize + 1 > STRING_BUFF_SIZE)
+        end = end->next;
-        {
+    }
            printf("Buffer not big enough!");
            return NULL;
        }
        strncat(stringBuff, l->token->literal, strlen(l->token->literal));
-        if (l->next == NULL || l->next->token->type == TT_NEWLINE)
+    char* strbuff = malloc(len+1);
-        {
+    strbuff[0] = '\0';
            break;
        }
    while(l != end) {
        strncat(strbuff, l->token->literal, l->token->length);
        l = l->next;
    }
@ -136,9 +138,72 @@ parseHeader(TokenList** list)
    }
    retval->next = NULL;
-    retval->rawText = stringBuff;
+    retval->rawText = strbuff;
-    return retval;
+    return (Node*)retval;
 }
 /*
  * parseCodeBlock - build a CodeBlockNode from a triple-backtick code block.
  *
  * list: in/out cursor. On entry *list is the opening TT_TRIPLEBACKTICK token.
  *       On success *list is moved to the closing TT_TRIPLEBACKTICK token
  *       (presumably the caller steps past it -- confirm in ParseNodes).
  *       On failure *list is left untouched.
  *
  * The node's rawText is a heap-allocated concatenation of the token literals
  * between the fences. One newline directly after the opening fence and one
  * newline directly before the closing fence are not included.
  *
  * Returns a CodeBlockNode on success, an ErrorNode if EOF or the end of the
  * token list is reached before a closing fence, or NULL if allocation fails.
  */
 Node*
 parseCodeBlock(TokenList** list)
 {
    // First content token: the one after the opening fence, skipping a single
    // newline that directly follows the fence.
    TokenList* first = (*list)->next;
    if (first != NULL && first->token->type == TT_NEWLINE) {
        first = first->next;
    }

    // Pass 1: locate the closing fence, counting tokens and characters.
    int tlen = 0;   // number of tokens
    int clen = 0;   // number of characters
    TokenList* last = NULL;     // last content token seen before the fence
    TokenList* fence = first;
    while (fence != NULL
           && fence->token->type != TT_TRIPLEBACKTICK
           && fence->token->type != TT_EOF) {
        tlen++;
        clen += fence->token->length;
        last = fence;
        fence = fence->next;
    }

    // Ran out of input (EOF token or end of list) without a closing fence.
    if (fence == NULL || fence->token->type == TT_EOF) {
        printf("premature EOF");
        ErrorNode* err = malloc(sizeof *err);
        if (err == NULL) {
            return NULL;
        }
        err->type = NT_Error;
        err->next = NULL;
        err->error = "premature EOF searching for closing triple backtick";
        return (Node*)err;
    }

    // Drop the newline that sits right before the closing fence, if any. Any
    // other trailing token is real content and is kept.
    if (last != NULL && last->token->type == TT_NEWLINE) {
        tlen--;
        clen -= last->token->length;
    }

    size_t bytes = (size_t)clen + 1;    // +1 for the terminating NUL
    printf("codeblock token length: %d\n", tlen);
    printf("codeblock char length: %d\n", clen);
    printf("malloc(%zu)\n", bytes);
    char* strbuff = malloc(bytes);
    if (strbuff == NULL) {
        return NULL;
    }
    strbuff[0] = '\0';

    // Pass 2: concatenate the literals of the counted tokens.
    TokenList* l = first;
    for (int i = 0; i < tlen; i++) {
        strncat(strbuff, l->token->literal, l->token->length);
        l = l->next;
    }

    printf("malloc(%zu)\n", sizeof(CodeBlockNode));
    CodeBlockNode* ret = malloc(sizeof *ret);
    if (ret == NULL) {
        free(strbuff);
        return NULL;
    }
    ret->type = NT_BlockCode;
    ret->rawText = strbuff;
    ret->next = NULL;

    // Leave the cursor on the closing fence.
    *list = fence;
    return (Node*)ret;
 }
 char*
@ -169,6 +234,8 @@ NodeTypeString(NodeType t)
            return "NT_Bold";
        case NT_Underline:
            return "NT_Underline";
        case NT_Error:
            return "NT_Error";
        default:
            snprintf(stringBuff, 1000, "unknown NodeType: %d", t);
--- a/node.h
+++ b/node.h
@ -6,18 +6,29 @@
 #define NODE_H
 /*
  * NodeType tags every parsed node. Each enumerator is declared exactly once,
  * grouped by how the element may nest.
  */
 typedef enum {
    // Stand-alone elements
    // cannot contain text modifiers
    NT_Header1,
    NT_Header2,
    NT_Header3,
    NT_Header4,
    NT_BlockCode,
    // Container elements
    // can contain text modifiers
    NT_Paragraph,
    NT_UnorderedList,
    NT_OrderedList,
    NT_BlockQuote,
    // Contained elements (cannot be bare)
    // text modifiers
    NT_InlineCode,
    NT_Bold,
    NT_Underline,
    // something went wrong
    NT_Error,
 } NodeType;
 struct NodeList;
@ -40,6 +51,18 @@ typedef struct {
    char* rawText;
 } HeaderNode;
 /*
  * Node for a triple-backtick code block, created by parseCodeBlock.
  * Instances are cast to and from Node*, so the field order must not change.
  */
 typedef struct {
    NodeType type;      // set to NT_BlockCode by parseCodeBlock
    struct Node* next;  // set to NULL by parseCodeBlock
    char* rawText;      // malloc'd, NUL-terminated text built from the block's token literals
 } CodeBlockNode;
 /*
  * Node that reports a parse failure (type NT_Error).
  * Instances are cast to and from Node*, so the field order must not change.
  */
 typedef struct {
    NodeType type;      // set to NT_Error by parseCodeBlock
    struct Node* next;  // set to NULL by parseCodeBlock
    char* error;        // message text; parseCodeBlock stores a string literal here
 } ErrorNode;
 /*
 typedef struct {
    NodeType type;
--- a/token.c
+++ b/token.c
@ -63,6 +63,8 @@ TokenTypeString(TokenType tt)
        return "TT_PERIOD";
    case TT_BACKTICK:
        return "TT_BACKTICK";
    case TT_TRIPLEBACKTICK:
        return "TT_TRIPLEBACKTICK";
    case TT_WHITESPACE:
        return "TT_WHITESPACE";
    case TT_NEWLINE:
--- a/token.h
+++ b/token.h
@ -11,6 +11,7 @@ typedef enum {
    TT_DASH,
    TT_PERIOD,
    TT_BACKTICK,
    TT_TRIPLEBACKTICK,
    TT_WHITESPACE,
    TT_NEWLINE,
    TT_WORD,
@ -22,7 +23,7 @@ typedef struct Token {
    char* literal;
    int line;
    int column;
-    char* printBuff;
+    int length;
 } Token;
 typedef struct TokenList {