Implement the paragraph node

This node handles both plain paragraphs and block quotes. Single newlines are turned into spaces; a double newline terminates the block. Care is taken to avoid consecutive whitespace tokens in the content while preserving the total number of spaces.
2023-10-22 22:29:38 -04:00 · 2023-10-22 22:29:38 -04:00 · c17eaf1ec4
parent 3d0bba21e1
commit c17eaf1ec4
7 changed files with 203 additions and 14 deletions
--- a/lexer.c
+++ b/lexer.c
@ -240,13 +240,6 @@ isDigit(char ch)
    return ('0' <= ch && ch <= '9');
 }

-void
-FreeToken(Token* t)
-{
-    free(t->literal);
-    free(t);
-}
-
 static
 Token*
 newTickToken(Lexer* l)
--- a/main.c
+++ b/main.c
@ -85,11 +85,33 @@ main(int argc, const char** argv)
                }
                break;

+            case NT_Paragraph:
+                {
+                    ParagraphNode* pnode = (ParagraphNode*)node;
+                    printf("{ParagraphNode ptype:%s}\n", ParagraphTypeString(pnode->ptype));
+                    Token* content = pnode->content;
+                    while(content != NULL)
+                    {
+                        if (content->type == TT_WHITESPACE)
+                        {
+                            printf(" ");
+                        }
+                        else
+                        {
+                            printf("%s", content->literal);
+                        }
+
+                        content = content->next;
+                    }
+                    printf("\n");
+
+                }
+                break;
+
            default:
                printf("%s\n", NodeTypeString(node->type));
        }

-
        node = node->next;
    }

--- a/node.c
+++ b/node.c
@ -9,6 +9,7 @@ static char stringBuff[STRING_BUFF_SIZE];

 Node* parseHeader(Token** firstToken);
 Node* parseCodeBlock(Token** firstToken);
+Node* parseParagraph(Token** startToken);

 Node*
 ParseNodes(Token* firstToken)
@ -23,7 +24,9 @@ ParseNodes(Token* firstToken)

        switch (currentToken->type) {
            case TT_NEWLINE:
+            case TT_WHITESPACE:
                break;
+
            case TT_HASH:
                // start of header
                currentNode = parseHeader(&currentToken);
@ -37,11 +40,11 @@ ParseNodes(Token* firstToken)
                return firstNode;

            default:    // paragraph start?
+                currentNode = parseParagraph(&currentToken);
                break;
        }

        if (currentToken->next == NULL) {
-            printf("currentToken->next == NULL\n");
            break;
        }

@ -193,6 +196,134 @@ parseCodeBlock(Token** startToken)
    return (Node*)ret;
 }

+// Unlinks `tok` from the paragraph's content list, frees it and returns the
+// token that followed it. `prev` is the token currently linking to `tok`;
+// when `prev` is NULL, `tok` is the head of the list and pnode->content is
+// advanced instead.
+static
+Token*
+dropParagraphToken(ParagraphNode* pnode, Token* prev, Token* tok)
+{
+    Token* following = tok->next;
+
+    if (prev != NULL)
+        prev->next = following;
+    else
+        pnode->content = following;
+
+    FreeToken(tok);
+    return following;
+}
+
+// Parses a plain paragraph or a block quote starting at *startToken.
+//
+// A leading TT_GT marks the paragraph as PT_Quote and is consumed. Inside
+// the paragraph a single newline is either dropped (when whitespace follows
+// it) or converted in place into a one-character whitespace token. The
+// paragraph ends at a second newline, TT_EOF, TT_TRIPLEBACKTICK or TT_GT
+// following a newline; a quote ends at the first line that does not start
+// with TT_GT. Adjacent whitespace tokens are merged into one token holding
+// all of their characters, and a trailing whitespace token is removed.
+//
+// On return *startToken is the first token that is not part of the
+// paragraph (NULL if the token list was exhausted) and the returned node
+// owns the tokens linked from ParagraphNode.content, which may be NULL for
+// an empty paragraph. Returns NULL if startToken/*startToken is NULL or the
+// node cannot be allocated; *startToken is left untouched in that case.
+//
+// NOTE(review): ParseNodes reads currentToken->next after this call, so a
+// NULL *startToken needs handling there -- confirm against the caller.
+Node*
+parseParagraph(Token** startToken)
+{
+    if (startToken == NULL || *startToken == NULL)
+        return NULL;
+
+    ParagraphNode* pnode = malloc(sizeof(ParagraphNode));
+    if (pnode == NULL)
+        return NULL;
+
+    pnode->next = NULL;
+    pnode->type = NT_Paragraph;
+    pnode->ptype = PT_Standard;
+    pnode->content = NULL;
+    Token* t = *startToken;
+
+    if (t->type == TT_GT) {
+        pnode->ptype = PT_Quote;
+        // consume TT_GT
+        Token* consumed = t;
+        t = t->next;
+        FreeToken(consumed);
+    }
+
+    pnode->content = t;
+    Token* prevToken = NULL;    // last token kept in the content list
+
+    while(t != NULL)
+    {
+
+        // Look for the end of the paragraph.
+        if (t->type == TT_NEWLINE && t->next != NULL)
+        {
+            if (t->next->type == TT_WHITESPACE)
+            {
+                // Consume the newline if the next one is a space. The helper
+                // also copes with the newline being the head of the list.
+                t = dropParagraphToken(pnode, prevToken, t);
+            }
+            else
+            {
+                // Convert this token into a whitespace character
+                t->literal[0] = ' ';
+                t->type = TT_WHITESPACE;
+                if (prevToken != NULL)
+                    prevToken->next = t;
+                prevToken = t;
+                t = t->next;
+            }
+            // t is non-NULL here: the newline had a successor.
+
+            if (pnode->ptype == PT_Quote) {
+                if (t->type == TT_GT) {
+                    // removes TT_GT
+                    t = dropParagraphToken(pnode, prevToken, t);
+
+                    // removes the TT_WHITESPACE that directly follows the
+                    // '>', if any; the token list may also end right here.
+                    if (t != NULL && t->type == TT_WHITESPACE)
+                        t = dropParagraphToken(pnode, prevToken, t);
+
+                    continue;
+                }
+                // A quote line that does not start with '>' ends the quote.
+                goto paragraphEnd;
+            }
+
+            switch (t->type)
+            {
+                case TT_NEWLINE:
+                case TT_EOF:
+                case TT_TRIPLEBACKTICK:
+                case TT_GT:
+                    goto paragraphEnd;
+                    break;
+                default:
+                    break;
+            }
+        } // TT_NEWLINE check
+
+        if (prevToken != NULL)
+            prevToken->next = t;
+        prevToken = t;
+        t = t->next;
+    }
+
+paragraphEnd:
+    *startToken = t;        // on double newlines, this is the second newline.
+
+    // terminate the pnode->content list; with no kept token the paragraph
+    // is empty and must not alias the remaining token stream.
+    if (prevToken != NULL)
+        prevToken->next = NULL;
+    else
+        pnode->content = NULL;
+
+    // merge runs of whitespace tokens and remove trailing whitespace
+    prevToken = NULL;
+    t = pnode->content;
+    while(t != NULL)
+    {
+        if (t->type == TT_WHITESPACE)
+        {
+            if(t->next == NULL)
+            {
+                // trailing whitespace: unlink it and release it.
+                dropParagraphToken(pnode, prevToken, t);
+                break;
+            }
+
+            if (t->next->type == TT_WHITESPACE)
+            {
+                // concatenate the two.
+                int len = t->length + t->next->length;
+                char* merged = malloc((size_t)len + 1);
+                if (merged != NULL)
+                {
+                    merged[0] = '\0';
+                    strncat(merged, t->literal, t->length);
+                    strncat(merged, t->next->literal, t->next->length);
+
+                    t = dropParagraphToken(pnode, prevToken, t);
+                    free(t->literal);
+                    t->literal = merged;
+                    t->length = len;
+
+                    // Look at the merged token again: it may be followed by
+                    // more whitespace or may now be the trailing token.
+                    continue;
+                }
+                // Allocation failed: leave both tokens as they are.
+            }
+        }
+        prevToken = t;
+        t = t->next;
+    }
+
+    return (Node*)pnode;
+}
+
+
 char*
 NodeTypeString(NodeType t)
 {
@ -230,3 +361,17 @@ NodeTypeString(NodeType t)
    }
 }

+// Returns the enumerator name of `t` as a static string, or "UNKNOWN" when
+// `t` is not one of the ParagraphType values. The result must not be freed.
+char*
+ParagraphTypeString(ParagraphType t)
+{
+    char* label = "UNKNOWN";
+
+    if (t == PT_Standard)
+        label = "PT_Standard";
+    else if (t == PT_Quote)
+        label = "PT_Quote";
+    else if (t == PT_Code)
+        label = "PT_Code";
+
+    return label;
+}
--- a/node.h
+++ b/node.h
@ -55,14 +55,21 @@ typedef struct {
    char* error;
 } ErrorNode;

-/*
+// Flavour of a ParagraphNode, stored in ParagraphNode.ptype.
+typedef enum {
+    PT_Standard,    // plain paragraph; the default chosen by parseParagraph
+    PT_Quote,       // block quote; chosen when the paragraph starts with TT_GT
+    PT_Code,        // NOTE(review): not assigned by the parser code in this change -- confirm intended use
+} ParagraphType;
+
+// Node for a paragraph or block quote. It begins with the same `type` and
+// `next` members as the other node structs and is handed around as a Node*.
 typedef struct {
    NodeType type;
    struct Node* next;
+    ParagraphType ptype;      // plain paragraph vs. block quote
+    struct Token* content;    // head of the token list making up the text
 } ParagraphNode;
-*/

 Node* ParseNodes(Token* firstToken);
 char* NodeTypeString(NodeType t);
+char* ParagraphTypeString(ParagraphType t);

 #endif
--- a/sample.md
+++ b/sample.md
@ -10,11 +10,21 @@ _underlined text_
 Nostra sem bibendum ridiculus aenean condimentum sed eleifend et odio egestas
 pellentesque. *Sit fusce.* At ligula dolor parturient sodales auctor. Egestas.

-Dictum pharetra nulla _aliquet tincidunt_ parturient netus gravida rutrum
-rhoncus. Donec dis mollis ornare `bibendum sollicitudin` velit lectus inceptos.
+this has some `inline 
+ code` in it.

+> Block Quote thing.
+> Dictum pharetra nulla _aliquet tincidunt_ parturient netus gravida rutrum
+>
+> rhoncus. Donec dis mollis ornare `bibendum sollicitudin` velit lectus inceptos.
+
+		tabbed
+
+``` other code
+```
 ```
 Laoreet arcu eget cubilia auctor vitae cursus lacus volutpat dui.
+one     two
 ```

 ### Header 3
--- a/token.c
+++ b/token.c
@ -24,6 +24,14 @@ TokenString(Token* t)
    return stringBuff;
 }

+// Releases a single token: its literal (where applicable) and the token
+// itself. Tokens linked through t->next are not touched. Passing NULL is a
+// no-op, mirroring free(NULL).
+void
+FreeToken(Token* t)
+{
+    if (t == NULL)
+        return;
+
+    // The literal of a TT_TRIPLEBACKTICK token is deliberately not freed
+    // here. NOTE(review): presumably it is not individually heap-allocated
+    // -- confirm against the lexer.
+    if (t->type != TT_TRIPLEBACKTICK)
+        free(t->literal);
+    free(t);
+}
+
+
 char*
 TokenTypeString(TokenType tt)
 {
@ -54,6 +62,8 @@ TokenTypeString(TokenType tt)
        return "TT_WORD";
    case TT_NUMBER:
        return "TT_NUMBER";
+    case TT_GT:
+        return "TT_GT";
    }

    return "\0";
--- a/token.h
+++ b/token.h
@ -16,6 +16,7 @@ typedef enum {
    TT_NEWLINE,
    TT_WORD,
    TT_NUMBER,
+    TT_GT, // greater than; used for block quotes
 } TokenType;

 typedef struct Token {
@ -29,5 +30,6 @@ typedef struct Token {

 char* TokenString(Token* t);
 char* TokenTypeString(TokenType tt);
+void FreeToken(Token* t);

 #endif