From c17eaf1ec452cfbc9b448ef50cee7d852dc4811f Mon Sep 17 00:00:00 2001
From: Zorchenhimer <zorchenhimer@gmail.com>
Date: Sun, 22 Oct 2023 22:29:38 -0400
Subject: [PATCH] Implement the paragraph node

This node handles both plain paragraphs and block quotes. A single
newline is turned into a space; a double newline terminates the block.
Consecutive whitespace tokens in the content are merged into one token
whose literal keeps every space, so the number of spaces is preserved.
---
 lexer.c   |   7 ---
 main.c    |  24 ++++++++-
 node.c    | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 node.h    |  11 +++-
 sample.md |  16 ++++--
 token.c   |  10 ++++
 token.h   |   2 +
 7 files changed, 203 insertions(+), 14 deletions(-)

diff --git a/lexer.c b/lexer.c
index 2c303cb..458764b 100644
--- a/lexer.c
+++ b/lexer.c
@@ -240,13 +240,6 @@ isDigit(char ch)
     return ('0' <= ch && ch <= '9');
 }
 
-void
-FreeToken(Token* t)
-{
-    free(t->literal);
-    free(t);
-}
-
 static
 Token*
 newTickToken(Lexer* l)
diff --git a/main.c b/main.c
index 9909f60..fc71174 100644
--- a/main.c
+++ b/main.c
@@ -85,11 +85,33 @@ main(int argc, const char** argv)
                 }
                 break;
 
+            case NT_Paragraph:
+                {
+                    ParagraphNode* pnode = (ParagraphNode*)node;
+                    printf("{ParagraphNode ptype:%s}\n", ParagraphTypeString(pnode->ptype));
+                    Token* content = pnode->content;
+                    while(content != NULL)
+                    {
+                        if (content->type == TT_WHITESPACE)
+                        {
+                            printf(" ");
+                        }
+                        else
+                        {
+                            printf("%s", content->literal);
+                        }
+
+                        content = content->next;
+                    }
+                    printf("\n");
+
+                }
+                break;
+
             default:
                 printf("%s\n", NodeTypeString(node->type));
         }
 
-
         node = node->next;
     }
 
diff --git a/node.c b/node.c
index f64de6c..d627c36 100644
--- a/node.c
+++ b/node.c
@@ -9,6 +9,7 @@ static char stringBuff[STRING_BUFF_SIZE];
 
 Node* parseHeader(Token** firstToken);
 Node* parseCodeBlock(Token** firstToken);
+Node* parseParagraph(Token** startToken);
 
 Node*
 ParseNodes(Token* firstToken)
@@ -23,7 +24,9 @@ ParseNodes(Token* firstToken)
 
         switch (currentToken->type) {
             case TT_NEWLINE:
+            case TT_WHITESPACE:
                 break;
+
             case TT_HASH:
                 // start of header
                 currentNode = parseHeader(&currentToken);
@@ -37,11 +40,11 @@ ParseNodes(Token* firstToken)
                 return firstNode;
 
             default:    // paragraph start?
+                currentNode = parseParagraph(&currentToken);
                 break;
         }
 
         if (currentToken->next == NULL) {
-            printf("currentToken->next == NULL\n");
             break;
         }
 
@@ -193,6 +196,134 @@ parseCodeBlock(Token** startToken)
     return (Node*)ret;
 }
 
+/* Unlink and free the token *link points at; returns the token after it. */
+static Token*
+dropToken(Token** link)
+{
+    Token* gone = *link;
+    *link = gone->next;
+    FreeToken(gone);
+    return *link;
+}
+
+/*
+ * Parse a paragraph or block quote starting at *startToken; its tokens become
+ * the NULL-terminated list pnode->content of the returned ParagraphNode.
+ *
+ * A newline followed by whitespace is dropped; any other newline that is not
+ * the last token becomes a whitespace token.  A standard paragraph ends when
+ * a newline is followed by TT_NEWLINE, TT_EOF, TT_TRIPLEBACKTICK or TT_GT; a
+ * quote ends at the first line that does not open with TT_GT.  Adjacent
+ * whitespace tokens are then merged and trailing whitespace is removed.
+ *
+ * On return *startToken is the first token not consumed (on double newlines,
+ * the second newline), or the last token seen if the input ran out, so the
+ * caller may still read its ->next.
+ */
+Node*
+parseParagraph(Token** startToken)
+{
+    ParagraphNode* pnode = malloc(sizeof(ParagraphNode));
+    pnode->next = NULL;
+    pnode->type = NT_Paragraph;
+    pnode->ptype = PT_Standard;
+    Token* t = *startToken;
+
+    if (t->type == TT_GT) {
+        pnode->ptype = PT_Quote;
+        // Consume TT_GT, but keep it if it is last so *startToken stays valid.
+        if (t->next != NULL) {
+            Token* consumed = t;
+            t = t->next;
+            FreeToken(consumed);
+        }
+    }
+
+    pnode->content = t;
+    Token** link = &pnode->content; // the pointer that currently leads to t
+    Token* last = NULL;             // last token kept in the content list
+
+    while(t != NULL)
+    {
+        // Look for the end of the paragraph.
+        if (t->type == TT_NEWLINE && t->next != NULL)
+        {
+            if (t->next->type == TT_WHITESPACE) {
+                // Consume the newline if the next one is a space.
+                t = dropToken(link);
+            } else {
+                // Convert this token into a whitespace character
+                t->literal[0] = ' ';
+                t->type = TT_WHITESPACE;
+                last = t;
+                link = &t->next;
+                t = t->next;
+            }
+
+            if (pnode->ptype == PT_Quote) {
+                if (t->type != TT_GT)
+                    goto paragraphEnd;
+
+                // Remove TT_GT and the whitespace token right after it;
+                // removing TT_GT may already reach the end of the list.
+                t = dropToken(link);
+                if (t != NULL && t->type == TT_WHITESPACE)
+                    t = dropToken(link);
+                continue;
+            }
+
+            switch (t->type)
+            {
+                case TT_NEWLINE:
+                case TT_EOF:
+                case TT_TRIPLEBACKTICK:
+                case TT_GT:
+                    goto paragraphEnd;
+                default:
+                    break;
+            }
+        } // TT_NEWLINE check
+
+        last = t;
+        link = &t->next;
+        t = t->next;
+    }
+
+paragraphEnd:
+    *startToken = (t != NULL) ? t : last;   // not NULL when the input ran out
+    *link = NULL;                           // terminate the pnode->content list
+
+    // Merge adjacent whitespace tokens and remove trailing whitespace.
+    link = &pnode->content;
+    while((t = *link) != NULL)
+    {
+        if (t->type == TT_WHITESPACE && t->next == NULL) {
+            *link = NULL;
+            if (t != *startToken)   // may still be the caller's cursor
+                FreeToken(t);
+            break;
+        }
+        if (t->type == TT_WHITESPACE && t->next->type == TT_WHITESPACE) {
+            // Concatenate the two literals into the second token.
+            int len = t->length + t->next->length;
+            char* newws = malloc(sizeof(char)*len+1);
+            if (newws != NULL) {
+                newws[0] = '\0';
+                strncat(newws, t->literal, t->length);
+                strncat(newws, t->next->literal, t->next->length);
+                t = dropToken(link);
+                t->length = len;
+                free(t->literal);
+                t->literal = newws;
+                continue;   // re-check: it may now be last or start a longer run
+            }
+        }
+        link = &t->next;
+    }
+
+    return (Node*)pnode;
+}
+
 char*
 NodeTypeString(NodeType t)
 {
@@ -230,3 +361,17 @@ NodeTypeString(NodeType t)
     }
 }
 
+// Name of a ParagraphType enumerator, or "UNKNOWN" for any other value.
+char*
+ParagraphTypeString(ParagraphType t)
+{
+    char* name = "UNKNOWN";
+
+    if (t == PT_Standard)
+        name = "PT_Standard";
+    else if (t == PT_Quote)
+        name = "PT_Quote";
+    else if (t == PT_Code)
+        name = "PT_Code";
+    return name;
+}
diff --git a/node.h b/node.h
index 282c7ac..ec62cab 100644
--- a/node.h
+++ b/node.h
@@ -55,14 +55,21 @@ typedef struct {
     char* error;
 } ErrorNode;
 
-/*
+typedef enum {      // kind of block stored in a ParagraphNode
+    PT_Standard,    // plain paragraph; the default set by parseParagraph
+    PT_Quote,       // block quote: the paragraph started with TT_GT
+    PT_Code,        // NOTE(review): never assigned by parseParagraph -- confirm intended use
+} ParagraphType;
+
 typedef struct {
     NodeType type;
     struct Node* next;
+    ParagraphType ptype;    // PT_Standard, or PT_Quote when opened by TT_GT
+    struct Token* content;  // first token of the paragraph's NULL-terminated token list
 } ParagraphNode;
-*/
 
 Node* ParseNodes(Token* firstToken);
 char* NodeTypeString(NodeType t);
+char* ParagraphTypeString(ParagraphType t);
 
 #endif
diff --git a/sample.md b/sample.md
index a0e8947..032e62b 100644
--- a/sample.md
+++ b/sample.md
@@ -10,11 +10,21 @@ _underlined text_
 Nostra sem bibendum ridiculus aenean condimentum sed eleifend et odio egestas
 pellentesque. *Sit fusce.* At ligula dolor parturient sodales auctor. Egestas.
 
-Dictum pharetra nulla _aliquet tincidunt_ parturient netus gravida rutrum
-rhoncus. Donec dis mollis ornare `bibendum sollicitudin` velit lectus inceptos.
+this has some `inline 
+ code` in it.
 
+> Block Quote thing.
+> Dictum pharetra nulla _aliquet tincidunt_ parturient netus gravida rutrum
+>
+> rhoncus. Donec dis mollis ornare `bibendum sollicitudin` velit lectus inceptos.
+
+		tabbed
+
+``` other code
+```
 ```
 Laoreet arcu eget cubilia auctor vitae cursus lacus volutpat dui.
+one     two
 ```
 
 ### Header 3
@@ -42,5 +52,5 @@ Laoreet arcu eget cubilia auctor vitae cursus lacus volutpat dui.
     1. Ordered second level two
 1. Ordered toplevel two
     1. Ordered second level one
-        1. Ordered third level
+        1. Ordered third level    
     1. Ordered second level two
diff --git a/token.c b/token.c
index 4d89542..014c3bf 100644
--- a/token.c
+++ b/token.c
@@ -24,6 +24,14 @@ TokenString(Token* t)
     return stringBuff;
 }
 
+// Free a token and its literal (kept for TT_TRIPLEBACKTICK); NULL is a no-op.
+void
+FreeToken(Token* t)
+{
+    if (t != NULL && t->type != TT_TRIPLEBACKTICK) free(t->literal);
+    free(t);
+}
+
 char*
 TokenTypeString(TokenType tt)
 {
@@ -54,6 +62,8 @@ TokenTypeString(TokenType tt)
         return "TT_WORD";
     case TT_NUMBER:
         return "TT_NUMBER";
+    case TT_GT:
+        return "TT_GT";
     }
 
     return "\0";
diff --git a/token.h b/token.h
index bb937ea..cee0e44 100644
--- a/token.h
+++ b/token.h
@@ -16,6 +16,7 @@ typedef enum {
     TT_NEWLINE,
     TT_WORD,
     TT_NUMBER,
+    TT_GT, // greater than; used for block quotes
 } TokenType;
 
 typedef struct Token {
@@ -29,5 +30,6 @@ typedef struct Token {
 
 char* TokenString(Token* t);
 char* TokenTypeString(TokenType tt);
+void FreeToken(Token* t);
 
 #endif