From 5e1d6ae207241def1a53495bcd635a2d8769db12 Mon Sep 17 00:00:00 2001
From: Zorchenhimer
Date: Wed, 14 Jul 2021 11:15:46 -0400
Subject: [PATCH] Initial commit

---
Changes in this revision:
- The #include lines had lost their targets; the standard headers each
  file needs are named again, and the mangled parseHeader(&current)
  call in node.c is repaired.
- NewLexer(): zero the cursor fields before the first readChar(), open
  in binary mode, check ftell() and the allocations, close the file on
  every path.
- Token list: new nodes get next = NULL; writeTokenFile() now includes
  the final TT_EOF token and no longer formats a NULL token.
- TokenString() frees the printableOnly() buffer; printableOnly()
  escapes bytes >= 0x80 and DEL correctly.
- parseHeader(): bounded append instead of strncat(); a header with no
  text no longer swallows the following line.
- NodeList.next points at a NodeList; lexer.h declares FreeLexer() and
  FreeToken() and drops ReadChar(), which was never defined.
- The blob ids on the "index" lines were not recomputed.

 .gitignore |   3 +
 Makefile   |  23 ++++
 lexer.c    | 321 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 lexer.h    |  51 +++++++++
 main.c     | 158 ++++++++++++++++++++++++++
 main.h     |   7 ++
 node.c     | 120 ++++++++++++++++++++
 node.h     |  43 +++++++
 readme.md  |  28 +++++
 sample.md  |  46 ++++++++
 token.c    | 131 ++++++++++++++++++++++
 token.h    |  38 +++++++
 12 files changed, 969 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 lexer.c
 create mode 100644 lexer.h
 create mode 100644 main.c
 create mode 100644 main.h
 create mode 100644 node.c
 create mode 100644 node.h
 create mode 100644 readme.md
 create mode 100644 sample.md
 create mode 100644 token.c
 create mode 100644 token.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8b0c2b6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.txt
+readme
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..45b72f7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+.PHONY: all run clean
+
+CC=gcc
+CFLAGS=-Wall -Wpedantic -Werror -std=c99
+
+OBJ=main.o lexer.o token.o node.o
+
+all: readme
+
+run: readme
+	./readme
+
+readme: ${OBJ}
+	${CC} ${CFLAGS} -o $@ $^
+
+#token.o: token.h token.c
+#	${CC} ${CFLAGS} -o $@ $<
+
+.c.o:
+	${CC} ${CFLAGS} -c -o $@ $<
+
+clean:
+	-rm *.o readme
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..e33d038
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,321 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lexer.h"
+
+static void readChar(Lexer* l);
+static char* readIdentifier(Lexer* l);
+static char* readNumber(Lexer* l);
+static char* copySpan(Lexer* l, int start);
+static int isLetter(char c);
+static int isDigit(char c);
+
+static Token* newToken(Lexer* l, TokenType tt);
+static Token* newIdentToken(Lexer* l, char* literal, TokenType tt);
+
+/*
+ * Reads the whole of `filename` into memory and returns a lexer positioned
+ * on its first character.  Returns NULL, after printing a message, when the
+ * file cannot be opened, sized or read, or when memory runs out.  Release
+ * the result with FreeLexer().
+ */
+Lexer*
+NewLexer(char* filename)
+{
+    // Binary mode keeps the size reported by ftell() equal to what fread()
+    // returns; NextToken() skips any '\r' characters this lets through.
+    FILE* fp = fopen(filename, "rb");
+    if (fp == NULL)
+    {
+        printf("Can't open the file for some reason\n");
+        return NULL;
+    }
+
+    long fileSize = -1;
+    if (fseek(fp, 0, SEEK_END) == 0)
+    {
+        fileSize = ftell(fp);
+    }
+    if (fileSize < 0 || fseek(fp, 0, SEEK_SET) != 0)
+    {
+        printf("Can't determine the size of the file\n");
+        fclose(fp);
+        return NULL;
+    }
+    printf("fileSize: %d\n", (int)fileSize);
+
+    // calloc() zeroes position, readPosition and column, which readChar()
+    // reads before anything else has assigned them.
+    Lexer* state = calloc(1, sizeof(Lexer));
+    char* raw = malloc((size_t)fileSize + 1);
+    if (state == NULL || raw == NULL)
+    {
+        printf("Out of memory\n");
+        free(raw);
+        free(state);
+        fclose(fp);
+        return NULL;
+    }
+
+    size_t read = fread(raw, sizeof(char), (size_t)fileSize, fp);
+    fclose(fp);
+    if (read != (size_t)fileSize)
+    {
+        printf("something borked. only read %d bytes of %d\n", (int)read, (int)fileSize);
+
+        free(raw);
+        free(state);
+
+        return NULL;
+    }
+
+    raw[fileSize] = '\0';
+    state->rawFile = raw;
+    state->rawLen = (int)fileSize;
+    state->line = 1;
+
+    readChar(state);
+    return state;
+}
+
+/* Releases a lexer returned by NewLexer().  NULL is ignored. */
+void
+FreeLexer(Lexer* l)
+{
+    if (l == NULL)
+    {
+        return;
+    }
+
+    free(l->rawFile);
+    free(l);
+}
+
+/*
+ * Returns the token under the cursor and advances past it.  The token is
+ * heap-allocated; release it with FreeToken().  Returns NULL only when
+ * memory runs out.
+ */
+Token*
+NextToken(Lexer* l)
+{
+    Token* tok;
+    switch (l->ch) {
+        case '#':
+            tok = newToken(l, TT_HASH);
+            break;
+        case '*':
+            tok = newToken(l, TT_ASTERISK);
+            break;
+        case '_':
+            tok = newToken(l, TT_UNDERSCORE);
+            break;
+        case '-':
+            tok = newToken(l, TT_DASH);
+            break;
+        case '.':
+            tok = newToken(l, TT_PERIOD);
+            break;
+        case '`':
+            tok = newToken(l, TT_BACKTICK);
+            break;
+        case '\0':
+            tok = newToken(l, TT_EOF);
+            break;
+        case '\n':
+            tok = newToken(l, TT_NEWLINE);
+            l->line++;
+            l->column = 0;
+            break;
+        case ' ':
+        case '\t':
+            tok = newToken(l, TT_WHITESPACE);
+            break;
+        case '\r':
+            readChar(l);
+            return NextToken(l); // lets GOOOOO
+        default:
+            if (isLetter(l->ch))
+            {
+                int start = l->column;
+                char* literal = readIdentifier(l);
+                tok = newIdentToken(l, literal, TT_WORD);
+                if (tok != NULL)
+                {
+                    tok->column = start;
+                }
+                return tok;
+            }
+            else if (isDigit(l->ch))
+            {
+                int start = l->column;
+                char* literal = readNumber(l);
+                tok = newIdentToken(l, literal, TT_NUMBER);
+                if (tok != NULL)
+                {
+                    tok->column = start;
+                }
+                return tok;
+            }
+            else
+            {
+                tok = newToken(l, TT_ILLEGAL);
+            }
+            //printf("Invalid token: %X\n", l->ch);
+            //return NULL;
+    }
+
+    readChar(l);
+    return tok;
+}
+
+/* Consumes a run of digits and returns it as a new string (NULL on OOM). */
+static
+char*
+readNumber(Lexer* l)
+{
+    int start = l->position;
+    while (isDigit(l->ch))
+    {
+        readChar(l);
+    }
+
+    return copySpan(l, start);
+}
+
+/* Consumes a run of letters/underscores and returns it as a new string. */
+static
+char*
+readIdentifier(Lexer* l)
+{
+    int start = l->position;
+    while (isLetter(l->ch))
+    {
+        readChar(l);
+    }
+
+    return copySpan(l, start);
+}
+
+/*
+ * Copies the input between index `start` and the current position into a
+ * new NUL-terminated string.  Returns NULL when memory runs out.
+ */
+static
+char*
+copySpan(Lexer* l, int start)
+{
+    int len = l->position - start;
+    char* out = malloc((size_t)len + 1);
+    if (out == NULL)
+    {
+        return NULL;
+    }
+
+    memcpy(out, &l->rawFile[start], (size_t)len);
+    out[len] = '\0';
+    return out;
+}
+
+/* Loads the next input character into l->ch, or 0 once past the end. */
+static
+void
+readChar(Lexer* l)
+{
+    l->column++;
+    if (l->readPosition >= l->rawLen)
+    {
+        l->ch = 0;
+    }
+    else
+    {
+        l->ch = l->rawFile[l->readPosition];
+    }
+
+    l->position = l->readPosition;
+    l->readPosition++;
+}
+
+/* Not implemented yet. */
+void
+Parse(Lexer* l)
+{
+}
+
+/* Nonzero for ASCII letters and '_'. */
+static
+int
+isLetter(char ch)
+{
+    return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ch == '_');
+}
+
+/* Nonzero for ASCII digits. */
+static
+int
+isDigit(char ch)
+{
+    return ('0' <= ch && ch <= '9');
+}
+
+/* Releases a token and its literal.  NULL is ignored. */
+void
+FreeToken(Token* t)
+{
+    if (t == NULL)
+    {
+        return;
+    }
+
+    free(t->literal);
+    free(t);
+}
+
+/* Builds a one-character token from the character under the cursor. */
+static
+Token*
+newToken(Lexer* l,
+         TokenType tt)
+{
+    char* literal = malloc(2);
+    if (literal == NULL)
+    {
+        return NULL;
+    }
+
+    literal[0] = l->ch;
+    literal[1] = '\0';
+    return newIdentToken(l, literal, tt);
+}
+
+/*
+ * Wraps `literal` in a token stamped with the lexer's current line and
+ * column.  Takes ownership of `literal`: on failure it is freed and NULL is
+ * returned.  A NULL `literal` (failed allocation upstream) yields NULL.
+ */
+static
+Token*
+newIdentToken(Lexer* l,
+              char* literal,
+              TokenType tt)
+{
+    if (literal == NULL)
+    {
+        return NULL;
+    }
+
+    Token* tok = malloc(sizeof(Token));
+    if (tok == NULL)
+    {
+        free(literal);
+        return NULL;
+    }
+
+    tok->type = tt;
+    tok->literal = literal;
+    tok->line = l->line;
+    tok->column = l->column;
+    tok->printBuff = NULL;
+    return tok;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..a1edc7b
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,51 @@
+
+#include "token.h"
+
+#ifndef LEXER_H
+#define LEXER_H
+
+//typedef enum NodeType {
+//    NT_Root,
+//    NT_Header1,
+//    NT_Header2,
+//    NT_Header3,
+//    NT_ListItem,
+//    NT_OrderedListItem,
+//    NT_Paragraph,
+//    NT_PlainText,
+//    NT_BoldText,
+//    NT_UnderlineText,
+//    NT_InlineCode,
+//    NT_BlockCode,
+//} NodeType;
+
+typedef struct Lexer {
+    char* rawFile; // whole input, NUL-terminated
+    int rawLen; // bytes in rawFile, not counting the NUL
+    int position; // current index
+    int readPosition; // next index
+    char ch; // character under examination
+
+    // values for current index
+    int line;
+    int column;
+
+} Lexer;
+
+//typedef struct Node {
+//    NodeType type;
+//    char RawText;
+//    int LineNumber;
+//
+//    //struct Node **ChildNodes;
+//    void** ChildNodes;
+//    int ChildCount;
+//} Node;
+
+Lexer* NewLexer(char* filename);
+void FreeLexer(Lexer* l);
+Token* NextToken(Lexer* l);
+void FreeToken(Token* t);
+void Parse(Lexer* l);
+
+#endif
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..4ba37ac
--- /dev/null
+++ b/main.c
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "main.h"
+#include "token.h"
+#include "lexer.h"
+#include "node.h"
+
+/*
+ * RawText ""
+ * LineNumber 0
+ * NodeType NT_Root
+ * ChildNodes
+ *     RawText "# Header1"
+ *     LineNumber 1
+ *     NodeType NT_Header1
+ *     ChildNodes
+ *         {"Some text."}
+ *
+ *     RawText "## Header2"
+ */
+
+/*
+ * NodeType NT_Root
+ * ChildNodes
+ *     RawText "## Header2"
+ *     ChildNodes
+ *         paragraph
+ *             ChildNodes
+ *                 {*bold text*}
+ *                 {_underlined text_}
+ *         paragraph
+ *
+ *
+ */
+//Node* ParseLine(char *buffer);
+
+void writeTokenFile(TokenList* tl);
+static void freeTokenList(TokenList* tl);
+
+/*
+ * Tokenizes sample.md, dumps the tokens to tokens.txt, runs the (partial)
+ * node parser and prints the final lexer state.  Returns 1 if the input
+ * cannot be read or memory runs out.
+ */
+int
+main(int argc, const char** argv)
+{
+    Lexer* l = NewLexer("sample.md");
+    if (l == NULL)
+    {
+        // NewLexer() has already reported what went wrong.
+        return 1;
+    }
+
+    // The list starts as one empty placeholder node; TokenListAdd() fills
+    // it with the first token and appends a new node for each later one.
+    TokenList* tl = malloc(sizeof(TokenList));
+    if (tl == NULL)
+    {
+        printf("Out of memory\n");
+        FreeLexer(l);
+        return 1;
+    }
+    tl->token = NULL;
+    tl->next = NULL;
+
+    TokenList* current = tl;
+    TokenType tt;
+    do
+    {
+        Token* t = NextToken(l);
+        TokenList* added = (t != NULL) ? TokenListAdd(current, t) : NULL;
+        if (added == NULL)
+        {
+            printf("Out of memory\n");
+            FreeToken(t);
+            freeTokenList(tl);
+            FreeLexer(l);
+            return 1;
+        }
+        tt = t->type;
+        current = added;
+    }
+    while(tt != TT_EOF);
+
+    writeTokenFile(tl);
+    NodeList* nodes = ParseNodes(tl);
+
+    printf("rawLen: %d position: %d readPosition: %d ch: %c line: %d column: %d\n",
+            l->rawLen,
+            l->position,
+            l->readPosition,
+            l->ch,
+            l->line,
+            l->column
+            );
+
+    // parseHeader() never allocates a Node yet, so the list head is the
+    // only thing ParseNodes() hands back that needs freeing.
+    free(nodes);
+    freeTokenList(tl);
+    FreeLexer(l);
+    return 0;
+}
+
+/* Frees every node of the list together with the token it holds. */
+static
+void
+freeTokenList(TokenList* tl)
+{
+    while (tl != NULL)
+    {
+        TokenList* next = tl->next;
+        FreeToken(tl->token);
+        free(tl);
+        tl = next;
+    }
+}
+
+/*
+ * Writes one line per token to tokens.txt and echoes the first token and
+ * the total count to stdout.
+ */
+void
+writeTokenFile(TokenList* tl)
+{
+    FILE* fp = fopen("tokens.txt", "w");
+    if (fp == NULL)
+    {
+        printf("unable to open tokens.txt\n");
+        return;
+    }
+
+    int count = 0;
+    // Walk to the very last node so the closing TT_EOF token is written
+    // and counted too.
+    for (TokenList* current = tl; current != NULL; current = current->next)
+    {
+        if (current->token == NULL)
+        {
+            // Only the placeholder head of an empty list has no token.
+            printf("first token null\n");
+            continue;
+        }
+
+        if (count == 0)
+        {
+            printf("%s\n", TokenString(current->token));
+        }
+        fprintf(fp, "%s\n", TokenString(current->token));
+        count++;
+    }
+    fclose(fp);
+
+    printf("Token count: %d\n", count);
+}
diff --git a/main.h b/main.h
new file mode 100644
index 0000000..6d49149
--- /dev/null
+++ b/main.h
@@ -0,0 +1,7 @@
+
+#ifndef MAIN_H
+#define MAIN_H
+
+#define MAXBUFFER 1024
+
+#endif
diff --git a/node.c b/node.c
new file mode 100644
index 0000000..5dd5537
--- /dev/null
+++ b/node.c
@@ -0,0 +1,120 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "node.h"
+
+#define STRING_BUFF_SIZE 1024
+
+static char stringBuff[STRING_BUFF_SIZE];
+
+static Node* parseHeader(TokenList** list);
+
+/*
+ * Walks the token list looking for headers.  Returns a newly allocated
+ * list head (the caller frees it), or NULL when memory runs out.  Only
+ * headers are recognised so far, and parseHeader() does not build a Node
+ * yet, so the returned head's node is always NULL.
+ */
+NodeList*
+ParseNodes(TokenList* list)
+{
+    NodeList* nl = malloc(sizeof(NodeList));
+    if (nl == NULL)
+    {
+        return NULL;
+    }
+    NodeList* currentNode = nl;
+
+    currentNode->next = NULL;
+    currentNode->node = NULL;
+
+    for (TokenList* current = list; current != NULL; current = current->next)
+    {
+        if (current->token == NULL)
+        {
+            // Placeholder head of an empty token list.
+            continue;
+        }
+
+        switch (current->token->type) {
+            case TT_NEWLINE:
+                break;
+            case TT_HASH:
+                // start of header
+                currentNode->node = parseHeader(&current);
+                break;
+            default:
+                break;
+        }
+    }
+
+    return nl;
+}
+
+/*
+ * Parses a header starting at the TT_HASH token *list: counts the hashes,
+ * skips the whitespace after them and gathers the literals up to the end
+ * of the line into stringBuff.  On success *list is advanced to the last
+ * token of the header; on failure it is left alone.  For now this only
+ * prints what it found and always returns NULL.
+ */
+static
+Node*
+parseHeader(TokenList** list)
+{
+    TokenList* l = *list;
+    // Count the number of TT_HASH tokens
+    int count = 1;
+    while (l->next != NULL && l->next->token->type == TT_HASH)
+    {
+        count++;
+        l = l->next;
+    }
+
+    if (l->next == NULL)
+    {
+        printf("Header missing text\n");
+        return NULL;
+    }
+    l = l->next;
+
+    // Trim leading whitespace
+    while (l->next != NULL && l->token->type == TT_WHITESPACE)
+    {
+        l = l->next;
+    }
+
+    if (l->next == NULL || l->token->type == TT_NEWLINE)
+    {
+        printf("Header missing text\n");
+        return NULL;
+    }
+
+    // Append at a tracked offset rather than with strncat(): bounding
+    // strncat() by the source length never protects the destination.
+    size_t used = 0;
+    stringBuff[0] = '\0';
+    while (1)
+    {
+        size_t litSize = strlen(l->token->literal);
+        if (used + litSize + 1 > STRING_BUFF_SIZE)
+        {
+            printf("Buffer not big enough!\n");
+            return NULL;
+        }
+        memcpy(&stringBuff[used], l->token->literal, litSize + 1);
+        used += litSize;
+
+        if (l->next == NULL || l->next->token->type == TT_NEWLINE)
+        {
+            break;
+        }
+
+        l = l->next;
+    }
+
+    *list = l;
+    printf("header hash count: %d\ntext: '%s'\n", count, stringBuff);
+    return NULL;
+}
diff --git a/node.h b/node.h
new file mode 100644
index 0000000..12a6210
--- /dev/null
+++ b/node.h
@@ -0,0 +1,43 @@
+#include <stdlib.h>
+
+#include "token.h"
+
+#ifndef NODE_H
+#define NODE_H
+
+typedef enum {
+    NT_Header1,
+    NT_Header2,
+    NT_Header3,
+    NT_Header4,
+    NT_Paragraph,
+    NT_UnorderedList,
+    NT_OrderedList,
+    NT_InlineCode,
+    NT_BlockCode,
+    NT_BlockQuote,
+    NT_Bold,
+    NT_Underline,
+} NodeType;
+
+struct NodeList;
+
+typedef struct Node {
+    NodeType type;
+    struct NodeList* children;
+} Node;
+
+typedef struct NodeList {
+    struct Node* node;
+    struct NodeList* next; // next entry in the list, not a Node
+} NodeList;
+
+typedef struct {
+    NodeType type;
+    struct Node* next;
+    char* rawText;
+} HeaderNode;
+
+NodeList* ParseNodes(TokenList* list);
+
+#endif
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..9e245ad
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,28 @@
+# Terminal Markdown Viewer
+
+## Goals
+
+To render markdown in the terminal and use colors, font weights, etc, to
+display the document.
+
+## Implemented syntax
+
+- Headers
+- Unordered lists
+- Ordered lists
+- Inline code
+- Block code
+- Block quote?
+- Bold
+- Underline
+
+### maybes
+
+- Task list
+- Explicit colors
+- Inter-document links
+
+### nopes
+
+- Tables
+- Syntax highlighting code
diff --git a/sample.md b/sample.md
new file mode 100644
index 0000000..a0e8947
--- /dev/null
+++ b/sample.md
@@ -0,0 +1,46 @@
+# Header 1
+
+Some text.
+
+## Header 2
+
+*bold text*
+_underlined text_
+
+Nostra sem bibendum ridiculus aenean condimentum sed eleifend et odio egestas
+pellentesque. *Sit fusce.* At ligula dolor parturient sodales auctor. Egestas.
+
+Dictum pharetra nulla _aliquet tincidunt_ parturient netus gravida rutrum
+rhoncus. Donec dis mollis ornare `bibendum sollicitudin` velit lectus inceptos.
+
+```
+Laoreet arcu eget cubilia auctor vitae cursus lacus volutpat dui.
+```
+
+### Header 3
+
+- List item one.
+- List item two.
+- List item three.
+- List item four.
+
+1. Ordered list one
+1. Ordered list two
+1. Ordered list three
+1. Ordered list four
+
+- Toplevel one
+    - Second level one
+    - Second level two
+- Toplevel two
+    - Second level one
+        - Third level
+    - Second level two
+
+1. Ordered toplevel one
+    1. Ordered second level one
+    1. Ordered second level two
+1. Ordered toplevel two
+    1. Ordered second level one
+        1. Ordered third level
+    1. Ordered second level two
diff --git a/token.c b/token.c
new file mode 100644
index 0000000..b559dce
--- /dev/null
+++ b/token.c
@@ -0,0 +1,131 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "token.h"
+
+#define STRING_BUFF_SIZE 1024
+
+static char stringBuff[STRING_BUFF_SIZE];
+
+char* printableOnly(char* input);
+
+/*
+ * Stores `next` in the list.  An empty placeholder node (token == NULL) is
+ * filled in place; otherwise a new node is appended after `current`.
+ * Returns the node now holding `next`, or NULL when memory runs out.
+ */
+TokenList*
+TokenListAdd(TokenList* current, Token* next)
+{
+    if (current->token == NULL)
+    {
+        printf("current->token == null\n");
+        current->token = next;
+        return current;
+    }
+
+    TokenList* nl = malloc(sizeof(TokenList));
+    if (nl == NULL)
+    {
+        return NULL;
+    }
+
+    nl->token = next;
+    nl->next = NULL; // list walkers stop on NULL, so it must be set here
+    current->next = nl;
+    return nl;
+}
+
+/*
+ * Formats a token for display.  The result lives in a static buffer that
+ * the next call overwrites, so copy it if it has to outlive that.
+ */
+char*
+TokenString(Token* t)
+{
+    char* literal = printableOnly(t->literal);
+    snprintf(stringBuff, sizeof(stringBuff), "[%d:%d] Type: %s Literal: '%s'",
+            t->line,
+            t->column,
+            TokenTypeString(t->type),
+            (literal != NULL) ? literal : ""
+            );
+
+    // printableOnly() allocates; the text has been copied into stringBuff.
+    free(literal);
+    return stringBuff;
+}
+
+/* Returns the enumerator name of `tt`, or an empty string if unknown. */
+char*
+TokenTypeString(TokenType tt)
+{
+    switch (tt) {
+        case TT_ILLEGAL:
+            return "TT_ILLEGAL";
+        case TT_EOF:
+            return "TT_EOF";
+        case TT_HASH:
+            return "TT_HASH";
+        case TT_ASTERISK:
+            return "TT_ASTERISK";
+        case TT_UNDERSCORE:
+            return "TT_UNDERSCORE";
+        case TT_DASH:
+            return "TT_DASH";
+        case TT_PERIOD:
+            return "TT_PERIOD";
+        case TT_BACKTICK:
+            return "TT_BACKTICK";
+        case TT_WHITESPACE:
+            return "TT_WHITESPACE";
+        case TT_NEWLINE:
+            return "TT_NEWLINE";
+        case TT_WORD:
+            return "TT_WORD";
+        case TT_NUMBER:
+            return "TT_NUMBER";
+    }
+
+    return "\0";
+}
+
+/*
+ * Returns a newly allocated copy of `input` in which every byte outside
+ * printable ASCII is written as \xHH.  The caller frees the result.
+ * Returns NULL when memory runs out.
+ */
+char*
+printableOnly(char* input)
+{
+    size_t len = strlen(input);
+    // Worst case every byte expands to the four characters "\xHH".
+    char* str = malloc((len * 4) + 1);
+    if (str == NULL)
+    {
+        return NULL;
+    }
+
+    size_t j = 0;
+    for (size_t i = 0; i < len; i++)
+    {
+        // Go through unsigned char: a plain (signed) char >= 0x80 would be
+        // sign-extended and print as FFFFFFxx.
+        unsigned char c = (unsigned char)input[i];
+        if (c < 0x20 || c >= 0x7F)
+        {
+            // hex notation
+            snprintf(&str[j], 5, "\\x%02X", c);
+            j += 4;
+        }
+        else
+        {
+            str[j] = (char)c;
+            j++;
+        }
+    }
+
+    str[j] = '\0';
+    return str;
+}
diff --git a/token.h b/token.h
new file mode 100644
index 0000000..fef4d0e
--- /dev/null
+++ b/token.h
@@ -0,0 +1,38 @@
+
+#ifndef TOKEN_H
+#define TOKEN_H
+
+typedef enum {
+    TT_ILLEGAL,
+    TT_EOF,
+    TT_HASH, // #
+    TT_ASTERISK,
+    TT_UNDERSCORE,
+    TT_DASH,
+    TT_PERIOD,
+    TT_BACKTICK,
+    TT_WHITESPACE,
+    TT_NEWLINE,
+    TT_WORD,
+    TT_NUMBER,
+} TokenType;
+
+typedef struct Token {
+    TokenType type;
+    char* literal;
+    int line;
+    int column;
+    char* printBuff;
+} Token;
+
+typedef struct TokenList {
+    Token* token;
+    struct TokenList* next;
+} TokenList;
+
+TokenList* TokenListAdd(TokenList* current, Token* next);
+
+char* TokenString(Token* t);
+char* TokenTypeString(TokenType tt);
+
+#endif