// Package dasmlbl contains a small channel-based lexer built from state
// functions. It recognises identifiers, numbers, double-quoted strings,
// braces, semicolons and '#' line comments.
package dasmlbl

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

var (
	// eof is a ready-made end-of-input item.
	eof LexItem = LexItem{lex_EOF, "EOF"}

	// unicode_EOF is the sentinel rune returned by (*Lexer).next once the
	// input is exhausted. Because it is a real code point (U+FFFC), input that
	// contains it literally is treated as end of input by the state functions.
	unicode_EOF rune = '\uFFFC'
)

// stateFn is one lexer state: it consumes some input and returns the next
// state, or nil when lexing must stop.
type stateFn func(l *Lexer) stateFn

// Lexer holds the scanning state for a single input string.
type Lexer struct {
	input string       // text being scanned
	start int          // byte offset where the pending item begins
	pos   int          // byte offset of the next rune to read
	width int          // byte width of the rune most recently returned by next
	items chan LexItem // unbuffered channel on which items are delivered
}

// NewLexer creates a lexer for input and returns it together with the channel
// on which Run delivers items.
func NewLexer(input string) (*Lexer, chan LexItem) {
	l := &Lexer{
		input: input,
		items: make(chan LexItem),
	}
	return l, l.items
}

// Run drives the state machine until a state returns nil and then closes the
// items channel. The channel is unbuffered, so Run blocks on every item until
// a receiver takes it; run it in its own goroutine and drain the channel.
func (l *Lexer) Run() {
	for state := lexStateStart; state != nil; {
		state = state(l)
	}
	close(l.items)
}

// emit sends the pending text input[start:pos] as an item of type t and
// starts a new pending item at pos.
func (l *Lexer) emit(t lexItemType) {
	l.items <- LexItem{t, l.input[l.start:l.pos]}
	l.start = l.pos
}

// next decodes and consumes the next rune. At end of input it returns
// unicode_EOF and sets width to 0, which makes a following backup a no-op.
func (l *Lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return unicode_EOF
	}
	r, size := utf8.DecodeRuneInString(l.input[l.pos:])
	if size == 0 {
		// DecodeRuneInString only reports size 0 for an empty string, which
		// the bounds check above rules out; treat it as a broken invariant.
		panic(fmt.Sprintf("zero width at %d", l.pos))
	}
	l.width = size
	l.pos += size
	return r
}

// backup un-reads the rune returned by the most recent call to next. It is
// only valid once per call to next.
func (l *Lexer) backup() {
	l.pos -= l.width
}

// ignore discards the pending text without emitting it.
func (l *Lexer) ignore() {
	l.start = l.pos
}

// accept consumes the next rune if it is one of the runes in valid and
// reports whether it did so.
func (l *Lexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes runes for as long as they are members of valid.
func (l *Lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}
	l.backup()
}

// lexItemType identifies the kind of a LexItem.
type lexItemType int

const (
	lex_EOF lexItemType = iota
	lex_Ident
	lex_OpenBracket
	lex_CloseBracket
	lex_Semicolon
	lex_String
	lex_Number
	lex_Pound
	lex_Space
	lex_Error
)

// String returns the constant's name, or "lex_UNKNOWN" for any other value.
func (lit lexItemType) String() string {
	switch lit {
	case lex_EOF:
		return "lex_EOF"
	case lex_Ident:
		return "lex_Ident"
	case lex_OpenBracket:
		return "lex_OpenBracket"
	case lex_CloseBracket:
		return "lex_CloseBracket"
	case lex_Semicolon:
		return "lex_Semicolon"
	case lex_String:
		return "lex_String"
	case lex_Number:
		return "lex_Number"
	case lex_Pound:
		return "lex_Pound"
	case lex_Space:
		return "lex_Space"
	case lex_Error:
		return "lex_Error"
	}
	return "lex_UNKNOWN"
}

// LexItem is a single token: its type and the text it covers. For lex_Error
// items val holds the error message instead of input text.
type LexItem struct {
	typ lexItemType
	val string
}

// String formats the item as `<type>: "<quoted value>"`.
func (itm LexItem) String() string {
	return fmt.Sprintf("%s: %q", itm.typ, itm.val)
}

// lexComment scans the text that follows a '#' up to, but not including, the
// end of the line and emits it as a lex_String item.
func lexComment(l *Lexer) stateFn {
	for {
		switch l.next() {
		case '\n':
			// Leave the newline for lexStateStart to skip as whitespace.
			l.backup()
			l.emit(lex_String)
			return lexStateStart
		case unicode_EOF:
			// A comment on the last line has no trailing newline. Emit its
			// text anyway and hand back to lexStateStart so that the stream
			// still ends with lex_EOF instead of stopping silently.
			l.emit(lex_String)
			return lexStateStart
		}
	}
}

// lexIdent scans a run of letters and emits it as a lex_Ident item.
func lexIdent(l *Lexer) stateFn {
	// Use the same predicate lexStateStart dispatches on. With a narrower one
	// (ASCII letters only) a non-ASCII letter would be routed here, nothing
	// would be consumed, and the lexer would emit empty identifiers forever.
	for unicode.IsLetter(l.next()) {
	}
	l.backup()
	l.emit(lex_Ident)
	return lexStateStart
}

// lexNumber scans an optional sign, an optional "0x" prefix, an optional '$'
// or '%' prefix and then a run of hexadecimal digits, and emits the whole
// text as a lex_Number item.
func lexNumber(l *Lexer) stateFn {
	l.accept("-+")
	// "0x" is a two-character prefix, not a set of alternatives, so it has to
	// be matched as a literal rather than through accept.
	if strings.HasPrefix(l.input[l.pos:], "0x") {
		l.pos += len("0x")
	}
	l.accept("$%")
	l.acceptRun("0123456789ABCDEFabcdef")
	l.emit(lex_Number)
	return lexStateStart
}

// lexString scans the body of a double-quoted string whose opening quote has
// already been consumed. The text between the quotes is emitted as lex_String.
// Reaching end of input or a newline first sends a lex_Error item and stops
// the lexer.
func lexString(l *Lexer) stateFn {
	for {
		switch l.next() {
		case unicode_EOF:
			l.items <- LexItem{lex_Error, "EOF before string end"}
			return nil
		case '\n':
			l.items <- LexItem{lex_Error, fmt.Sprintf("newline in string; pos: %d", l.pos)}
			return nil
		case '"':
			// Emit without the closing quote, then consume and drop it.
			l.backup()
			l.emit(lex_String)
			l.next()
			l.ignore()
			return lexStateStart
		}
	}
}

// lexStateStart is the top-level state. It skips whitespace, emits the
// single-character tokens itself and hands over to the specialised states for
// comments, strings, numbers and identifiers. At end of input it emits
// lex_EOF and stops the lexer.
func lexStateStart(l *Lexer) stateFn {
	for {
		r := l.next()
		switch {
		case r == unicode_EOF:
			// Pending text can only exist here if the input contains a
			// literal U+FFFC, which next also reports as the sentinel.
			if l.pos > l.start {
				l.emit(lex_Space)
			}
			l.emit(lex_EOF)
			return nil
		case unicode.IsSpace(r):
			l.ignore()
		case r == '{':
			l.emit(lex_OpenBracket)
		case r == '}':
			l.emit(lex_CloseBracket)
		case r == ';':
			l.emit(lex_Semicolon)
		case r == '#':
			l.emit(lex_Pound)
			return lexComment
		case r == '"':
			// Drop the opening quote; lexString emits only the contents.
			l.ignore()
			return lexString
		case r == '$' || r == '%' || r == '+' || r == '-' || ('0' <= r && r <= '9'):
			l.backup()
			return lexNumber
		case unicode.IsLetter(r):
			l.backup()
			return lexIdent
		default:
			// Discard runes that start no token. Leaving them pending would
			// glue them onto the front of whatever token is emitted next.
			l.ignore()
		}
	}
}

// isAlphaNumeric reports whether r is an ASCII letter or an ASCII digit.
func isAlphaNumeric(r rune) bool {
	return isAlpha(r) || ('0' <= r && r <= '9')
}

// isAlpha reports whether r is an ASCII letter.
func isAlpha(r rune) bool {
	return ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z')
}