go-dasm-mlb/lex.go

278 lines
4.3 KiB
Go

package dasmlbl
import (
"unicode"
"unicode/utf8"
"strings"
"fmt"
)
// Sentinel values shared by the lexer.
var (
	// eof is a ready-made end-of-input item.
	eof = LexItem{typ: lex_EOF, val: "EOF"}

	// unicode_EOF is the rune returned by next and peek once the input is
	// exhausted. It is the code point U+FFFC, so an input that literally
	// contains U+FFFC is indistinguishable from end of input to the state
	// functions that compare against it.
	unicode_EOF rune = '\uFFFC'
)
// stateFn is one state of the lexer's state machine: it scans part of the
// input held by l and returns the state to run next, or nil to stop.
type stateFn func(l *Lexer) stateFn
// Lexer holds the scanning state for a single input string.
type Lexer struct {
input string // full text being scanned
start int // byte offset in input where the pending, not-yet-emitted text begins
pos int // byte offset in input of the next rune to read
width int // byte width of the rune last returned by next; 0 once end of input is reached
items chan LexItem // channel on which scanned items are delivered
}
// NewLexer builds a Lexer over input and returns it together with the
// channel on which its items are delivered. The channel is unbuffered and
// nothing is scanned until Run is called.
func NewLexer(input string) (*Lexer, chan LexItem) {
	out := make(chan LexItem)
	lx := &Lexer{input: input, items: out}
	return lx, out
}
// Run drives the state machine, beginning at lexStateStart, until a state
// returns nil, and then closes the item channel. Items are sent on l.items
// as they are produced, so a receiver has to be draining that channel while
// Run executes.
func (l *Lexer) Run() {
	var state stateFn = lexStateStart
	for state != nil {
		state = state(l)
	}
	close(l.items)
}
// emit sends the pending text input[start:pos] on the item channel, tagged
// with type t, and then marks that text as consumed.
func (l *Lexer) emit(t lexItemType) {
	text := l.input[l.start:l.pos]
	l.items <- LexItem{typ: t, val: text}
	l.start = l.pos
}
// next consumes and returns the rune at the current position, recording its
// byte width so that a following backup can undo the read. At end of input
// it returns unicode_EOF and sets the width to 0, which makes backup a no-op.
//
// Invalid UTF-8 is not an error here: utf8.DecodeRuneInString reports it as
// utf8.RuneError with width 1, so scanning always advances. A width of 0 is
// only returned for an empty string, which the bounds check already rules
// out, so no further guard is needed.
func (l *Lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return unicode_EOF
	}
	r, size := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = size
	l.pos += size
	return r
}
// peek returns the rune at the current position without consuming it, or
// unicode_EOF at end of input.
//
// peek leaves the lexer state untouched. In particular it does not modify
// width, so a backup issued after next followed by peek still un-reads the
// rune that next consumed, even when peek hit end of input.
func (l *Lexer) peek() rune {
	if l.pos >= len(l.input) {
		return unicode_EOF
	}
	// The remaining input is non-empty here, so the decoded width is never 0.
	r, _ := utf8.DecodeRuneInString(l.input[l.pos:])
	return r
}
// backup moves the read position back by the recorded rune width, undoing
// the most recent next. When that width is 0 (end of input) it does nothing.
func (l *Lexer) backup() {
	l.pos = l.pos - l.width
}
// ignore discards the pending text by moving start up to the current
// position; nothing is sent on the item channel.
func (l *Lexer) ignore() {
l.start = l.pos
}
// accept consumes the next rune if it is one of the runes in valid and
// reports whether it did. valid is a set of runes, not a prefix to match.
// On a miss the rune is put back.
func (l *Lexer) accept(valid string) bool {
	if !strings.ContainsRune(valid, l.next()) {
		l.backup()
		return false
	}
	return true
}
// acceptRun consumes runes for as long as they belong to the set valid and
// leaves the position on the first rune that does not.
func (l *Lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}
	// The loop read one rune too many; put it back.
	l.backup()
}
// lexItemType identifies the kind of a LexItem.
type lexItemType int

// Item kinds, numbered consecutively from zero.
const (
	lex_EOF lexItemType = iota
	lex_Ident
	lex_OpenBracket
	lex_CloseBracket
	lex_Semicolon
	lex_String
	lex_Number
	lex_Pound
	lex_Space
	lex_Error
)

// String returns the constant's own name for every declared item kind and
// "lex_UNKNOWN" for any other value.
func (lit lexItemType) String() string {
	names := [...]string{
		lex_EOF:          "lex_EOF",
		lex_Ident:        "lex_Ident",
		lex_OpenBracket:  "lex_OpenBracket",
		lex_CloseBracket: "lex_CloseBracket",
		lex_Semicolon:    "lex_Semicolon",
		lex_String:       "lex_String",
		lex_Number:       "lex_Number",
		lex_Pound:        "lex_Pound",
		lex_Space:        "lex_Space",
		lex_Error:        "lex_Error",
	}
	if lit < 0 || int(lit) >= len(names) {
		return "lex_UNKNOWN"
	}
	return names[lit]
}
// LexItem is a single item delivered on the lexer's item channel.
type LexItem struct {
typ lexItemType // kind of item
val string // scanned text for the item, or a message for lex_Error items
}
// String renders the item as its type name, a colon, and its value as a
// quoted string.
func (itm LexItem) String() string {
	const layout = "%s: %q"
	return fmt.Sprintf(layout, itm.typ, itm.val)
}
// lexComment scans the text that follows a '#' up to, but not including, the
// end of the line and emits it as a lex_String item (possibly empty).
//
// End of input terminates the comment exactly like a newline does. Control
// then returns to lexStateStart, which consumes the newline or, at end of
// input, emits the final lex_EOF item. This way a comment on the last line
// of a file without a trailing newline is not lost.
func lexComment(l *Lexer) stateFn {
	for {
		r := l.next()
		if r == '\n' || r == unicode_EOF {
			// Leave the newline for lexStateStart. At end of input the
			// recorded width is 0, so this is a no-op.
			l.backup()
			l.emit(lex_String)
			return lexStateStart
		}
	}
}
// lexIdent scans a run of letters and emits it as a lex_Ident item.
//
// Letters are classified with unicode.IsLetter, the same test lexStateStart
// uses to enter this state. This guarantees that the rune which triggered
// the dispatch is consumed here; with a narrower letter set a non-ASCII
// letter would be handed back and forth between the two states forever.
func lexIdent(l *Lexer) stateFn {
	for unicode.IsLetter(l.next()) {
	}
	// The loop read one rune past the identifier (or hit end of input, where
	// backup is a no-op).
	l.backup()
	l.emit(lex_Ident)
	return lexStateStart
}
// lexNumber scans a numeric literal and emits it as a lex_Number item.
//
// The accepted shape is an optional sign, an optional radix prefix ("0x",
// "$" or "%"), and a run of hexadecimal digits. No validation is done beyond
// that: the digits are not checked against the radix and a bare sign or
// prefix is still emitted as a number.
func lexNumber(l *Lexer) stateFn {
	const hexDigits = "0123456789ABCDEFabcdef"

	l.accept("-+")
	// accept matches a single rune from a set, so the two-rune "0x" prefix
	// has to be taken in two steps. A leading '0' not followed by 'x' is
	// simply the first digit of the number.
	if l.accept("0") {
		l.accept("x")
	} else {
		l.accept("$%")
	}
	l.acceptRun(hexDigits)
	l.emit(lex_Number)
	return lexStateStart
}
// lexString scans the body of a double-quoted string; the opening quote has
// already been consumed and dropped by the caller. The text between the
// quotes is emitted verbatim as a lex_String item and the closing quote is
// skipped.
//
// The only escape recognised is a backslash immediately followed by a double
// quote, which does not end the string; any other backslash is an ordinary
// character. Reaching end of input or a newline before the closing quote
// sends a lex_Error item and stops the lexer.
func lexString(l *Lexer) stateFn {
scan:
	for {
		switch c := l.next(); {
		case c == unicode_EOF:
			l.items <- LexItem{lex_Error, "EOF before string end"}
			return nil
		case c == '\\' && l.peek() == '"':
			// Swallow the escaped quote so it is not taken as the terminator.
			l.next()
		case c == '"':
			break scan
		case c == '\n':
			l.items <- LexItem{lex_Error, fmt.Sprintf("newline in string; pos: %d", l.pos)}
			return nil
		}
	}

	// The closing quote was the last rune read: step back so it is excluded
	// from the emitted text, then move past it and drop it.
	l.backup()
	l.emit(lex_String)
	l.pos++
	l.ignore()
	return lexStateStart
}
// lexStateStart is the top-level state. It skips whitespace, emits the
// single-rune items '{', '}' and ';' itself, and hands over to a dedicated
// state for everything longer:
//
//   - '#' emits lex_Pound and continues in lexComment;
//   - '"' is dropped and scanning continues in lexString;
//   - a sign, '$', '%' or decimal digit is put back for lexNumber;
//   - a letter is put back for lexIdent.
//
// At end of input it emits lex_EOF and stops the lexer.
//
// NOTE(review): a rune matching none of the above is neither emitted nor
// skipped. It stays in the pending text, so it is either discarded by the
// next ignore or ends up as a prefix of the next emitted item; at end of
// input such leftover text is emitted as lex_Space.
func lexStateStart(l *Lexer) stateFn {
	for {
		r := l.next()
		switch {
		case r == unicode_EOF:
			if l.pos > l.start {
				l.emit(lex_Space)
			}
			l.emit(lex_EOF)
			return nil
		case unicode.IsSpace(r):
			l.ignore()
		case r == '{':
			l.emit(lex_OpenBracket)
		case r == '}':
			l.emit(lex_CloseBracket)
		case r == ';':
			l.emit(lex_Semicolon)
		case r == '#':
			l.emit(lex_Pound)
			return lexComment
		case r == '"':
			l.ignore()
			return lexString
		case r == '$', r == '%', r == '+', r == '-', '0' <= r && r <= '9':
			l.backup()
			return lexNumber
		case unicode.IsLetter(r):
			l.backup()
			return lexIdent
		}
	}
}
// isAlphaNumeric reports whether r is an ASCII letter or an ASCII decimal
// digit.
func isAlphaNumeric(r rune) bool {
	return isAlpha(r) || ('0' <= r && r <= '9')
}

// isAlpha reports whether r is an ASCII letter, upper or lower case.
func isAlpha(r rune) bool {
	return ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z')
}