From abb94bc7da02ac69ade9a747a8a205741c96d35b Mon Sep 17 00:00:00 2001 From: Bobby Date: Sun, 29 Oct 2023 00:36:24 -0400 Subject: Added basic tokens and lexing --- go.mod | 3 ++ lexer/lexer.go | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lexer/lexer_test.go | 127 +++++++++++++++++++++++++++++++++++++++++++++ repl/repl.go | 1 + tokens/tokens.go | 65 +++++++++++++++++++++++ 5 files changed, 342 insertions(+) create mode 100644 go.mod create mode 100644 lexer/lexer.go create mode 100644 lexer/lexer_test.go create mode 100644 repl/repl.go create mode 100644 tokens/tokens.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..a16e091 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module mana + +go 1.21.3 diff --git a/lexer/lexer.go b/lexer/lexer.go new file mode 100644 index 0000000..b65d00b --- /dev/null +++ b/lexer/lexer.go @@ -0,0 +1,146 @@ +package lexer +import "mana/tokens" + +type Lexer struct { + input string + position int // current position in input (points to current char) + readPosition int // current reading position in input (after current char) + ch byte // current char under examination +} + +// New returns a new Lexer instance. +func New(input string) *Lexer { + l := &Lexer{input: input} + l.readChar() + return l +} + +// NextToken returns the next token in the input string. +func (l *Lexer) NextToken() tokens.Token { + var tok tokens.Token + + l.skipWhitespace() + + switch l.ch { + case '=': + if l.peekChar() == '=' { + ch := l.ch + l.readChar() + literal := string(ch) + string(l.ch) + tok = tokens.Token{Type: tokens.EQ, Literal: literal} + } else { + tok = newToken(tokens.ASSIGN, l.ch) + } + case '+': + tok = newToken(tokens.PLUS, l.ch) + case '-': + tok = newToken(tokens.MINUS, l.ch) + case '/': + tok = newToken(tokens.SLASH, l.ch) + case '*': + tok = newToken(tokens.ASTERISK, l.ch) + case '<': + tok = newToken(tokens.LT, l.ch) + case '>': + tok = newToken(tokens.GT, l.ch) + case '!': + if l.peekChar() == '=' { + ch := l.ch + l.readChar() + literal := string(ch) + string(l.ch) + tok = tokens.Token{Type: tokens.NOT_EQ, Literal: literal} + } else { + tok = newToken(tokens.BANG, l.ch) + } + case ';': + tok = newToken(tokens.SEMICOLON, l.ch) + case '(': + tok = newToken(tokens.LPAREN, l.ch) + case ')': + tok = newToken(tokens.RPAREN, l.ch) + case ',': + tok = newToken(tokens.COMMA, l.ch) + case '{': + tok = newToken(tokens.LBRACE, l.ch) + case '}': + tok = newToken(tokens.RBRACE, l.ch) + case 0: + tok.Literal = "" + tok.Type = tokens.EOF + default: + if isLetter(l.ch) { + tok.Literal = l.readIdentifier() + tok.Type = tokens.LookupIdent(tok.Literal) + return tok + } else if isDigit(l.ch) { + tok.Type = tokens.INT + tok.Literal = l.readNumber() + return tok + } else { + tok = newToken(tokens.ILLEGAL, l.ch) + } + } + + l.readChar() + return tok +} + +// skipWhitespace skips whitespace characters. +func (l *Lexer) skipWhitespace() { + for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' { + l.readChar() + } +} + +// newToken returns a new Token instance. +func newToken(tokenType tokens.TokenType, ch byte) tokens.Token { + return tokens.Token{Type: tokenType, Literal: string(ch)} +} + +// isLetter returns true if the given character is a letter. +func isLetter(ch byte) bool { + return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' +} + +// isDigit returns true if the given character is a digit. +func isDigit(ch byte) bool { + return '0' <= ch && ch <= '9' +} + +// readChar reads the next character in the input and advances the position in the input string. +func (l *Lexer) readChar() { + if l.readPosition >= len(l.input) { + l.ch = 0 // ASCII code for "NUL" character + } else { + l.ch = l.input[l.readPosition] + } + l.position = l.readPosition + l.readPosition++ +} + +// peekChar returns the next character in the input string without advancing the position in the input string. +func (l *Lexer) peekChar() byte { + if l.readPosition >= len(l.input) { + return 0 // ASCII code for "NUL" character + } else { + return l.input[l.readPosition] + } +} + +// readIdentifier reads an identifier and advances the position in the input string until it encounters a non-letter character. +func (l *Lexer) readIdentifier() string { + position := l.position + for isLetter(l.ch) { + l.readChar() + } + return l.input[position:l.position] +} + +// readNumber reads a number and advances the position in the input string until it encounters a non-digit character. +func (l *Lexer) readNumber() string { + position := l.position + for isDigit(l.ch) { + l.readChar() + } + return l.input[position:l.position] +} diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go new file mode 100644 index 0000000..f8e3eae --- /dev/null +++ b/lexer/lexer_test.go @@ -0,0 +1,127 @@ +package lexer + +import ( + "testing" + "mana/tokens" +) + +func TestNextToken(t *testing.T) { + input := ` + let five = 5; + let ten = 10; + + let add = fn(x, y) { + x + y; + }; + + let result = add(five, ten); + + !-/*5; + 5 < 10 > 5; + + if (5 < 10) { + return true; + } else { + return false; + } + + 10 == 10; + 10 != 9; + ` + + tests := []struct { + expectedType tokens.TokenType + expectedLiteral string + }{ + {tokens.LET, "let"}, + {tokens.IDENT, "five"}, + {tokens.ASSIGN, "="}, + {tokens.INT, "5"}, + {tokens.SEMICOLON, ";"}, + {tokens.LET, "let"}, + {tokens.IDENT, "ten"}, + {tokens.ASSIGN, "="}, + {tokens.INT, "10"}, + {tokens.SEMICOLON, ";"}, + {tokens.LET, "let"}, + {tokens.IDENT, "add"}, + {tokens.ASSIGN, "="}, + {tokens.FUNCTION, "fn"}, + {tokens.LPAREN, "("}, + {tokens.IDENT, "x"}, + {tokens.COMMA, ","}, + {tokens.IDENT, "y"}, + {tokens.RPAREN, ")"}, + {tokens.LBRACE, "{"}, + {tokens.IDENT, "x"}, + {tokens.PLUS, "+"}, + {tokens.IDENT, "y"}, + {tokens.SEMICOLON, ";"}, + {tokens.RBRACE, "}"}, + {tokens.SEMICOLON, ";"}, + {tokens.LET, "let"}, + {tokens.IDENT, "result"}, + {tokens.ASSIGN, "="}, + {tokens.IDENT, "add"}, + {tokens.LPAREN, "("}, + {tokens.IDENT, "five"}, + {tokens.COMMA, ","}, + {tokens.IDENT, "ten"}, + {tokens.RPAREN, ")"}, + {tokens.SEMICOLON, ";"}, + {tokens.BANG, "!"}, + {tokens.MINUS, "-"}, + {tokens.SLASH, "/"}, + {tokens.ASTERISK, "*"}, + {tokens.INT, "5"}, + {tokens.SEMICOLON, ";"}, + {tokens.INT, "5"}, + {tokens.LT, "<"}, + {tokens.INT, "10"}, + {tokens.GT, ">"}, + {tokens.INT, "5"}, + {tokens.SEMICOLON, ";"}, + {tokens.IF, "if"}, + {tokens.LPAREN, "("}, + {tokens.INT, "5"}, + {tokens.LT, "<"}, + {tokens.INT, "10"}, + {tokens.RPAREN, ")"}, + {tokens.LBRACE, "{"}, + {tokens.RETURN, "return"}, + {tokens.TRUE, "true"}, + {tokens.SEMICOLON, ";"}, + {tokens.RBRACE, "}"}, + {tokens.ELSE, "else"}, + {tokens.LBRACE, "{"}, + {tokens.RETURN, "return"}, + {tokens.FALSE, "false"}, + {tokens.SEMICOLON, ";"}, + {tokens.RBRACE, "}"}, + {tokens.INT, "10"}, + {tokens.EQ, "=="}, + {tokens.INT, "10"}, + {tokens.SEMICOLON, ";"}, + {tokens.INT, "10"}, + {tokens.NOT_EQ, "!="}, + {tokens.INT, "9"}, + {tokens.SEMICOLON, ";"}, + {tokens.EOF, ""}, + } + + l := New(input) + + for i, tt := range tests { + tok := l.NextToken() + + if tok.Type != tt.expectedType { + t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q", + i, tt.expectedType, tok.Type) + } + + if tok.Literal != tt.expectedLiteral { + t.Fatalf("tests[%d] - literal wrong. expected=%q, got=%q", + i, tt.expectedLiteral, tok.Literal) + } + } +} diff --git a/repl/repl.go b/repl/repl.go new file mode 100644 index 0000000..9119db8 --- /dev/null +++ b/repl/repl.go @@ -0,0 +1 @@ +package repl \ No newline at end of file diff --git a/tokens/tokens.go b/tokens/tokens.go new file mode 100644 index 0000000..b93d9f9 --- /dev/null +++ b/tokens/tokens.go @@ -0,0 +1,65 @@ +package tokens + +type TokenType string + +type Token struct { + Type TokenType + Literal string +} + +const ( + ILLEGAL = "ILLEGAL" + EOF = "EOF" + + // Identifiers + literals + IDENT = "IDENT" + INT = "INT" + + // Operators + ASSIGN = "=" + PLUS = "+" + MINUS = "-" + BANG = "!" + ASTERISK = "*" + SLASH = "/" + LT = "<" + GT = ">" + EQ = "==" + NOT_EQ = "!=" + + // Delimiters + COMMA = "," + SEMICOLON = ";" + + LPAREN = "(" + RPAREN = ")" + LBRACE = "{" + RBRACE = "}" + + // Keywords + FUNCTION = "FUNCTION" + LET = "LET" + IF = "IF" + ELSE = "ELSE" + TRUE = "TRUE" + FALSE = "FALSE" + RETURN = "RETURN" +) + +var keywords = map[string]TokenType { + "fn": FUNCTION, + "let": LET, + "if": IF, + "else": ELSE, + "true": TRUE, + "false": FALSE, + "return": RETURN, +} + +// LookupIdent looks up an identifier and returns the TokenType. +func LookupIdent(ident string) TokenType { + if tok, ok := keywords[ident]; ok { + return tok + } + return IDENT +} -- cgit v1.2.3