Files
eceg431/11/hjc.py
2025-11-21 10:24:48 -05:00

1009 lines
30 KiB
Python

import os
import sys
class JackTokenizer:
    """Split a Jack source file into tokens, one per call to advance().

    The whole file is read up front.  Each physical line is stripped of
    comments when it is loaded and is then consumed token by token.  The
    most recent token is exposed through ``currentToken`` / ``tokenType``
    and the typed accessors (keyword(), symbol(), ...).
    """

    def __init__(self, filename):
        """Read ``filename`` and prepare to tokenize it.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        self.lines = []
        # Unconsumed remainder of the line being tokenized (comments removed).
        self.currentLine = ""
        # Number of physical lines loaded so far (1-based number of the
        # line currently being tokenized).
        self.lineNumber = 0
        # True while inside a /* ... */ comment that spans several lines.
        self.inComment = False
        # current token info
        self.currentToken = ""
        self.tokenType = ""
        # Jack language keywords
        self.keywords = {
            "class",
            "constructor",
            "function",
            "method",
            "field",
            "static",
            "var",
            "int",
            "char",
            "boolean",
            "void",
            "true",
            "false",
            "null",
            "this",
            "let",
            "do",
            "if",
            "else",
            "while",
            "return",
        }
        # Jack language symbols
        self.symbols = {
            "{",
            "}",
            "(",
            ")",
            "[",
            "]",
            ".",
            ",",
            ";",
            "+",
            "-",
            "*",
            "/",
            "&",
            "|",
            "<",
            ">",
            "=",
            "~",
        }
        # read file
        with open(filename, "r") as file:
            self.lines = file.readlines()

    def hasMoreTokens(self):
        """Return True while unread input remains.

        This is a cheap check: it also returns True when the only thing
        left is blank lines or comments, in which case the next advance()
        returns False.
        """
        return bool(self.currentLine) or self.lineNumber < len(self.lines)

    def _stripComments(self, line):
        """Return ``line`` with comments removed, updating ``inComment``.

        String constants are copied verbatim, so ``//`` or ``/*`` inside
        double quotes is not treated as a comment.  Any number of
        ``/* ... */`` comments may appear on one line; each is replaced by
        a single space so the tokens on either side stay separate.
        """
        kept = []
        inString = False
        i = 0
        while i < len(line):
            if self.inComment:
                end = line.find("*/", i)
                if end == -1:
                    # The comment continues on the next line.
                    break
                self.inComment = False
                kept.append(" ")
                i = end + 2
            elif inString:
                kept.append(line[i])
                if line[i] == '"':
                    inString = False
                i += 1
            elif line[i] == '"':
                inString = True
                kept.append('"')
                i += 1
            elif line.startswith("//", i):
                # Rest of the line is a comment.
                break
            elif line.startswith("/*", i):
                self.inComment = True
                i += 2
            else:
                kept.append(line[i])
                i += 1
        return "".join(kept)

    def _prefixLength(self, accepts):
        """Return how many leading characters of currentLine satisfy ``accepts``."""
        count = 0
        while count < len(self.currentLine) and accepts(self.currentLine[count]):
            count += 1
        return count

    def _take(self, length, tokenType):
        """Consume ``length`` characters of currentLine as a token of ``tokenType``."""
        self.currentToken = self.currentLine[:length]
        self.tokenType = tokenType
        self.currentLine = self.currentLine[length:]

    def advance(self):
        """Read the next token into currentToken / tokenType.

        Returns True when a token was read.  Returns False at end of
        input, in which case currentToken and tokenType are reset to ""
        so a caller looping on the current token cannot keep seeing the
        last real token forever.

        Raises ValueError for a string constant with no closing quote.
        """
        while True:
            if not self.currentLine:
                if self.lineNumber >= len(self.lines):
                    # end of file
                    self.currentToken = ""
                    self.tokenType = ""
                    return False
                rawLine = self.lines[self.lineNumber]
                self.lineNumber += 1
                self.currentLine = self._stripComments(rawLine.rstrip("\n")).strip()
                if not self.currentLine:
                    continue
            # skip whitespace between tokens
            self.currentLine = self.currentLine.lstrip(" \t")
            if not self.currentLine:
                continue
            head = self.currentLine[0]
            # string constant: the token is the text between the quotes
            if head == '"':
                end = self.currentLine.find('"', 1)
                if end == -1:
                    raise ValueError(
                        f"line {self.lineNumber}: unterminated string constant"
                    )
                self.currentToken = self.currentLine[1:end]
                self.tokenType = "STRING_CONST"
                self.currentLine = self.currentLine[end + 1 :]
                return True
            # single-character symbol
            if head in self.symbols:
                self._take(1, "SYMBOL")
                return True
            # integer constant
            if head.isdigit():
                self._take(self._prefixLength(str.isdigit), "INT_CONST")
                return True
            # identifier or keyword
            if head.isalpha() or head == "_":
                length = self._prefixLength(lambda ch: ch.isalnum() or ch == "_")
                self._take(length, "IDENTIFIER")
                if self.currentToken in self.keywords:
                    self.tokenType = "KEYWORD"
                return True
            # Not part of any Jack token: drop the character and keep going.
            self.currentLine = self.currentLine[1:]

    def getTokenType(self):
        """Return the type of the current token ("" before the first token and at end of input)."""
        return self.tokenType

    def keyword(self):
        """Return the current token if it is a keyword, else None."""
        if self.tokenType == "KEYWORD":
            return self.currentToken
        return None

    def symbol(self):
        """Return the current token if it is a symbol, else None."""
        if self.tokenType == "SYMBOL":
            return self.currentToken
        return None

    def identifier(self):
        """Return the current token if it is an identifier, else None."""
        if self.tokenType == "IDENTIFIER":
            return self.currentToken
        return None

    def intVal(self):
        """Return the current token as an int if it is an integer constant, else None."""
        if self.tokenType == "INT_CONST":
            return int(self.currentToken)
        return None

    def stringVal(self):
        """Return the current token (without quotes) if it is a string constant, else None."""
        if self.tokenType == "STRING_CONST":
            return self.currentToken
        return None
class SymbolTable:
    """Two-level symbol table used while compiling one Jack class.

    Class-scope names (STATIC, FIELD) live in ``classTable``; subroutine-scope
    names (ARG, VAR) live in ``subroutineTable`` and shadow class-scope names
    on lookup.  Each kind has its own running index counter.
    """

    def __init__(self):
        self.classTable = {}  # class-scope symbols (static, field)
        self.subroutineTable = {}  # subroutine-scope symbols (arg, var)
        self.staticCount = 0
        self.fieldCount = 0
        self.argCount = 0
        self.varCount = 0

    def startSubroutine(self):
        """Discard all subroutine-scope symbols and restart their indices at 0."""
        self.subroutineTable = {}
        self.argCount = self.varCount = 0

    def define(self, name, type_name, kind):
        """Record ``name`` with the next free index of ``kind``.

        ``kind`` is one of "STATIC", "FIELD", "ARG" or "VAR"; any other
        value is ignored.  Redefining a name overwrites its entry and still
        consumes a new index.
        """
        placements = (
            ("STATIC", self.classTable, "staticCount"),
            ("FIELD", self.classTable, "fieldCount"),
            ("ARG", self.subroutineTable, "argCount"),
            ("VAR", self.subroutineTable, "varCount"),
        )
        for label, scope, counter in placements:
            if kind == label:
                slot = getattr(self, counter)
                scope[name] = {"type": type_name, "kind": kind, "index": slot}
                setattr(self, counter, slot + 1)
                return

    def getVarCount(self, kind):
        """Return how many identifiers of ``kind`` have been defined (0 for an unknown kind)."""
        totals = (
            ("STATIC", self.staticCount),
            ("FIELD", self.fieldCount),
            ("ARG", self.argCount),
            ("VAR", self.varCount),
        )
        for label, total in totals:
            if kind == label:
                return total
        return 0

    def _find(self, name):
        """Return the entry for ``name``, preferring subroutine scope; None if undefined."""
        for scope in (self.subroutineTable, self.classTable):
            if name in scope:
                return scope[name]
        return None

    def kindOf(self, name):
        """Return the kind of ``name``, or "NONE" when it is not defined."""
        entry = self._find(name)
        return "NONE" if entry is None else entry["kind"]

    def typeOf(self, name):
        """Return the declared type of ``name``, or None when it is not defined."""
        entry = self._find(name)
        return None if entry is None else entry["type"]

    def indexOf(self, name):
        """Return the index of ``name`` within its kind, or None when it is not defined."""
        entry = self._find(name)
        return None if entry is None else entry["index"]
class VMWriter:
    """Write VM commands, one per line, to an output file.

    The file is opened for writing when the writer is created and stays
    open until close() is called.
    """

    def __init__(self, output_file):
        self.output = open(output_file, "w")

    def _writeLine(self, text):
        """Write ``text`` followed by a newline to the output file."""
        self.output.write(text + "\n")

    def writePush(self, segment, index):
        """Emit ``push <segment> <index>``; the segment name is lower-cased."""
        self._writeLine(f"push {segment.lower()} {index}")

    def writePop(self, segment, index):
        """Emit ``pop <segment> <index>``; the segment name is lower-cased."""
        self._writeLine(f"pop {segment.lower()} {index}")

    def writeArithmetic(self, command):
        """Emit an arithmetic/logic command; the command is lower-cased."""
        self._writeLine(f"{command.lower()}")

    def writeLabel(self, label):
        """Emit ``label <label>``."""
        self._writeLine(f"label {label}")

    def writeGoto(self, label):
        """Emit ``goto <label>``."""
        self._writeLine(f"goto {label}")

    def writeIf(self, label):
        """Emit ``if-goto <label>``."""
        self._writeLine(f"if-goto {label}")

    def writeCall(self, name, nArgs):
        """Emit ``call <name> <nArgs>``."""
        self._writeLine(f"call {name} {nArgs}")

    def writeFunction(self, name, nLocals):
        """Emit ``function <name> <nLocals>``."""
        self._writeLine(f"function {name} {nLocals}")

    def writeReturn(self):
        """Emit ``return``."""
        self._writeLine("return")

    def close(self):
        """Close the output file."""
        self.output.close()
class CompilationEngine:
    """Recursive-descent compiler from Jack tokens to VM commands.

    The engine pulls tokens from ``tokenizer`` and writes VM code through a
    VMWriter opened on ``output_file``.  Every compileXxx method expects the
    tokenizer to be positioned on the first token of its construct; the
    comments in each method note where it leaves the tokenizer.

    Input is assumed to be syntactically valid Jack: tokens are stepped over
    by position and are not checked against the grammar.
    """

    # Binary operators that map directly onto a VM arithmetic/logic command.
    _VM_OPERATORS = {
        "+": "add",
        "-": "sub",
        "&": "and",
        "|": "or",
        "<": "lt",
        ">": "gt",
        "=": "eq",
    }
    # Binary operators compiled as two-argument calls.
    _CALL_OPERATORS = {
        "*": "Math.multiply",
        "/": "Math.divide",
    }
    # Symbol-table kind -> VM memory segment.
    _SEGMENTS = {
        "STATIC": "static",
        "FIELD": "this",
        "ARG": "argument",
        "VAR": "local",
    }

    def __init__(self, tokenizer, output_file):
        self.tokenizer = tokenizer
        self.vmWriter = VMWriter(output_file)
        self.symbolTable = SymbolTable()
        self.className = ""
        # Not read anywhere in this class; kept so the attribute still exists.
        self.labelCount = 0
        self.whileLabelCount = 0
        self.ifLabelCount = 0

    def _atSymbol(self, *symbols):
        """Return True when the current token is one of the given symbols."""
        return (
            self.tokenizer.getTokenType() == "SYMBOL"
            and self.tokenizer.symbol() in symbols
        )

    def _atKeyword(self, *keywords):
        """Return True when the current token is one of the given keywords."""
        return (
            self.tokenizer.getTokenType() == "KEYWORD"
            and self.tokenizer.keyword() in keywords
        )

    def getNextWhileLabel(self):
        """Return a fresh (expression label, end label) pair for a while loop."""
        exp_label = f"WHILE_EXP{self.whileLabelCount}"
        end_label = f"WHILE_END{self.whileLabelCount}"
        self.whileLabelCount += 1
        return exp_label, end_label

    def getNextIfLabel(self):
        """Return a fresh (true label, false label, end label) triple for an if statement."""
        true_label = f"IF_TRUE{self.ifLabelCount}"
        false_label = f"IF_FALSE{self.ifLabelCount}"
        end_label = f"IF_END{self.ifLabelCount}"
        self.ifLabelCount += 1
        return true_label, false_label, end_label

    def compileClass(self):
        """Compile a whole class, starting before its first token.

        Returns early, emitting nothing, when the input ends before the
        first token of the class body.  On normal completion the tokenizer
        is left on the token following the last subroutine (the class's
        closing '}' in valid input).
        """
        # 'class'
        if not self.tokenizer.advance():
            return
        # className
        if not self.tokenizer.advance():
            return
        self.className = self.tokenizer.identifier()
        # '{'
        if not self.tokenizer.advance():
            return
        # first token of the class body
        if not self.tokenizer.advance():
            return
        # classVarDec*
        while self._atKeyword("static", "field"):
            self.compileClassVarDec()
        # subroutineDec*
        while self._atKeyword("constructor", "function", "method"):
            self.compileSubroutine()

    def compileClassVarDec(self):
        """Record one static/field declaration; ends on the token after ';'."""
        # ('static' | 'field')
        kind = "STATIC" if self.tokenizer.keyword() == "static" else "FIELD"
        # type
        self.tokenizer.advance()
        type_name = self.tokenizer.currentToken
        # varName
        self.tokenizer.advance()
        self.symbolTable.define(self.tokenizer.identifier(), type_name, kind)
        # (',' varName)*
        self.tokenizer.advance()
        while self._atSymbol(","):
            self.tokenizer.advance()  # step from ',' onto varName
            self.symbolTable.define(self.tokenizer.identifier(), type_name, kind)
            self.tokenizer.advance()
        # step past ';'
        self.tokenizer.advance()

    def compileSubroutine(self):
        """Compile one constructor, function or method declaration."""
        self.symbolTable.startSubroutine()
        # ('constructor' | 'function' | 'method')
        subroutineType = self.tokenizer.keyword()
        # A method receives its object as argument 0, so reserve that slot.
        if subroutineType == "method":
            self.symbolTable.define("this", self.className, "ARG")
        # returnType
        self.tokenizer.advance()
        # subroutineName
        self.tokenizer.advance()
        subroutineName = self.tokenizer.identifier()
        # '('
        self.tokenizer.advance()
        # first parameter type, or ')'
        self.tokenizer.advance()
        self.compileParameterList()
        # The tokenizer is now on ')'; compileSubroutineBody steps past it.
        self.compileSubroutineBody(subroutineType, subroutineName)

    def compileParameterList(self):
        """Record the parameters as ARG symbols; ends on the closing ')'."""
        if self._atSymbol(")"):
            return
        # type
        type_name = self.tokenizer.currentToken
        # varName
        self.tokenizer.advance()
        self.symbolTable.define(self.tokenizer.identifier(), type_name, "ARG")
        # (',' type varName)*
        self.tokenizer.advance()
        while self._atSymbol(","):
            self.tokenizer.advance()  # step from ',' onto type
            type_name = self.tokenizer.currentToken
            self.tokenizer.advance()
            self.symbolTable.define(self.tokenizer.identifier(), type_name, "ARG")
            self.tokenizer.advance()

    def compileSubroutineBody(self, subroutineType, subroutineName):
        """Compile '{' varDec* statements '}' and emit the function header.

        Expects the tokenizer on the ')' that closes the parameter list and
        leaves it on the token after the body's closing '}'.
        """
        # step from ')' onto '{'
        self.tokenizer.advance()
        # step from '{' onto the first token of the body (skipped only if
        # the tokenizer is somehow already on 'var')
        if not self._atKeyword("var"):
            self.tokenizer.advance()
        # varDec*
        while self._atKeyword("var"):
            self.compileVarDec()
        # The header needs the local count, so it is written after the varDecs.
        nLocals = self.symbolTable.getVarCount("VAR")
        functionName = f"{self.className}.{subroutineName}"
        self.vmWriter.writeFunction(functionName, nLocals)
        if subroutineType == "constructor":
            # Allocate one word per field and make the new block 'this'.
            nFields = self.symbolTable.getVarCount("FIELD")
            self.vmWriter.writePush("constant", nFields)
            self.vmWriter.writeCall("Memory.alloc", 1)
            self.vmWriter.writePop("pointer", 0)
        elif subroutineType == "method":
            # Make argument 0 (the receiver) 'this'.
            self.vmWriter.writePush("argument", 0)
            self.vmWriter.writePop("pointer", 0)
        # statements
        self.compileStatements()
        # '}'
        if self._atSymbol("}"):
            self.tokenizer.advance()

    def compileVarDec(self):
        """Record one 'var' declaration; ends on the token after ';'."""
        # step from 'var' onto type
        self.tokenizer.advance()
        type_name = self.tokenizer.currentToken
        # varName
        self.tokenizer.advance()
        self.symbolTable.define(self.tokenizer.identifier(), type_name, "VAR")
        # (',' varName)*
        self.tokenizer.advance()
        while self._atSymbol(","):
            self.tokenizer.advance()  # step from ',' onto varName
            self.symbolTable.define(self.tokenizer.identifier(), type_name, "VAR")
            self.tokenizer.advance()
        # ';'
        if self._atSymbol(";"):
            self.tokenizer.advance()

    def compileStatements(self):
        """Compile statements until the current token no longer starts one."""
        handlers = {
            "let": self.compileLet,
            "if": self.compileIf,
            "while": self.compileWhile,
            "do": self.compileDo,
            "return": self.compileReturn,
        }
        while self._atKeyword("let", "if", "while", "do", "return"):
            handlers[self.tokenizer.keyword()]()

    def compileLet(self):
        """Compile 'let' varName ('[' expression ']')? '=' expression ';'."""
        # step from 'let' onto varName
        self.tokenizer.advance()
        varName = self.tokenizer.identifier()
        # '[' or '='
        self.tokenizer.advance()
        isArray = self._atSymbol("[")
        if isArray:
            # Leave base + index on the stack while the right side is compiled.
            self.pushIdentifier(varName)
            self.tokenizer.advance()  # step from '[' onto the index expression
            self.compileExpression()
            self.tokenizer.advance()  # step from ']' onto '='
            self.vmWriter.writeArithmetic("add")
        # step from '=' onto the value expression
        self.tokenizer.advance()
        self.compileExpression()
        if isArray:
            # Park the value in temp 0 so the target address can be popped
            # into pointer 1, then store the value through 'that'.
            self.vmWriter.writePop("temp", 0)
            self.vmWriter.writePop("pointer", 1)
            self.vmWriter.writePush("temp", 0)
            self.vmWriter.writePop("that", 0)
        else:
            self.popIdentifier(varName)
        # ';'
        if self._atSymbol(";"):
            self.tokenizer.advance()

    def compileIf(self):
        """Compile an if statement with its optional else branch."""
        trueLabel, falseLabel, endLabel = self.getNextIfLabel()
        # step from 'if' onto '('
        self.tokenizer.advance()
        # step onto the condition
        self.tokenizer.advance()
        self.compileExpression()
        # step from ')' onto '{'
        self.tokenizer.advance()
        self.vmWriter.writeIf(trueLabel)
        self.vmWriter.writeGoto(falseLabel)
        self.vmWriter.writeLabel(trueLabel)
        # step from '{' onto the first statement
        self.tokenizer.advance()
        self.compileStatements()
        # step past '}'
        self.tokenizer.advance()
        # ('else' '{' statements '}')?
        if self._atKeyword("else"):
            # The true branch must jump over the else branch.
            self.vmWriter.writeGoto(endLabel)
            self.vmWriter.writeLabel(falseLabel)
            self.tokenizer.advance()  # step from 'else' onto '{'
            self.tokenizer.advance()  # step from '{' onto the first statement
            self.compileStatements()
            self.tokenizer.advance()  # step past '}'
            self.vmWriter.writeLabel(endLabel)
        else:
            self.vmWriter.writeLabel(falseLabel)

    def compileWhile(self):
        """Compile a while loop."""
        expLabel, endLabel = self.getNextWhileLabel()
        self.vmWriter.writeLabel(expLabel)
        # step from 'while' onto '('
        self.tokenizer.advance()
        # step onto the condition
        self.tokenizer.advance()
        self.compileExpression()
        # step from ')' onto '{'
        self.tokenizer.advance()
        # Leave the loop when the condition is false.
        self.vmWriter.writeArithmetic("not")
        self.vmWriter.writeIf(endLabel)
        # step from '{' onto the first statement
        self.tokenizer.advance()
        self.compileStatements()
        # step past '}'
        self.tokenizer.advance()
        self.vmWriter.writeGoto(expLabel)
        self.vmWriter.writeLabel(endLabel)

    def compileDo(self):
        """Compile 'do' subroutineCall ';', discarding the call's return value."""
        # step from 'do' onto the call's first identifier
        self.tokenizer.advance()
        self.compileSubroutineCall()
        # The returned value is unused; pop it off the stack.
        self.vmWriter.writePop("temp", 0)
        # ';'
        if self._atSymbol(";"):
            self.tokenizer.advance()

    def compileReturn(self):
        """Compile 'return' expression? ';' (a bare return pushes constant 0)."""
        # step past 'return'
        self.tokenizer.advance()
        if self._atSymbol(";"):
            self.vmWriter.writePush("constant", 0)
        else:
            self.compileExpression()
        self.vmWriter.writeReturn()
        # ';'
        if self._atSymbol(";"):
            self.tokenizer.advance()

    def compileExpression(self):
        """Compile term (op term)*, applying operators left to right with no precedence."""
        self.compileTerm()
        while (
            self.tokenizer.getTokenType() == "SYMBOL"
            and (
                self.tokenizer.symbol() in self._VM_OPERATORS
                or self.tokenizer.symbol() in self._CALL_OPERATORS
            )
        ):
            op = self.tokenizer.symbol()
            self.tokenizer.advance()
            self.compileTerm()
            if op in self._VM_OPERATORS:
                self.vmWriter.writeArithmetic(self._VM_OPERATORS[op])
            else:
                self.vmWriter.writeCall(self._CALL_OPERATORS[op], 2)

    def compileTerm(self):
        """Compile one term and step past it.

        A token that cannot start a term is left in place and nothing is
        emitted.
        """
        tokenType = self.tokenizer.getTokenType()
        if tokenType == "INT_CONST":
            self.vmWriter.writePush("constant", self.tokenizer.intVal())
            self.tokenizer.advance()
        elif tokenType == "STRING_CONST":
            # Build the string at run time: String.new(length), then one
            # String.appendChar per character.
            string = self.tokenizer.stringVal()
            self.vmWriter.writePush("constant", len(string))
            self.vmWriter.writeCall("String.new", 1)
            for char in string:
                self.vmWriter.writePush("constant", ord(char))
                self.vmWriter.writeCall("String.appendChar", 2)
            self.tokenizer.advance()
        elif tokenType == "KEYWORD":
            # keywordConstant; any other keyword is stepped over silently
            keyword = self.tokenizer.keyword()
            if keyword == "true":
                self.vmWriter.writePush("constant", 0)
                self.vmWriter.writeArithmetic("not")
            elif keyword in ["false", "null"]:
                self.vmWriter.writePush("constant", 0)
            elif keyword == "this":
                self.vmWriter.writePush("pointer", 0)
            self.tokenizer.advance()
        elif tokenType == "IDENTIFIER":
            # varName | varName '[' expression ']' | subroutineCall
            # The token after the name decides which of the three it is.
            name = self.tokenizer.identifier()
            self.tokenizer.advance()
            if self._atSymbol("["):
                self.pushIdentifier(name)
                self.tokenizer.advance()  # step from '[' onto the index
                self.compileExpression()
                self.tokenizer.advance()  # step past ']'
                self.vmWriter.writeArithmetic("add")
                self.vmWriter.writePop("pointer", 1)
                self.vmWriter.writePush("that", 0)
            elif self._atSymbol("(", "."):
                # The name has already been consumed, so hand it over.
                self.compileSubroutineCallFromName(name)
            else:
                self.pushIdentifier(name)
        elif self._atSymbol("("):
            # '(' expression ')'
            self.tokenizer.advance()
            self.compileExpression()
            self.tokenizer.advance()  # step past ')'
        elif self._atSymbol("-", "~"):
            # unaryOp term
            op = self.tokenizer.symbol()
            self.tokenizer.advance()
            self.compileTerm()
            self.vmWriter.writeArithmetic("neg" if op == "-" else "not")

    def compileSubroutineCall(self):
        """Compile a subroutine call whose first identifier is the current token."""
        name = self.tokenizer.identifier()
        self.tokenizer.advance()
        self.compileSubroutineCallFromName(name)

    def compileSubroutineCallFromName(self, name):
        """Compile a call whose leading identifier ``name`` was already consumed.

        Expects the tokenizer on '.' or '(' and leaves it after the ')'.
        Three shapes are handled:
          * ``name.sub(...)`` with ``name`` a known variable: method call on
            that object, which is pushed as the first argument;
          * ``name.sub(...)`` otherwise: ``name`` is taken as a class name
            and no receiver is pushed;
          * ``name(...)``: method call on the current object ('this').
        """
        nArgs = 0
        if self._atSymbol("."):
            self.tokenizer.advance()  # step from '.' onto subroutineName
            subroutineName = self.tokenizer.identifier()
            self.tokenizer.advance()
            if self.symbolTable.kindOf(name) != "NONE":
                self.pushIdentifier(name)
                nArgs = 1
                className = self.symbolTable.typeOf(name)
                fullName = f"{className}.{subroutineName}"
            else:
                fullName = f"{name}.{subroutineName}"
        else:
            self.vmWriter.writePush("pointer", 0)
            nArgs = 1
            fullName = f"{self.className}.{name}"
        # step from '(' onto the first argument or ')'
        self.tokenizer.advance()
        nArgs += self.compileExpressionList()
        # step past ')'
        self.tokenizer.advance()
        self.vmWriter.writeCall(fullName, nArgs)

    def compileExpressionList(self):
        """Compile a comma-separated argument list and return how many expressions it held."""
        if self._atSymbol(")"):
            return 0
        self.compileExpression()
        nArgs = 1
        while self._atSymbol(","):
            self.tokenizer.advance()  # step from ',' onto the next expression
            self.compileExpression()
            nArgs += 1
        return nArgs

    def _locate(self, name):
        """Return (segment, index) for variable ``name``.

        Raises ValueError when ``name`` is not in the symbol table, so an
        undefined variable aborts compilation instead of silently dropping
        a push/pop and unbalancing the generated stack code.
        """
        kind = self.symbolTable.kindOf(name)
        index = self.symbolTable.indexOf(name)
        if kind not in self._SEGMENTS:
            raise ValueError(
                f"undefined identifier '{name}' in class {self.className}"
            )
        return self._SEGMENTS[kind], index

    def pushIdentifier(self, name):
        """Push the value of variable ``name``; raises ValueError if it is undefined."""
        segment, index = self._locate(name)
        self.vmWriter.writePush(segment, index)

    def popIdentifier(self, name):
        """Pop the stack top into variable ``name``; raises ValueError if it is undefined."""
        segment, index = self._locate(name)
        self.vmWriter.writePop(segment, index)

    def close(self):
        """Close the VM output file."""
        self.vmWriter.close()
def compileFile(input_file):
    """Compile one Jack source file to a VM file next to it.

    The output path is ``input_file`` with its extension replaced by
    ``.vm``.  Only the final extension is replaced, so ``.jack`` appearing
    elsewhere in the path (for example in a directory name) is left alone,
    and the output path can never equal the input path.

    Any exception raised during compilation is reported on standard output
    together with a traceback; it is not re-raised.
    """
    output_file = os.path.splitext(input_file)[0] + ".vm"
    try:
        tokenizer = JackTokenizer(input_file)
        engine = CompilationEngine(tokenizer, output_file)
        try:
            engine.compileClass()
        finally:
            # Close the output even when compilation fails, so the file
            # handle is not leaked and partial output is flushed.
            engine.close()
        print(f"Compiled {input_file} -> {output_file}")
    except Exception as e:
        print(f"ERROR: Failed to compile {input_file}: {e}")
        import traceback

        traceback.print_exc()
def main():
    """Command-line entry point: compile one .jack file or every .jack file in a directory.

    Exits with status 1 when the argument count is wrong or the argument is
    neither a directory nor an existing file whose name ends in ``.jack``.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print("Usage: python JackCompilerFinal.py <source>")
        print(" <source> can be a .jack file or a directory containing .jack files")
        sys.exit(1)

    (source,) = arguments
    if os.path.isdir(source):
        # Compile every .jack entry directly inside the directory.
        jackPaths = [
            os.path.join(source, entry)
            for entry in os.listdir(source)
            if entry.endswith(".jack")
        ]
        for jackPath in jackPaths:
            compileFile(jackPath)
        return

    if os.path.isfile(source) and source.endswith(".jack"):
        compileFile(source)
        return

    print(f"Error: {source} is not a valid .jack file or directory")
    sys.exit(1)


if __name__ == "__main__":
    main()