# hjc.py - Jack syntax analyzer: tokenizes .jack source files and emits
# tokenizer output (xxxT.xml) and a parse-tree XML (xxx.xml) for each one.

import os
import sys


class JackTokenizer:
    # tokenizes Jack source code

    def __init__(self, filename):
        # load and clean Jack file
        self.lines = []
        self.currentLine = ""
        self.lineNumber = 0
        self.inComment = False

        # current token info
        self.currentToken = ""
        self.tokenType = ""

        # Jack language keywords
        self.keywords = {
            "class", "constructor", "function", "method", "field", "static",
            "var", "int", "char", "boolean", "void", "true", "false", "null",
            "this", "let", "do", "if", "else", "while", "return",
        }

        # Jack language symbols
        self.symbols = {
            "{", "}", "(", ")", "[", "]", ".", ",", ";",
            "+", "-", "*", "/", "&", "|", "<", ">", "=", "~",
        }

        # read file
        with open(filename, "r") as file:
            self.lines = file.readlines()

    def hasMoreTokens(self):
        # check if more tokens available
        # still have content on current line or more lines to process
        return len(self.currentLine) > 0 or self.lineNumber < len(self.lines)

    def advance(self):
        # get next token from input; returns False at end of file
        while True:
            # check if current line is empty
            if len(self.currentLine) == 0:
                # get new line
                if self.lineNumber >= len(self.lines):
                    # end of file
                    return False
                self.currentLine = self.lines[self.lineNumber]
                self.lineNumber += 1
                # remove newline
                if self.currentLine.endswith("\n"):
                    self.currentLine = self.currentLine[:-1]

            # handle comments
            # remove inline comments (note: "//" inside string constants
            # is not special-cased by this simple scan)
            if "//" in self.currentLine:
                self.currentLine = self.currentLine[: self.currentLine.index("//")]

            # handle multi-line comments
            if self.inComment:
                # check for comment end
                if "*/" in self.currentLine:
                    endIdx = self.currentLine.index("*/") + 2
                    self.currentLine = self.currentLine[endIdx:]
                    self.inComment = False
                else:
                    # still in comment, skip this line
                    self.currentLine = ""
                    continue

            # check for comment start
            if "/*" in self.currentLine:
                startIdx = self.currentLine.index("/*")
                # check if comment ends on same line
                if "*/" in self.currentLine[startIdx:]:
                    endIdx = self.currentLine.index("*/", startIdx) + 2
                    self.currentLine = (
                        self.currentLine[:startIdx] + " " + self.currentLine[endIdx:]
                    )
                else:
                    # comment continues to next line
                    self.currentLine = self.currentLine[:startIdx]
                    self.inComment = True

            # replace tabs with spaces and strip
            self.currentLine = self.currentLine.replace("\t", " ").strip()

            # if line is empty after cleaning, get next line
            if len(self.currentLine) == 0:
                continue

            # parse token from current line
            # skip leading spaces
            self.currentLine = self.currentLine.lstrip()
            if len(self.currentLine) == 0:
                continue

            # check first character
            firstChar = self.currentLine[0]

            # check if symbol
            if firstChar in self.symbols:
                self.currentToken = firstChar
                self.tokenType = "SYMBOL"
                self.currentLine = self.currentLine[1:]
                return True

            # check if string constant
            if firstChar == '"':
                # find closing quote
                endIdx = self.currentLine.index('"', 1)
                self.currentToken = self.currentLine[1:endIdx]
                self.tokenType = "STRING_CONST"
                self.currentLine = self.currentLine[endIdx + 1 :]
                return True

            # check if integer constant
            if firstChar.isdigit():
                # parse integer
                endIdx = 0
                while (
                    endIdx < len(self.currentLine)
                    and self.currentLine[endIdx].isdigit()
                ):
                    endIdx += 1
                self.currentToken = self.currentLine[:endIdx]
                self.tokenType = "INT_CONST"
                self.currentLine = self.currentLine[endIdx:]
                return True

            # must be identifier or keyword
            if firstChar.isalpha() or firstChar == "_":
                # parse identifier
                endIdx = 0
                while endIdx < len(self.currentLine):
                    char = self.currentLine[endIdx]
                    if char.isalnum() or char == "_":
                        endIdx += 1
                    else:
                        break
                self.currentToken = self.currentLine[:endIdx]
                self.currentLine = self.currentLine[endIdx:]
                # check if keyword
                if self.currentToken in self.keywords:
                    self.tokenType = "KEYWORD"
                else:
                    self.tokenType = "IDENTIFIER"
                return True

            # shouldn't reach here with valid Jack code
            self.currentLine = self.currentLine[1:]

    def getTokenType(self):
        # return current token type
        return self.tokenType

    def keyword(self):
        # return keyword (only if token is keyword)
        if self.tokenType == "KEYWORD":
            return self.currentToken
        return None

    def symbol(self):
        # return symbol (only if token is symbol)
        if self.tokenType == "SYMBOL":
            return self.currentToken
        return None

    def identifier(self):
        # return identifier (only if token is identifier)
        if self.tokenType == "IDENTIFIER":
            return self.currentToken
        return None

    def intVal(self):
        # return integer value (only if token is int)
        if self.tokenType == "INT_CONST":
            return int(self.currentToken)
        return None

    def stringVal(self):
        # return string value (only if token is string)
        if self.tokenType == "STRING_CONST":
            return self.currentToken
        return None
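
# Illustrative example (comment only): assuming an input containing the Jack
# statement `let x = 5;`, the tokenizer test mode in analyzeFile() below
# would emit
#
#   <tokens>
#   <keyword> let </keyword>
#   <identifier> x </identifier>
#   <symbol> = </symbol>
#   <integerConstant> 5 </integerConstant>
#   <symbol> ; </symbol>
#   </tokens>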


class CompilationEngine:
    # generates XML from Jack code

    def __init__(self, tokenizer, output_file):
        # init compilation engine
        self.tokenizer = tokenizer
        self.output = open(output_file, "w")
        self.indent = 0

    def writeOpenTag(self, tag):
        # write opening XML tag
        self.output.write(" " * self.indent + f"<{tag}>\r\n")
        self.indent += 1

    def writeCloseTag(self, tag):
        # write closing XML tag
        self.indent -= 1
        self.output.write(" " * self.indent + f"</{tag}>\r\n")

    def writeTerminal(self, tag, value):
        # write terminal (token) XML element
        # escape special characters
        if value == "<":
            value = "&lt;"
        elif value == ">":
            value = "&gt;"
        elif value == '"':
            value = "&quot;"
        elif value == "&":
            value = "&amp;"
        self.output.write(" " * self.indent + f"<{tag}> {value} </{tag}>\r\n")

    def writeCurrentToken(self):
        # write current token as XML
        tokenType = self.tokenizer.getTokenType()
        if tokenType == "KEYWORD":
            self.writeTerminal("keyword", self.tokenizer.keyword())
        elif tokenType == "SYMBOL":
            self.writeTerminal("symbol", self.tokenizer.symbol())
        elif tokenType == "IDENTIFIER":
            self.writeTerminal("identifier", self.tokenizer.identifier())
        elif tokenType == "INT_CONST":
            self.writeTerminal("integerConstant", str(self.tokenizer.intVal()))
        elif tokenType == "STRING_CONST":
            self.writeTerminal("stringConstant", self.tokenizer.stringVal())

    def compileClass(self):
        # compile complete class
        self.writeOpenTag("class")
        # class keyword
        self.tokenizer.advance()
        self.writeCurrentToken()
        # class name
        self.tokenizer.advance()
        self.writeCurrentToken()
        # opening brace
        self.tokenizer.advance()
        self.writeCurrentToken()
        # class var declarations
        self.tokenizer.advance()
        while self.tokenizer.keyword() in ["static", "field"]:
            self.compileClassVarDec()
            self.tokenizer.advance()
        # subroutine declarations
        while self.tokenizer.keyword() in ["constructor", "function", "method"]:
            self.compileSubroutine()
            self.tokenizer.advance()
        # closing brace
        self.writeCurrentToken()
        self.writeCloseTag("class")

    def compileClassVarDec(self):
        # compile static or field declaration
        self.writeOpenTag("classVarDec")
        # static or field
        self.writeCurrentToken()
        # type
        self.tokenizer.advance()
        self.writeCurrentToken()
        # var name
        self.tokenizer.advance()
        self.writeCurrentToken()
        # additional var names
        self.tokenizer.advance()
        while self.tokenizer.symbol() == ",":
            # comma
            self.writeCurrentToken()
            # var name
            self.tokenizer.advance()
            self.writeCurrentToken()
            self.tokenizer.advance()
        # semicolon
        self.writeCurrentToken()
        self.writeCloseTag("classVarDec")
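
    # Illustrative fragment (comment only): for the declaration
    # `field int x, y;` compileClassVarDec writes nested XML of the form
    #
    #   <classVarDec>
    #    <keyword> field </keyword>
    #    <keyword> int </keyword>
    #    <identifier> x </identifier>
    #    <symbol> , </symbol>
    #    <identifier> y </identifier>
    #    <symbol> ; </symbol>
    #   </classVarDec>
    #
    # with one space of indentation per nesting level (see writeOpenTag).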

    def compileSubroutine(self):
        # compile method, function, or constructor
        self.writeOpenTag("subroutineDec")
        # constructor, function, or method
        self.writeCurrentToken()
        # return type
        self.tokenizer.advance()
        self.writeCurrentToken()
        # subroutine name
        self.tokenizer.advance()
        self.writeCurrentToken()
        # opening paren
        self.tokenizer.advance()
        self.writeCurrentToken()
        # parameter list
        self.tokenizer.advance()
        self.compileParameterList()
        # closing paren
        self.writeCurrentToken()
        # subroutine body
        self.tokenizer.advance()
        self.compileSubroutineBody()
        self.writeCloseTag("subroutineDec")

    def compileParameterList(self):
        # compile parameter list (possibly empty)
        self.writeOpenTag("parameterList")
        # check if empty
        if self.tokenizer.symbol() != ")":
            # type
            self.writeCurrentToken()
            # var name
            self.tokenizer.advance()
            self.writeCurrentToken()
            # additional parameters
            self.tokenizer.advance()
            while self.tokenizer.symbol() == ",":
                # comma
                self.writeCurrentToken()
                # type
                self.tokenizer.advance()
                self.writeCurrentToken()
                # var name
                self.tokenizer.advance()
                self.writeCurrentToken()
                self.tokenizer.advance()
        self.writeCloseTag("parameterList")

    def compileSubroutineBody(self):
        # compile subroutine body
        self.writeOpenTag("subroutineBody")
        # opening brace
        self.writeCurrentToken()
        # var declarations
        self.tokenizer.advance()
        while self.tokenizer.keyword() == "var":
            self.compileVarDec()
            self.tokenizer.advance()
        # statements
        self.compileStatements()
        # closing brace
        self.writeCurrentToken()
        self.writeCloseTag("subroutineBody")

    def compileVarDec(self):
        # compile var declaration
        self.writeOpenTag("varDec")
        # var keyword
        self.writeCurrentToken()
        # type
        self.tokenizer.advance()
        self.writeCurrentToken()
        # var name
        self.tokenizer.advance()
        self.writeCurrentToken()
        # additional var names
        self.tokenizer.advance()
        while self.tokenizer.symbol() == ",":
            # comma
            self.writeCurrentToken()
            # var name
            self.tokenizer.advance()
            self.writeCurrentToken()
            self.tokenizer.advance()
        # semicolon
        self.writeCurrentToken()
        self.writeCloseTag("varDec")

    def compileStatements(self):
        # compile sequence of statements
        self.writeOpenTag("statements")
        # process statements
        while (
            self.tokenizer.getTokenType() == "KEYWORD"
            and self.tokenizer.keyword() in ["let", "if", "while", "do", "return"]
        ):
            keyword = self.tokenizer.keyword()
            if keyword == "let":
                self.compileLet()
            elif keyword == "if":
                self.compileIf()
            elif keyword == "while":
                self.compileWhile()
            elif keyword == "do":
                self.compileDo()
            elif keyword == "return":
                self.compileReturn()
        self.writeCloseTag("statements")

    def compileLet(self):
        # compile let statement
        self.writeOpenTag("letStatement")
        # let keyword
        self.writeCurrentToken()
        # var name
        self.tokenizer.advance()
        self.writeCurrentToken()
        # check for array indexing
        self.tokenizer.advance()
        if self.tokenizer.symbol() == "[":
            # opening bracket
            self.writeCurrentToken()
            # expression
            self.tokenizer.advance()
            self.compileExpression()
            # closing bracket
            self.writeCurrentToken()
            self.tokenizer.advance()
        # equals sign
        self.writeCurrentToken()
        # expression
        self.tokenizer.advance()
        self.compileExpression()
        # semicolon
        self.writeCurrentToken()
        self.writeCloseTag("letStatement")
        self.tokenizer.advance()

    def compileIf(self):
        # compile if statement
        self.writeOpenTag("ifStatement")
        # if keyword
        self.writeCurrentToken()
        # opening paren
        self.tokenizer.advance()
        self.writeCurrentToken()
        # expression
        self.tokenizer.advance()
        self.compileExpression()
        # closing paren
        self.writeCurrentToken()
        # opening brace
        self.tokenizer.advance()
        self.writeCurrentToken()
        # statements
        self.tokenizer.advance()
        self.compileStatements()
        # closing brace
        self.writeCurrentToken()
        # check for else
        self.tokenizer.advance()
        if self.tokenizer.keyword() == "else":
            # else keyword
            self.writeCurrentToken()
            # opening brace
            self.tokenizer.advance()
            self.writeCurrentToken()
            # statements
            self.tokenizer.advance()
            self.compileStatements()
            # closing brace
            self.writeCurrentToken()
            self.tokenizer.advance()
        self.writeCloseTag("ifStatement")
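
    # Illustrative fragment (comment only): the statement `let x = 5;`
    # compiles to the nested parse-tree XML
    #
    #   <letStatement>
    #    <keyword> let </keyword>
    #    <identifier> x </identifier>
    #    <symbol> = </symbol>
    #    <expression>
    #     <term>
    #      <integerConstant> 5 </integerConstant>
    #     </term>
    #    </expression>
    #    <symbol> ; </symbol>
    #   </letStatement>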

    def compileWhile(self):
        # compile while statement
        self.writeOpenTag("whileStatement")
        # while keyword
        self.writeCurrentToken()
        # opening paren
        self.tokenizer.advance()
        self.writeCurrentToken()
        # expression
        self.tokenizer.advance()
        self.compileExpression()
        # closing paren
        self.writeCurrentToken()
        # opening brace
        self.tokenizer.advance()
        self.writeCurrentToken()
        # statements
        self.tokenizer.advance()
        self.compileStatements()
        # closing brace
        self.writeCurrentToken()
        self.writeCloseTag("whileStatement")
        self.tokenizer.advance()

    def compileDo(self):
        # compile do statement
        self.writeOpenTag("doStatement")
        # do keyword
        self.writeCurrentToken()
        # subroutine call (identifier)
        self.tokenizer.advance()
        self.writeCurrentToken()
        # check for class/var name or direct call
        self.tokenizer.advance()
        if self.tokenizer.symbol() == ".":
            # class or object method call
            # dot
            self.writeCurrentToken()
            # method name
            self.tokenizer.advance()
            self.writeCurrentToken()
            self.tokenizer.advance()
        # opening paren
        self.writeCurrentToken()
        # expression list
        self.tokenizer.advance()
        self.compileExpressionList()
        # closing paren
        self.writeCurrentToken()
        # semicolon
        self.tokenizer.advance()
        self.writeCurrentToken()
        self.writeCloseTag("doStatement")
        self.tokenizer.advance()

    def compileReturn(self):
        # compile return statement
        self.writeOpenTag("returnStatement")
        # return keyword
        self.writeCurrentToken()
        # check for return value
        self.tokenizer.advance()
        if self.tokenizer.symbol() != ";":
            # expression
            self.compileExpression()
        # semicolon
        self.writeCurrentToken()
        self.writeCloseTag("returnStatement")
        self.tokenizer.advance()

    def compileExpression(self):
        # compile expression
        self.writeOpenTag("expression")
        # term
        self.compileTerm()
        # check for op term
        ops = {"+", "-", "*", "/", "&", "|", "<", ">", "="}
        while self.tokenizer.symbol() in ops:
            # operator
            self.writeCurrentToken()
            # term
            self.tokenizer.advance()
            self.compileTerm()
        self.writeCloseTag("expression")

    def compileTerm(self):
        # compile term
        self.writeOpenTag("term")
        tokenType = self.tokenizer.getTokenType()
        if tokenType == "INT_CONST":
            # integer constant
            self.writeCurrentToken()
            self.tokenizer.advance()
        elif tokenType == "STRING_CONST":
            # string constant
            self.writeCurrentToken()
            self.tokenizer.advance()
        elif tokenType == "KEYWORD":
            # keyword constant (true, false, null, this)
            self.writeCurrentToken()
            self.tokenizer.advance()
        elif self.tokenizer.symbol() == "(":
            # opening paren
            self.writeCurrentToken()
            # expression
            self.tokenizer.advance()
            self.compileExpression()
            # closing paren
            self.writeCurrentToken()
            self.tokenizer.advance()
        elif self.tokenizer.symbol() in ["-", "~"]:
            # unary operator
            self.writeCurrentToken()
            # term
            self.tokenizer.advance()
            self.compileTerm()
        elif tokenType == "IDENTIFIER":
            # var name, array access, or subroutine call
            self.writeCurrentToken()
            self.tokenizer.advance()
            if self.tokenizer.symbol() == "[":
                # array access
                # opening bracket
                self.writeCurrentToken()
                # expression
                self.tokenizer.advance()
                self.compileExpression()
                # closing bracket
                self.writeCurrentToken()
                self.tokenizer.advance()
            elif self.tokenizer.symbol() == "(":
                # subroutine call
                # opening paren
                self.writeCurrentToken()
                # expression list
                self.tokenizer.advance()
                self.compileExpressionList()
                # closing paren
                self.writeCurrentToken()
                self.tokenizer.advance()
            elif self.tokenizer.symbol() == ".":
                # method call
                # dot
                self.writeCurrentToken()
                # method name
                self.tokenizer.advance()
                self.writeCurrentToken()
                # opening paren
                self.tokenizer.advance()
                self.writeCurrentToken()
                # expression list
                self.tokenizer.advance()
                self.compileExpressionList()
                # closing paren
                self.writeCurrentToken()
                self.tokenizer.advance()
        self.writeCloseTag("term")

    def compileExpressionList(self):
        # compile expression list (possibly empty)
        self.writeOpenTag("expressionList")
        # check if empty
        if self.tokenizer.symbol() != ")":
            # expression
            self.compileExpression()
            # additional expressions
            while self.tokenizer.symbol() == ",":
                # comma
                self.writeCurrentToken()
                # expression
                self.tokenizer.advance()
                self.compileExpression()
        self.writeCloseTag("expressionList")

    def close(self):
        # close output file
        self.output.close()


def analyzeFile(jackFile, outputFile, tokenizeOnly=False):
    # analyze single Jack file
    tokenizer = JackTokenizer(jackFile)

    if tokenizeOnly:
        # tokenizer test output
        output = open(outputFile, "w")
        output.write("<tokens>\r\n")
        while tokenizer.hasMoreTokens():
            # advance() returns False when only comments/whitespace remain
            if not tokenizer.advance():
                break
            tokenType = tokenizer.getTokenType()
            if tokenType == "KEYWORD":
                value = tokenizer.keyword()
                output.write(f"<keyword> {value} </keyword>\r\n")
            elif tokenType == "SYMBOL":
                value = tokenizer.symbol()
                # escape special characters
                if value == "<":
                    value = "&lt;"
                elif value == ">":
                    value = "&gt;"
                elif value == '"':
                    value = "&quot;"
                elif value == "&":
                    value = "&amp;"
                output.write(f"<symbol> {value} </symbol>\r\n")
            elif tokenType == "IDENTIFIER":
                value = tokenizer.identifier()
                output.write(f"<identifier> {value} </identifier>\r\n")
            elif tokenType == "INT_CONST":
                value = tokenizer.intVal()
                output.write(f"<integerConstant> {value} </integerConstant>\r\n")
            elif tokenType == "STRING_CONST":
                value = tokenizer.stringVal()
                output.write(f"<stringConstant> {value} </stringConstant>\r\n")
        output.write("</tokens>\r\n")
        output.close()
    else:
        # full compilation
        engine = CompilationEngine(tokenizer, outputFile)
        engine.compileClass()
        engine.close()


def main():
    # analyze Jack file or directory
    if len(sys.argv) != 2:
        print("Usage: python hjc.py <file.jack | directory>")
        sys.exit(1)

    inputPath = sys.argv[1]
    if not os.path.exists(inputPath):
        print(f"Error: Path '{inputPath}' not found")
        sys.exit(1)

    if os.path.isfile(inputPath):
        # single file mode
        if not inputPath.endswith(".jack"):
            print("Error: Input file must have .jack extension")
            sys.exit(1)
        # Generate tokenizer output
        tokenizerFile = inputPath[:-5] + "T.xml"
        analyzeFile(inputPath, tokenizerFile, True)
        # Generate parser output
        parserFile = inputPath[:-5] + ".xml"
        analyzeFile(inputPath, parserFile, False)
        print(
            f"Analyzed '{inputPath}' - generated '{tokenizerFile}' and '{parserFile}'"
        )
    elif os.path.isdir(inputPath):
        # directory mode
        jackFiles = [f for f in os.listdir(inputPath) if f.endswith(".jack")]
        if not jackFiles:
            print(f"Error: No .jack files found in directory '{inputPath}'")
            sys.exit(1)
        for jackFile in jackFiles:
            inputFile = os.path.join(inputPath, jackFile)
            # Generate tokenizer output
            tokenizerFile = os.path.join(inputPath, jackFile[:-5] + "T.xml")
            analyzeFile(inputFile, tokenizerFile, True)
            # Generate parser output
            parserFile = os.path.join(inputPath, jackFile[:-5] + ".xml")
            analyzeFile(inputFile, parserFile, False)
        print(f"Analyzed {len(jackFiles)} files in '{inputPath}'")
    else:
        print(f"Error: '{inputPath}' is neither file nor directory")
        sys.exit(1)


if __name__ == "__main__":
    main()
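
# Example invocation (file and directory names are illustrative):
#   python hjc.py Main.jack     -> writes MainT.xml (tokens) and Main.xml (parse tree)
#   python hjc.py MyProject/    -> does the same for every .jack file in MyProject/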