view assembler/assembler.py @ 25:45340c2a38c5

tidier and less buggier
author james <jb302@eecs.qmul.ac.uk>
date Fri, 28 Feb 2014 17:21:11 +0000
parents 2efb577ac2d7
children 84716cd835dd
line wrap: on
line source
#!/usr/bin/env python2
# assembler.py
import struct
import sys
from language import *

# take source file and return preprocessed assembly code
# for each non-empty line in the file:
#   remove comments from source
#   replace equated strings
#   store label definitions and remove label from source
#   store new equates
#   make hashable format symbol from arguments
#   identify and save constant data
#   save instruction, arguments, symbol and data to list
# also prepares org and db instructions for second_pass()
def first_pass(f):
    """Preprocess an assembly source file for second_pass().

    Parameters:
        f: seekable file object open for reading; it is rewound with
           f.seek(0) before being read.

    Returns:
        A tuple (asm, labels).
        asm is a list of [mnemonic, args, symbol, const] entries in
        source order; 'org' and 'db' entries carry the placeholder
        symbol ('',).
        labels maps each label name to the value of the location
        counter at the point where the label was defined.

    Raises:
        Any exception raised while handling a line is re-raised after
        the offending line (whitespace and comment stripped) has been
        printed.
    """
    asm = []
    labels = {}
    equates = {}
    pc = 0

    f.seek(0)

    # <line> ::= [<statement>] [";"<comment>] <EOL>
    for line in f:
        try:
            # drop surrounding whitespace and everything from the
            # first ';' onwards
            line = line.strip().split(';', 1)[0]

            # <statement> ::= [ <label> ":"] <mnemonic> [<arguments>]
            #                 | <label> ":"
            #                 | "EOF"
            statement = line.split()
            if not statement:
                continue

            # substitution happens during the first pass, so equates
            # and labels must be defined before they are used
            _substitute_symbols(statement, equates, labels)

            head = statement[0]
            if head.lower() == 'org':
                # the entry is recorded before the address is parsed
                asm.append(['org', statement[1:], ('',), ''])
                pc = stoi(statement[1])
                continue
            if head[-1] == ':':
                # remember where the label points, then drop it
                labels[head[:-1]] = pc
                del statement[0]

            # store equates; their names are case sensitive
            if len(statement) >= 3 and statement[1].lower() == 'equ':
                equates[statement[0]] = ' '.join(statement[2:])
                continue

            # the line held nothing but a label
            if not statement:
                continue

            # <statement> ::= <mnemonic> [<arguments>]
            mne = statement[0].lower()
            args = ''.join(statement[1:]).split(',')

            if mne == 'db':
                const = _encode_db(mne, args)
                asm.append([mne, args, ('',), const])
                pc = pc + len(const)
                continue

            sym, const = tokenize(mne, args)
            asm.append([mne, args, sym, const])
            # the second field of the instruction set entry is the
            # instruction width
            pc = pc + iset[mne][sym][1]

        except:
            # one parenthesised expression: prints the same text as a
            # Python 2 print statement
            print(' ** first pass error **\nline:\n' + line)
            raise

    return asm, labels


def _substitute_symbols(statement, equates, labels):
    """Replace known equates and labels in statement[1:], in place.

    An id prefix character (one of ids), an opening '(' and a trailing
    ')' and/or ',' are set aside before the lookup and put back around
    the replacement. Equates take precedence over labels.
    """
    for i, tok in enumerate(statement[1:], 1):
        prefix = suffix = ''
        # prefixes
        if tok[0] in ids:
            prefix = prefix + tok[0]
            tok = tok[1:]
        # tok can be empty here when the token was a lone prefix
        if tok and tok[0] == '(':
            prefix = prefix + tok[0]
            tok = tok[1:]
        # suffixes
        if tok and tok[-1] == ',':
            suffix = suffix + tok[-1]
            tok = tok[:-1]
        if tok and tok[-1] == ')':
            suffix = tok[-1] + suffix
            tok = tok[:-1]
        # replace and put the removed characters back
        if tok in equates:
            statement[i] = prefix + equates[tok] + suffix
        elif tok in labels:
            statement[i] = prefix + str(labels[tok]) + suffix


def _encode_db(mne, args):
    """Return the constant data for a db statement as one string.

    Every argument is tokenized as an immediate ('#' + argument).
    Leading zero bytes are dropped from each value; a value that is
    exactly two zero bytes is stored as a single zero byte.
    """
    chunks = []
    for a in args:
        data = tokenize(mne, ['#' + a])[1]
        if data == '\x00\x00':
            # zero is the only number: keep one zero byte
            chunks.append('\x00')
        else:
            # strip leading zeros only; zero bytes that follow a
            # non-zero byte are part of the value
            chunks.append(data.lstrip('\x00'))
    return ''.join(chunks)

# take a preprocessed object asm and write machine code to binary file
# for each line of asm:
#   check if it's an org or db command deal with it accordingly
#   check if arguments are labels and replace with value
#   write instruction to file
def second_pass(f, asm, labels):
    """Write the machine code for a preprocessed program to f.

    Parameters:
        f: seekable file object open for writing.
        asm: list of [mnemonic, args, symbol, const] entries as built
             by first_pass(). The args lists are modified in place
             when label arguments are replaced by their addresses.
        labels: dict mapping label names to addresses.

    Returns:
        f, positioned after the last write.

    Raises:
        ValueError: if an instruction's constant is longer than the
            instruction width allows and the excess leading bytes are
            not all zero.
        Any exception raised while assembling an entry is re-raised
        after the entry has been printed.
    """
    pc = 0

    for line in asm:
        f.seek(pc)
        mne, args, sym, const = line

        try:
            # deal with org and db
            if mne == 'org':
                pc = stoi(args[0])
                continue
            if mne == 'db':
                f.write(const)
                pc = pc + len(const)
                continue

            # replace labels with addresses; enumerate keeps the index
            # aligned with sym even when an empty argument is skipped
            for i, a in enumerate(args):
                if not a:
                    continue
                if sym[i] not in ('label', '@label'):
                    continue
                if a[0] == '@' and a[1:] in labels:
                    # labeled pointer: keep the '@' in front of the address
                    args[i] = '@' + str(labels[a[1:]])
                elif mne in rinst:
                    # instructions in rinst take an address relative
                    # to the start of the current instruction
                    args[i] = str(labels[a] - pc)
                else:
                    args[i] = str(labels[a])
                const = const + tokenize(mne, [args[i]])[1]

            # assemble to file
            op, width = iset[mne][sym]
            if mne in ['pcall', 'pjmp']:
                # paged address: bits 8-10 of the target go into the
                # opcode, only the last constant byte is written
                op = op | ((stoi(args[0]) & 0x7FF) >> 8)
                const = const[-1]
            f.write(struct.pack('>B', op))

            # the opcode takes one byte, leaving width - 1 for the
            # constant; surplus leading bytes may only be dropped when
            # every one of them is zero
            of = len(const) - width + 1
            if of > 0:
                if const[:of].strip('\x00'):
                    raise ValueError(
                        'constant does not fit instruction width')
                const = const[of:]
            f.write(const)
            pc = pc + width

        except:
            # one parenthesised expression: prints the same text as a
            # Python 2 print statement
            print('** second pass error **\nline:\n' + str(line))
            raise

    return f
    
if __name__ == '__main__':
    # usage: assembler.py <source> [<output>]
    # the output file defaults to 'a.out' when no second argument is given
    out_path = sys.argv[2] if len(sys.argv) > 2 else 'a.out'
    # 'with' closes both files even if one of the passes raises
    with open(sys.argv[1], 'r') as f:
        with open(out_path, 'wb') as b:
            asm, labels = first_pass(f)
            second_pass(b, asm, labels)