.ifdef 0

  This file is an attempt to port a byte-code forth system from 
  the x86 architecture to the atmega328 chip (usually running on some
  kind of arduino system). The x86 system is at 
     bumble.sf.net/books/osdev/os.asm

  The system uses ijmp to call the opcodes and caches the 
  top-of-stack in r25:r24

 CHALLENGES

   Dealing with the Harvard architecture of the avr chip is going
   to be tricky. However Flashforth contains solutions to this.

   The arduino contains a bootloader which is not necessary for 
   a forth system. So I need to learn how to burn the forth system
   into the arduino board (and override the bootloader). 
   
   Also, I need to learn how to use the SPM instruction
   to actually write to flash (program) memory. This may involve using
   a ram buffer, because some flash memory can only be written one page
   at a time. 

   Flash memory should not be written to overly frequently, so it
   may be better not to recompile code that is already on the atmega/avr
   chip. So create could check for a word of the same name and abort
   if it already exists in the dictionary.


  COMPILE UPLOAD AND RUN

    Use some aliases, bash functions to aid development.

   * compile whole file and upload to the arduino 'uno' board
   >> map ,V :! avra %; sudo avrdude -p atmega328p -carduino -P/dev/ttyACM0 -b115200 -D -Uflash:w:%:r.hex:i

   * compile and upload to the arduino 'uno' board
   >> avra os.avr.asm; sudo avrdude -p atmega328p -carduino -P/dev/ttyACM0 -b115200 -D -Uflash:w:os.avr.hex:i

   * compile and upload to the arduino 'duemilenove' board
   >> avra os.avr.asm; sudo avrdude -p atmega328p -carduino -P/dev/ttyUSB0 -b57600 -D -Uflash:w:os.avr.hex:i

   * start a serial terminal to communicate with the arduino
   >> sudo picocom -b 9600 --echo /dev/ttyACM0

   * a bash function to compile and upload to uno board
   -----
   aos() {
     # cat os.avr.asm | sed -f os.sed > os.avr.pre.asm 
     avra os.avr.asm
     sudo avrdude -p atmega328p -carduino -P/dev/ttyACM0 \
        -b115200 -D -Uflash:w:test.hex:i
   }
   ,,,

  HISTORY
  

  * 25 july 2018
     Ported more opcodes from x86. Seems fairly straight forward.
     Maybe tosh and tosl should be kept in Z which would make 
     @ fetch and ! easier and faster. No! because the exec routine
     need zh:zl to get opcodes.
     Also, the idea of an opcode which pushes the memory map
     onto the stack (eg eeprom, rom, ram). In a von-neumann 
     machine the rom and ram will be the same but in a harvard
     architecture they will not.
     Had the idea of programming the bytecode onto the arduino
     using key and ! or c,
  
  * 24 july 2018
     continuing to port opcodes from the x86 version. Grappling
     with a .db problem in the code: section. When I write two
     .db sections on separate lines, the assembler (avra) seems
     to put a zero at the end of the first .db line. Added some
     vim macros to compile and upload whole file to arduino uno.
  * 22 july 2018
     Need to move the exec routine into a word. Then deal with
     jumps. Then recode all of the x86 system in avr assembler.
     Got the offset working but not propagating the carry to pch
     (program counter high byte).
     struggling with jump to get the offset correct in the byte code
     section. Separated this into its own file to 
     continue development there. This is a direct parallel to
     bumble.sf.net/books/osdev/os.asm  (which is x86 code)
  * 20 july 2018
     The opcode retrieval and indirect procedure call appear to
     be working. These are the essential elements for a bytecode 
     system. using LPM and ICALL. Will convert to IJMP so that the 
     data stack is easier to use
  * 19 July 2018
     started writing a bytecode system. 
     revisiting this to try to create a forth style bytecode
     system using avr assembly. I will use ideas from the x86 
     forth bytecode system at bumble.sf.net/books/osdev/os.asm
  * 8 November 2016
     I combined the arduino book with the avr book, since my 
     main focus now with the arduino will be programming in
     assembler. First I need to learn the avr architecture.

 .endif

  .include "m328Pdef.inc"

    .def rph = r1  ; return stack pointer, high byte
    .def rpl = r0  ; return stack pointer, low byte
    .def pch = r23  ; (program counter high byte)
    .def pcl = r22
    .def tosh = r25 ; (top-of-stack high byte)
    .def tosl = r24 

    ; also here define the datastack point and the 
    ; return stack point
    ; eg, This allows recursive
     .equ DSTACKH = high(ramend)
     .equ DSTACKL = low(ramend)-128
     .equ RSTACKH = high(ramend)
     .equ RSTACKL = low(ramend)

    ; opcode constants
    .equ DUP = 1 
    .equ DROP = DUP+1
    .equ SWAP = DROP+1
    .equ OVER = SWAP+1
    .equ TWODUP = OVER+1
    .equ FLAGS = TWODUP+1 
    .equ DEPTH = FLAGS+1
    .equ RDEPTH = DEPTH+1 
    .equ RON = RDEPTH+1 
    .equ ROFF = RON+1 
    .equ STORE = ROFF+1
    .equ STOREPLUS = STORE+1
    .equ FETCH = STOREPLUS+1
    .equ FETCHPLUS = FETCH+1
    .equ CSTORE = FETCHPLUS+1
    .equ CSTOREPLUS = CSTORE+1
    .equ CFETCH = CSTOREPLUS+1
    .equ CFETCHPLUS = CFETCH+1
    .equ COUNT = CFETCHPLUS    ; count is an alias for c@+
    .equ EQUALS = CFETCHPLUS+1
   ;  NOTEQUALS equ EQUALS+1
   ;  LESSTHAN equ NOTEQUALS+1
   ;  ULESSTHAN equ LESSTHAN+1
   ;  LIT equ ULESSTHAN+1
   ;  LITW equ LIT+1
   ;  EMIT equ LITW+1
   ;  KEY equ EMIT+1
    .equ EMIT = EQUALS+1 
    .equ KEY = EMIT+1 
   ;  EKEY equ KEY+1
    .equ JUMP = KEY+1 
    .equ STAR = JUMP+1 
    .equ DIGIT = STAR+1 

    ; domain name constants
    .equ LANG = 1 

   .dseg
   ; ram data segment 
   .cseg
   ; start of flash "rom" code segment

   .org 0x0000

   jmp wakeup

   ; a table of code pointers. The pointers have the 
   ; same offset in the table as the value of the opcode
   optable:
     .dw 0, dupx, dropx, swapx, overx, twodupx
     .dw flagsx, depthx, rdepthx, ronx, roffx
     .dw storex, storeplusx, fetchx, fetchplusx 
     .dw cstorex, cstoreplusx, cfetchx, cfetchplusx 
     .dw equalsx
     ;.dw notequals.x, 
     ;dw lessthan.x, ulessthan.x, lit.x, litw.x
     .dw emitx, keyx, jumpx, starx, digitx
     ;dw emit.x, key.x, ekey.x, getxy.x, atxy.x
     ;dw plus.x, minus.x, incr.x, decr.x, neg.x
     ;dw dplus.x
     ;dw logor.x, logxor.x, logand.x
     ;dw divmod.x, timestwo.x, mult.x, umult.x
     ;dw fcall.x, pcall.x, exit.x
     ;dw ljump.x, jump.x, jumpz.x, jumpnz.x, rloop.x
     ;dw divtwo.x
     ;dw read.x, write.x
     ;dw fg.x, bg.x, cls.x, vid.x, pix.x
     ;dw glyph.x
     ;dw rtc.x, clock.x
     ; could put just-in-time opcodes here.
     ;dw noop.x, -1 


   ; 
   execx:
     jmp nextopcode 

   dupdoc:
     ; db 'Duplicates the top item on the stack.'
     ; dw $-dup.doc
   duph:
     .db LANG         ; domain name prefix for universal naming 
     .dw 0            ; link to previous word 
     .db "dup", 3     ; strings are 'counted' 
   dupx:
     push tosh      ; high byte
     push tosl      ; low byte
     jmp nextopcode 

   dropdoc:
     ; db 'removes the top item on the stack.'
     ; dw $-drop.doc
   droph:
     .db 0           ; domain name prefix for universal naming 
     .dw dupx        ; link to previous word 
     .db "drop", 4   ; name with reverse count 
   dropx:
     pop tosl
     pop tosh
     jmp nextopcode

   swapdoc:
     ; db 'swaps the top 2 items on the stack.'
     ; dw $-swap.doc
   swaph:
     .dw dropx         ; link to previous word 
     .db "swap", 4
   swapx:
     movw r17:r16, tosh:tosl 
     pop tosl
     pop tosh
     push r17
     push r16
     jmp nextopcode

   overdoc:
     ; db ' ( n1 n2 -- n1 n2 n1 ) '
     ; db ' Puts a copy of 2nd stack item on top of stack. '
     ; db ' dont use this, will probably remove. '
     ; dw $-over.doc
   overh:
     .db LANG         ; domain name prefix for universal naming 
     .dw swapx        ; back-link 
     .db "over", 4
   overx:
     pop r16 
     pop r17 
     push r17 
     push r16 
     push tosh 
     push tosl 
     movw tosh:tosl, r17:r16
     jmp nextopcode

   twodupdoc:
     ; db ' ( n1 n2 -- n1 n2 n1 n2 ) '
     ; db ' copies 2 stack items onto stack '
     ; dw $-twodup.doc
   twoduph: 
     .db LANG        ; domain name prefix for universal naming 
     .dw overx       
     .db "2dup", 4
   twodupx:
     pop r16 
     pop r17 
     push r17 
     push r16 
     push tosh 
     push tosl 
     push r17 
     push r16 
     jmp nextopcode
 
   flagsdoc:
     ; db ' ( -- n ) '
     ; db ' push flag register onto the stack  '
     ; db ' execution flags such as carry overflow negate etc '
     ; db ' are pushed onto the stack '
     ; dw $-flags.doc
   flagsh: 
     .db LANG        ; domain name prefix for universal naming 
     .dw twodupx  
     .db "flags", 5
   flagsx:
     ;push dx   ; save NOS on stack
     ;pushf
     ;pop dx    ; get flags into TOS (dx)
     jmp nextopcode
 
   depthdoc:
     ; db ' ( -- n ) '
     ; db ' Puts on the stack the number of stack items '
     ; db ' before this word was executed '
     ; dw $-depth.doc
   depthh: 
     .db LANG        ; domain name prefix for universal naming 
     .dw flagsx   
     .db "depth", 5
   depthx:
     ; untested code
     push tosh       ; high byte
     push tosl       ; low byte (character)
     ldi r17, high(ramend)  
     in r19, sph
     ldi r16, low(ramend)    
     in r18, spl
     sub r16, r18   ; r17:r16 - r19:r18
     sbc r17, r19 
     ; need to divide by 2 but how??
     ;lsr r17        ; div r17:r16 by 2 
     ;ror r16
     movw tosh:tosl, r17:r16
     jmp nextopcode
 
   rdepthdoc:
     ; db ' ( -- n ) '
     ; db ' Puts on the stack the number of stack items '
     ; db ' on the return stack before this word '
     ; db ' was executed '
     ; dw $-rdepth.doc
   rdepthh: 
     .db LANG        ; domain name prefix for universal naming 
     .dw depthx  
     .db "rdepth", 6
   rdepthx:
     ;push dx
     ;mov dx, di 
     ;shl dx, 1

     ; where to implement the return stack?
     ; above the data stack?
     jmp nextopcode
 
   rondoc:
     ; db '( S: n -- )( R: -- n ) '
     ; db ' put the top item of the data stack onto the return stack.'
     ; dw $-ron.doc
   ronh: 
     .db LANG        ; domain name prefix 
     .dw rdepthx      
     .db ">r", 2
   ronx:
     ;mov ax, dx     ; value to store at address (tos=dx)
     ;stosw          ; [es:di] := ax, di+2
     ;pop dx
     jmp nextopcode
 
   roffdoc:
     ; db '( S: -- n)( R: n -- ) '
     ; db ' put the top item of the return stack onto the data stack.'
     ; dw $-roff.doc
   roffh: 
     .db LANG        ; domain name prefix 
     .dw ronx      
     .db "r>", 2
   roffx:
     ;push dx         ; push new nos on stack
     ;sub di, 2       ; 
     ;mov dx, [es:di] ; get top item off return stack
     jmp nextopcode
 
   ; this needs to be "vectored" for eeprom/rom/ram memory types
   storedoc:
     ; db '( w adr -- ) '
     ; db ' place 2 byte value w at address "adr" '
     ; dw $-store.doc
   storeh: 
     .db LANG        ; domain name prefix 
     .dw roffx       
     .db "!", 1
   storex:
     movw xh:xl, tosh:tosl    ; pointer to address
     pop r16
     pop r17         ; value to store at address
     st X+, r16 
     st X+, r17      ; 2 bytes are stored
     pop tosl           
     pop tosh          
     jmp nextopcode
 
   storeplusdoc:
     ; db '( w adr -- adr+2 ) '
     ; dw $-storeplus.doc
   storeplush: 
     .db LANG        ; domain name prefix 
     .dw storex      ; link to previous word 
     .db "!+", 2
   storeplusx:
     movw xh:xl, tosh:tosl    ; pointer to address
     pop r16
     pop r17         ; value to store at address
     st X+, r16 
     st X+, r17      ; 2 bytes are stored
     movw tosh:tosl, xh:xl 
     jmp nextopcode
 
   ; needs to be vectored for different memory types
   fetchdoc:
     ; .db '( adr -- n ) '
     ; .db ' Replace the top element of the stack with the '
     ; .db ' value of the 16bites at the given memory address '
     ; .dw $-fetch.doc
   fetchh: 
     .db LANG           ; domain name prefix 
     .dw storeplusx     ; link to previous word 
     .db "@", 1
   fetchx:
     movw xh:xl, tosh:tosl    ; 
     ld tosl, X+
     ld tosh, X+
     jmp nextopcode
 
   fetchplusdoc:
     ; db '( adr -- adr+2 n ) '
     ; db ' Replace the top element of the stack with the '
     ; db ' value of the 16bites at the given memory address '
     ; db ' and increment the address by 2 bytes. '
     ; dw $-fetchplus.doc
   fetchplush: 
     .db LANG        ; domain name prefix 
     .dw fetchx      ; link to previous word 
     .db "@+", 2
   fetchplusx:
     movw xh:xl, tosh:tosl    ; 
     ld tosl, X+
     ld tosh, X+
     push xh        
     push xl        
     jmp nextopcode
 
   cstoredoc:
     ; db '( n adr -- ) store the byte value n at address adr.'
     ; dw $-cstore.doc
   cstoreh: 
     .db LANG            ; domain name prefix 
     .dw fetchplusx     ; link to previous word 
     .db "c!", 2
   cstorex:
     movw xh:xl, tosh:tosl    ; pointer to address
     pop r16
     pop r17         ; value to store at address r16
     st X+, r16      ; only lower byte is stored
     pop tosl           
     pop tosh          
     jmp nextopcode
 
   cstoreplusdoc:
     ; db '( n adr -- adr+1 ) store the byte value n at address adr.'
     ; db ' And increment the address '
     ; dw $-cstoreplus.doc
   cstoreplush:  
     .db LANG            ; domain name prefix 
     .dw cstorex         ; link to previous word 
     .db "c!+", 3
   cstoreplusx:
     movw xh:xl, tosh:tosl    ; pointer to address
     pop r16         ; value to store at adr
     pop r17         ; 
     st X+, r16      ; only lower byte stored
     movw tosh:tosl, xh:xl 
     jmp nextopcode
 
   cfetchdoc:
     ; db '( adr -- n ) Replace the top element of the stack with the value '
     ; db ' of the byte at the given memory address.'
     ; db ' eg: myvar @ . '
     ; db '  displays the value at the address given by "myvar" '
     ; dw $-cfetch.doc
   cfetchh: 
     .db LANG              ; domain name prefix 
     .dw cstoreplusx      ; link to previous word 
     .db "c@", 2
   cfetchx:
     movw xh:xl, tosh:tosl    ; 
     ; need to sign extend here!!! 
     ; Most efficient way, I think:
     ;signextend:
     ;ldi   r25, 127       ; can be hoisted out of loops, and any reg is fine.
     ;cp    r25, r24        ; C = (r24 < 0)
     ;sbc   r25, r25        ; r25 = (r24 < 0) ? -1 : 0
     ; result in r25:r24

     ld tosl, X
     ldi tosh, 0 
     jmp nextopcode
 
   ; need to sign extend !!
   cfetchplusdoc:
     ; db '( adr -- adr+1 n ) '
     ; db ' Replace the top element of the stack with the value '
     ; db ' of the byte at the given memory address and increment the '
     ; db ' address . This is exactly the same as "count"'
     ; dw $-fetchplus.doc
   cfetchplush: 
     .db LANG            ; domain name prefix 
     .dw cfetchx         ; link to previous word 
     .db "c@+", 3
   cfetchplusx:
     movw xh:xl, tosh:tosl    ; 
     ld tosl, X+
     ; need to sign extend here!!!
     ldi tosh, 0 
     push xh        
     push xl        
     jmp nextopcode
 
   equalsdoc:
     ; db ' ( n1 n2 -- flag ) '
     ; db 'Puts -1 (true) on the stack if n1==n2 '
     ; db 'otherwise puts zero (false) on the stack. '
     ; dw $-equals.doc
   equalsh: 
     .db LANG             ; domain name prefix 
     .dw cfetchplusx     ; link to previous word 
     .db "=", 1     
   equalsx:
     pop r16
     pop r17        ; NOS high byte 
     cp r16, tosl   ; 
     brne notequal
     cp r17, tosh   ; 
     brne notequal
     ldi tosh, 0
     ldi tosl, 1
     jmp nextopcode
   notequal:
     ldi tosh, 0
     ldi tosl, 0
     jmp nextopcode
 
   emith:
     .dw dupx       ; the classic dictionary backlink
     .db "emit", 4
   emitx:
    waitagain:
     lds r16, UCSR0A       ; usart control, status register
     sbrs r16, UDRE0       ; is UDR empty?
     rjmp waitagain        ; if not, then just wait 
     sts UDR0, tosl        ; if empty tx character in TOS (r25:r24)k
     pop tosl              ; low byte of TOS 
     pop tosh              ; high byte of TOS (should be zero)
     jmp nextopcode 

   keyh:
    .dw emitx 
    .db "key", 3
   keyx:

   wait:
     push tosh       ; high byte
     push tosl       ; low byte (character)
     lds r16, UCSR0A    ; get usart status info
     sbrs r16, RXC0     ; has a byte been received?
     rjmp wait          ; if not just wait for one.
     ldi tosh, 0         ; set high byte of TOS to 0
     lds tosl, udr0      ; get key char into low byte of TOS 
     jmp nextopcode 

   jumph:
   jumpx:
     movw zh:zl, pch:pcl    ; get ip 
     lpm r16, z             ; get jump
     ; add or subtract jump offset from PC program counter  

     subi pcl, 1
     sbci pch, 0      ; adjust offset to JUMP instruction 
     tst r16
     brpl plus 
     ; there is a bug!! here. We need to propagate the carry
     ; to the high byte of the PC program counter
     neg r16
     sub pcl, r16
     sbci pch, 0
     jmp nextopcode
   plus:
     ldi r17, 0
     add pcl, r16
     adc pch, r17     ; add with carry
     jmp nextopcode 

   starh:
    .dw keyx
    .db "star", 4
   starx:
    waithere:
     lds r16, UCSR0A       ; usart control, status register
     sbrs r16, UDRE0       ; is UDR empty?
     rjmp waithere         ; if not, then just wait 
     ldi r17, '*'    
     sts UDR0, r17         ; if empty tx next character
     jmp nextopcode 
     
   ; a primitive method of debugging 
   ; prints one digit from the top stack item
   digith:
    .dw starx
    .db ".digit", 6
   digitx:
   repeat:
     lds r16, UCSR0A    ; usart control, status register
     sbrs r16, UDRE0    ; is UDR empty?
     rjmp repeat        ; if not, then just wait 
     movw r17:r16, tosh:tosl      
     ;ldi tosl, 5
     adiw tosl, '0'     ; convert opcode to asci digit
     sts UDR0, tosl     ; if empty tx next character
     movw tosh:tosl, r17:r16
     jmp nextopcode 

   serialx:
     ldi r16,(1<<txen0)|(1<<rxen0)     ; enable transmit receive 
     sts ucsr0b, r16       
     ldi r16,(1<<ucsz01)|(1<<ucsz00)
     sts ucsr0c, r16      
     ldi r16,0x66    ; the baud rate, 9600 on 16 megaherz chip
     sts ubrr0l,r16
     ldi r16,0x00    ; the baud rate, 9600
     sts ubrr0h,r16
     ret

   code:
     ; some bytecode to test the system
     .db STAR, KEY, KEY, EQUALS, DIGIT, JUMP, -5, 0

   wakeup:
    ; initialise the stack at the top of ram
    ldi r21, high(ramend)  
    out sph, r21        
    ldi r21, low(ramend)    
    out spl, r21        
   
    call serialx    ; set up serial interface

    ; r25:r24 holds top of stack
    ; r23:r22 is the PC program counter
    ; put all this in an exec function.
    ; lpm=load program memory, must use Z register
    ldi pch, high(code<<1) 
    ldi pcl, low(code<<1)

   nextopcode:
    ; retrieve and save PC program counter to r23:r22
    movw zh:zl, pch:pcl     ; get ip 
    lpm r20, z+
    movw pch:pcl, zh:zl     ; save ip 

   ; call digitx

   checkzero:
    ; halt if opcode is zero
    cpi r20, 0 
    breq goodbye 

    ldi zh, high(optable<<1) 
    ldi zl, low(optable<<1)
    add zl, r20    ; opcode number (need to double?)
    add zl, r20    ; 
    lpm r16, z+
    lpm r17, z+
    movw zh:zl, r17:r16     ;  
    ijmp

    ; never executed I think
    ; jmp nextopcode

   goodbye:

    lds r16, UCSR0A     ; usart control, status register
    sbrs r16, UDRE0     ; is UDR empty?
    rjmp goodbye        ; if not, then just wait 
    ldi r20, '+'
    sts UDR0, r20       ; if empty tx character

   here: rjmp here