4,887,235
	493	494
	 (assign byte-r (- b-temp bb-s-bitpos))
	 (store-word (ldb bb-s-word byte-s byte-r memory-data)))
       (assign bb-s-word (rotate bb-s-word byte-r))
       (assign bb-s-bitpos (- bb-s-bitpos b-temp))
       (parallel
	 (decr-d-offset)
	 (lisp (trace-path #/d))
	 (jump ubitblt-d-aligned-row-source-backwards)))
     (sequential			;s < d, need to fetch another word
       (parallel-with-s-access
	  bb-s-offset
	  (assign byte-r (- b-temp bb-s-bitpos))
	  (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	(decr-wrap-s-offset)
	(parallel-with-s-access
	  bb-s-offset
	  (assign a-temp (- b-temp bb-s-bitpos))
	  (assign byte-s (1- a-temp))
	(assign bb-s-word2 (logxor bb-constant memory-data)))
	(assign bb-s-word (ldb bb-s-word2 byte-s byte-r bb-s-word))
	(parallel-with-d-access
	  bb-d-offset
	  (assign byte-r (b-constant 0))
	  (assign byte-s (1- bb-d-bitpos))
	  (store-word (ldb bb-s-word byte-s byte-r memory-data)))
	(assign bb-width (- bb-width bb-d-bitpos))
	(assign bb-s-bitpos (32- a-temp))
	(assign byte-r a-temp)
	(assign bb-s-word (rotate bb-s-word2 byte-r))
	(parallel
	  (decr-d-offset)
	  (lisp (trace-path #/e))
	  (jump ubitblt-d-aligned-row-source-backwards))))))))

(defucode ubitblt-aligned-row-source-backwards	;8 cycles per word
  (parallel					;1
    (assign bb-width (- bb-width (a-constant 32.)))
    (trap-if (minus-fixnum obus) ubitblt-aligned-row-source-backwards-done))
  (decr-wrap-s-offset)				;1
  (parallel-with-s-access			;3
    bb-s-offset
    (assign bb-s-word (logxor bb-constant memory-data)))
  (assign-vma-offset d)				;1
  (store-word bb-s-word)			;1
  (parallel					;1
   (decr-d-offset)
   (lisp (trace-path #/,))
   (jump ubitblt-aligned-row-source-backwards)))

(defucode ubitblt-aligned-row-source-backwards-done
  (trap-no-save)
  (if (plus-fixnum bb-width)
      (sequential
       (decr-wrap-s-offset)
       (parallel-with-s-access
	 bb-s-offset
	 (assign byte-s (1- bb-width))
	 (assign byte-r bb-width)
	 (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
       (parallel-with-d-access
	 bb-d-offset
	 (assign byte-r (- (a-constant 32.) bb-width))
	 (parallel-with-return
	  (store-word (dpb bb-s-word byte-s byte-r memory-data))
	  (lisp (trace-path #/2)))))
    (parallel-with-return
     (lisp (trace-path #/1)))))

;;each time through the loop, bb-s-word has the low part of the previous word
;;rotated to be at the high end of the word. We use it as background to LDB the
;;high part of the next word into it.
(defucode ubitblt-d-aligned-row-source-backwards	;9 cycles per word
  (parallel					;1 cycle
   (assign bb-width (- bb-width (a-constant 32.)))	;assign is aborted if trap occurs
   (trap-if (minus-fixnum obus) ubitbit-d-aligned-row-source-backwards-done))
  (decr-wrap-s-offset) 				;1
  (parallel-with-s-access			;3
    bb-s-offset
    (assign byte-r (32- bb-s-bitpos))
    (assign byte-s (31- bb-s-bitpos))
    (assign bb-s-word2 (logxor bb-constant memory-data)))
  (assign-vma-offset d)				;1
  (store-word (ldb bb-s-word2 byte-s byte-r bb-s-word)) ;1
  (decr-d-offset)				;1
  (parallel
    (assign bb-s-word (rotate bb-s-word2 byte-r))
    (lisp (trace-path #/.))
    (jump ubitblt-d-aligned-row-source-backwards)))

(defucode ubitblt-d-aligned-row-source-backwards-done
  (trap-no-save)
4,887,235
	495	496
  (if (plus-fixnum bb-width)
      (if (greater-or-equal-fixnum bb-s-bitpos bb-width)
	  (parallel-with-d-access
	    bb-d-offset
	    (assign byte-r (b-constant 0))
	    (assign byte-s (- (a-constant 31.) bb-width))
	    (parallel-with-return
	      (store-word (ldb memory-data byte-s byte-r- bb-s-word))
	      (lisp (trace-path #/4))))
	(sequential
	  (decr-wrap-s-offset)
	  (parallel-with-s-access
	    bb-s-offset
	    (assign byte-r bb-width)
	    (assign bb-s-word (rotate bb-s-word byte-r))
	    (assign bb-s-word2 (logxor bb-constant memory-data)))
	  (parallel
	   (assign byte-r (- bb-width bb-s-bitpos))
	   (assign a-temp obus))
	  (assign byte-s (1- a-temp))
	  (assign bt-s-word (ldb bb-s-word2 byte-s byte-r bb-s-word))
	  (parallel-with-d-access
	    bb-d-offset
	    (assign byte-s (1- bb-width))
	    (assign byte-r (- (a-constant 32.) bb-width))
	    (parallel-with-return
	      (store-word (dpb bb-s-word byte-s byte-r memory-data))
	      (lisp (trace-path #/5))))))
    (parallel-with-return
     (lisp (trace-path #/3)))))

(defucode ubitblt-long-row-both-backwards
  (parallel
    (assign b-temp bb-d-bitpos)
    (if (zero-fixnum bb-d-bitpos)
	(if (zero-fixnum bb-s-bitpos)
	    (parallel
	      (assign bb-s-offset (1+ bb-s-offset))	 ;loop will decr first
	      (lisp (trace-path #/a))
	      (jump ubitblt-aligned-row-both-backwards))
	  (parallel-with-s-access
	    bb-s-offset
	    (assign byte-r (32- bb-s-bitpos))
	    (parallel
	      (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))
	      (lisp (trace-path #/c))
	      (jump ubitblt-d-aligned-row-both-backwards))))
      (if (equal-fixnum b-temp bb-s-bitpos)
	  (sequential
	    (parallel-with-s-access
	      bb-s-offset
	      (assign byte-s (1- bb-s-bitpos))
	      (assign byte-r (b-constant 0))
	      (assign bb-s-word (logxor bb-constant memory-data)))
	    (assign-vma-offset d)
	    (parallel
	     (decr-d-offset)
	     (start-memory read)
	     (call bb-byte-alu-operation-dispatch))
	    (parallel
	      (assign bb-width (- bb-width bb-s-bitpos))
	      (lisp (trace-path #/b))
	      (jump ubitblt-aligned-row-both-backwards)))
	(if (greater-fixnum bb-s-bitpos b-temp)	 	;s > d, enough in first word
	    (sequential
	      (parallel-with-s-access
	        bb-s-offset
		(parallel
		 (assign byte-r (- b-temp bb-s-bitpos))
		 (assign a-temp obus))	;this is negative
		(assign byte-s (1- bb-d-bitpos))
		(assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	      (assign byte-r (b-constant 0))
	      (parallel
	        (assign-vma-offset d)
		(call bb-byte-alu-operation-dispatch))
	      (assign bb-width (- bb-width bb-d-bitpos))
	      (assign b-temp bb-d-bitpos)
	      (assign bb-s-bitpos (- bb-s-bitpos b-temp))
	      (parallel
	        (decr-d-offset)
		(lisp (trace-path #/d))
		(jump ubitblt-d-aligned-row-both-backwards)))
	  (sequential
	   (parallel-with-s-access	;s<d, need to fetch another word
	     bb-s-offset
	     (assign byte-r (- b-temp bb-s-bitpos))
	     (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	   (decr-wrap-s-offset)
	   (parallel-with-s-access
	     bb-s-offset
4,887,235
	497	498
	     (assign a-temp (- b-temp bb-s-bitpos))
	     (assign byte-s (1- a-temp))
	     (assign bb-s-word2 (logxor bb-constant memory-data)))
	   (assign bb-s-word (ldb bb-s-word2 byte-s byte-r bb-s-word))
	   (assign byte-s (1- bb-d-bitpos))
	   (assign byte-r (b-constant 0))
	   (parallel
	     (assign-vma-offset d)
	     (call bb-byte-alu-operation-dispatch))
	   (assign bb-width (- bb-width bb-d-bitpos))
	   (assign b-temp bb-d-bitpos)
	   (parallel
	    (assign b-temp (- b-temp bb-s-bitpos))
	    (assign byte-r obus))
	   (assign bb-s-word (rotate bb-s-word2 byte-r))
	   (assign bb-s-bitpos (- (a-constant 32.) b-temp))
	   (parallel
	    (decr-d-offset)
	    (lisp (trace-path #/e))
	    (jump ubitblt-d-aligned-row-both-backwards))))))))

(defucode ubitblt-aligned-row-both-backwards	;18 cycles per word
  (parallel					;1
    (assign bb-width (- bb-width (a-constant 32.)))
    (trap-if (minus-fixnum obus) ubitblt-aligned-row-both-backwards-done))
  (decr-wrap-s-offset)				;1
  (parallel-with-s-access			;3
    bb-s-offset
    (assign bb-s-word (logxor bb-constant memory-data)))
  (parallel					;1+3
    (assign-vma-offset d)
    (call bb-word-alu-operation-dispatch))
  (parallel					;1
    (decr-d-offset)
    (lisp (trace-path #/.))
    (jump ubitblt-aligned-row-both-backwards)))

(defucode ubitblt-aligned-row-both-backwards-done
  (if (plus-fixnum bb-width)
      (sequential
        (decr-wrap-s-offset)
	(parallel-with-s-access
	  bb-s-offset
	  (assign byte-s (1- bb-width))
	  (assign byte-r bb-width)
	  (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
	(assign byte-r (- (a-constant 32.) bb-width))
	(parallel
	  (assign-vma-offset d)
	  (lisp (trace-path #/2))
	  (jump bb-byte-alu-operation-dispatch)))	;jcall
    (parallel-with-return
     (lisp (trace-path #/1)))))

(defucode ubitblt-d-aligned-row-both-backwards	;13 cycles per word
  (parallel					; l cycle
   (assign bb-width (- bb-width (a-constant 32.)))
   (trap-if (minus-fixnum obus) ubitblt-d-aligned-row-both-backwards-done))
  (decr-wrap-s-offset)				;1. cycles
  (parallel-with-s-access			;3 cycles
    bb-s-offset
    (assign byte-s (31- bb-s-bitpos))
    (assign byte-r (32- bb-s-bitpos))
    (assign bb-s-word (logxor bb-constant memory-data)))
  (assign bb-s-word (ldb bb-s-word2 byte-s byte-r bb-s-word))	;1 cycle
  (parallel					;1+3 cycles
    (assign-vma-offset d)
    (call bb-word-alu-operation-dispatch))
  (decr-d-offset)				;1
  (parallel
    (assign bb-s-word (rotate bb-s-word2 byte-r))	;1
    (lisp (trace-path #/.))
    (jump ubitblt-d-aligned-row-both-backwards)))

(defucode ubitblt-d-aligned-row-both-backwards-done
  (trap-no-save)
  (if (plus-fixnum bb-width)
      (if (greater-or-equal-fixnum bb-s-bitpos bb-width)
	  (sequential
	   (assign byte-r bb-width)
	   (assign bb-s-word (rotate bb-s-word byte-r))
	   (assign byte-s (1- bb-width))
	   (assign byte-r (- (a-constant 32.) bb-width))
	   (parallel
	     (assign-vma-offset d)
	     (lisp (trace-path #/4))
	     (jump bb-byte-alu-operation-dispatch)))	;jcall
	(sequential
	 (decr-wrap-s-offset)
	 (parallel-with-s-access
	   bb-s-offset
	   (assign byte-r bb-width)
4,887,235
	499	500
	   (assign bb-s-word (rotate bb-s-word byte-r))
	   (assign bb-s-word2 (logxor bb-constant memory-data)))
	 (parallel
	  (assign byte-r (- bb-width bb-s-bitpos))
	  (assign a-temp obus))
	 (assign byte-s (1- a-temp))
	 (assign bb-s-word (ldb bb-s-word2 byte-s byte-r bb-s-word))
	 (assign byte-s (1- bb-width))
	 (assign byte-r (- (a-constant 32.) bb-width))
	 (parallel
	  (assign-vma-offset d)
	  (lisp (trace-path #/5))
	  (jump bb-byte-alu-operation-dispatch)))) 	;jcall
    (parallel-with-return
     (lisp (trace-path #/3)))))

;;code for %decode-bitblt-arrays
;;Take alu from-array to-array
;;Return (s-reg-addr s-beg-bitpos s-row-length s-height s-bits-per-elt
;;	  d-reg-addr d-beg-bitpos d-row-length d-height d-bits-per-el t
;;	  array-reg-event-count)

;;args
(defatomicro bbd-alu (amem (stack-pointer -2)))
(defatomicro bbd-s-array (amem (stack-pointer -1)))
(defatomicro bbd-d-array top-of-stack-a)

;4 slots for array-setup-2d to return its results
(defatomicro bbd-control (amem (stack-pointer 1)))
(defatomicro bbd-base-pointer (amem (stack-pointer 2)))
(defatomicro bbd-width (amem (stack-pointer 3)))
(defatomicro bbd-height (amem (stack-pointer 4)))

(defatomicro bbd-s-beg-addr	(amem (stack-pointer 5)))
(defatomicro bbd-s-beg-bitpos	(amem (stack-pointer 6)))
(defatomicro bbd-s-row-length	(amem (stack-pointer 7)))
(defatomicro bbd-s-height	(amem (stack-pointer 8)))
(defatomicro bbd-s-bits-per-elt	(amem (stack-pointer 9.)))
(defatomicro bbd-d-beg-addr	(amem (stack-pointer 10.)))
(defatomicro bbd-d-beg-bitpos	(amem (stack-pointer 11.)))
(defatomicro bbd-d-row-length	(amem (stack-pointer 12.)))
(defatomicro bbd-d-height	(amem (stack-pointer 13.)))
(defatomicro bbd-d-bits-per-elt	(amem (stack-pointer 14.)))
(defatomicro bbd-event-count	(amem (stack-pointer 15.)))

(defatomicro bb-alu-depends-on-source
	     (b-constant #,(loop for alu in '( 5 10.	;source
					      ;3 12.	;dest
					      ;0 15	;neither
					      1 2 4 6 7 8. 9. 11. 13. 14. ;both
					      )
				 sum (ash 1 alu))))

(defmicro compute-beg-bitpos (for-what)
  (let ((beg-bitpos (selectq for-what
		      (s 'bbd-s-beg-bitpos)
		      (d 'bbd-d-beg-bitpos)
		      (otherwise (ferror "What is ~S" for-what))))
	(row-length (selectq for-what
		      (s 'bbd-s-row-length)
		      (d 'bbd-d-row-length)
		      (otherwise (ferror "What is ~S" for-what)))))
    `(sequential
       (assign b-low-dividend top-of-stack)
       (assign a-positive-divisor bbd-width)
       (parallel
	 (assign b-high-dividend (a-constant 0))
	 (assign a-divide-step-count (b-constant 15.)))
       (parallel
	(assign a-negative-divisor (- a-positive-divisor))
	(cal divide-subroutine))
       ;; bits per elt setup correctly in byte-r
       (assign ,beg-bitpos (set-type (rotate b-high-dividend byte-r) dtp-fix))
       (assign b-temp (set-type (ldb ,row-length 27. 5 0) dtp-fix))
       (assign a-temp b-temp)
       (mpy-32-32 a-temp b-low-dividend set-b-temp for-effect nil))))

(defmicro set-b-temp (x)
  `(assign b-temp ,x))

(defucode ubitblt-decode-arrays
  ;;see whether the alu operation depends on the source array
  (assign byte-r (32- bbd-alu))
  (if (ldb-bit-test bb-alu-depends-on-source byte-r)
      (sequential
        (assign top-of-stack (b-constant 0))	;the "subscript"
	(parallel
	  (check-arg-type array bbd-s-array dtp-array)
	  (assign vma bbd-s-array)
	  (assign b-vma bbd-s-array)
	  (call array-setup-2d))
4,887,235
	501	502
	  (parallel (assign b-temp bbd-control)
		    (assign bbd-event-count bbd-control)
		    (call bbd-bits-per-elt))
	  (parallel (assign bbd-m-bits-per-elt t-temp)
		    (assign byte-r b-temp))
	  (assign bbd-s-row-length (rotate bbd-width byte-r))
	  (computer-beg-bitpos s)
	  (assign bbd-s-beg-addr (+ bbd-base-pointer b-temp))
	  (assign bbd-s-height bbd-height)
	  (assign bbd-event-count array-register-event-count))
    (assign top-of-stack (b-constant 0))
    (parallel
      (check-arg-type array bbd-d-array dtp-array)
      (assign vma bbd-d-array)
      (assign b-vma bbd-d-array)
      (call array-setup-2d))
    (parallel
      (assign bbd-event-count (ldb bbd-event-count 28. 0))
      (assign b-temp obus))			;move to b side
    (if	(not-equal-pointer b-temp bbd-control)	;assuming event count is low 28. bits
	(goto ubitblt-decode-arrays)		;an event happened. go retry
      (drop-through))
    (parallel (assign b-temp bbd-control)
	      (call bbd-bits-per-elt))
    (parallel (assign bbd-d-bits-per-elt b-temp)
	      (assign byte-r b-temp))
    (assign bbd-d-row-length (rotate bbd-width byte-r))
    (compute-beg-bitpos d)
    (assign bbd-d-beg-addr (+ bbd-base-pointer b-temp))
    (assign bbd-d-height bbd-height)
    ;;well, I guess wed better not get pclsrd here.
    (parallel (assign xbas (+ stack-pointer (b-constant 5)))
	      (assign b-temp-2 obus))
    (assign stack-pointer (+ stack-pointer (b-constant -3)))
    (parallel
      (assign b-temp (b-constant 11.))
      (jump bbd-finish-loop)))

(defucode bbd-finish-loop
  (parallel
    (assign b-temp (1- b-temp))
    (if (minus-fixnum obus) (return) (drop-through)))
  (pushval (set-type (amem (xbas 0)) dtp-fix))
  (parallel
    (assign b-temp-2 (1+ b-temp-2))
    (assign xbas obus)
    (jump bbd-finish-loop)))

;;take an array-register control word in top-of-stack, return a decoding of its
;;dispatch type in top-of-stack.

(defucode bbd-bits-per-elt
  (dispatch-after-this (array-regster-dispatch-field b-temp)
		       (nop)
    ((%array-register-dispatch-1-bit)
     (parallel (assign b-temp (set-type (b-constant 0) dtp-fix)) (return)))
    ((%array-register-dispatch-2-bit)
     (parallel (assign b-temp (set-type (b-constant 1) dtp-fix)) (return)))
    ((%array-register-dispatch-4-bit)
     (parallel (assign b-temp (set-type (b-constant 2) dtp-fix)) (return)))
    ((%array-register-dispatch-8-bit)
     (parallel (assign b-temp (set-type (b-constant 3) dtp-fix)) (return)))
    ((%array-register-dispatch-16-bit)
     (parallel (assign b-temp (set-type (b-constant 4) dtp-fix)) (return)))
    ((%array-register-dispatch-word)
     (parallel (assign b-temp (set-type (b-constant 5) dtp-fix)) (return)))
    (otherwise (signal-error unimplemented-or-illegal-array-type))))

;;; -*- Mode:Lisp; Package:Micro; Base:8; Lowerceme:yes -*-
;;; (c) Copyright 1982, Symbolics, Inc.

;;:: BITBLT microcode for 3600


;; Reads can be repeated with no harmful effects, writes cannot be (in most cases).
;; State is not permanently updated until a write is consummated.
;; After every write, state should be updated so that it the next memory operation
;; faults and pclsrs, that write will not be repeated (the bitblt row will be shorter).
;: To avoid the overhead of doing this for every write, we have block mode
;; operations that only update the state after writing a block of words.
;;
;; For the block mode things, we use a buffer that can be saved. See next+1 page.
;;
;; For the short-row things, when the destination is split across two words.
;; we check write access to both words before modifying either of them.
;; No pclsring problems if the operation depends on neither operand.
;;
;: When there is a partial word at the front, do it and then advance the arguments
;: so the bitblt is word aligned in the destination. When there is a partial word
;; at the end, when we get there the arguments have been advanced.

(reserve-scratchpad-memory 2460 2470 320 330)

4,887,235
	503	504
(defmicro waiting-for-memory ()		;documentation only.
  `(nop))

(defmicro abus-array-data (&body body)
  `(parallel
     (transport data)
     (check-data-type memory-data dtp-fix)
     ,@body))

(defmicro assign-vma-offset (which &rest stuff)
  (selectq which
    (S `(assign vma (+ bb-s-row-addr bb-s-offset ,@stuff)))
    (D `(assign vma (+ bb-d-row-addr bb-d-offset ,@stuff)))
    (S-ahead `(assign vma (+ bb-s-row-addr bb-s-offset-ahead ,@stuff)))
    (otherwise
     (ferror "assign-vma-offset knows about only S and 0, not ~S" which))))

(defmicro parallel-with-s-access (offset &body body)
  (make-memory-access 'bb-s-row-addr 'bb-s-offset offset body '(read)))

(defmicro parallel-with-d-access (offset &body body)
  (make-memory-access 'bb-d-row-addr 'bb-d-offset offset body '(read)))

(defmicro parallel-with-d-accees-check-write (offset &body body)
  (make-memory-access 'bb-d-row-addr 'bb-d-offset offset body (read write)))

(eval-when (eval compile load)
(defun make-memory-access (baseaddr offset-sym offset body memory-modes)
  (or (eq offset offset-sym)
      (equal offset `(1+ ,offset-sym))
      (and (eq offset-sym 'bb-s-offset) (eq offset 'bb-s-offset-ahead))
      (ferror "~S is not a recognized offset for ~S" offset offset-sym))
  (let* ((body (reverse body))
	 (finally `(abus-array-data ,(car body))))
    (do ((ll (reverse
	      `((assign vma ,(if (atom offset)
				 `(+ ,baseaddr ,offset)
			         `(+ ,baseaddr ,(second offset) 1)))
		(start-memory ,@memory-modes)
		(waiting-for-memory)))
	     (cdr ll))
	 (body (cdr body) (cdr body))
	 (l))
	((and (null ll) (null body))
	 `(sequential ,@l ,finally))
      (cond ((null ll) (push (car body) l))
	    ((null body) (push (car ll) l))
      (T (push `(parallel ,(car ll) ,(car body)) l))))))
);eval-when



;Note that these are *not* analogous to 1-
(defmicro 32- (operand)
  `(- (b-constant 32.) ,operand))

(defmicro 31- (operand)
  `(- (b-constant 31.) ,operand))

(defmicro incr-d-offset ()
  `(assign bb-d-offset (1+ bb-d-offset)))

(defmicro decr-d-offset ()
  `(assign bb-d-offset (1- bt-d-offset)))

(defmicro incr-wrap-s-offset ()
  `(sequential
    (parallel
      (assign bb-s-offset (1+ bb-s-offset))
      (assign b-temp-3 obus))
    (if (greater-or-equal-fixnum b-temp-3 bb-s-row-length)
	(parallel
	 (lisp (format T "~&>>>Wrapping around on bb-s-offset from ~d."
		       (low32 (tr 'bb-s-offset))))
	 (assign bb-s-offset (b-constant 0)))
      (drop-through))))

(defmicro decr-wrap-s-offset ()
  `(parallel
     (assign bb-s-offset (1- bb-s-offset))
     (if (minus-fixnum obus)
	 (parallel
	  (lisp (format T "~&>>>Decr wrapping around on bb-s-offset"))
	  (assign bb-s-offset (1- bb-s-row-length)))
       (drop-through))))

(defmicro incr-wrap-s-offset-ahead ()
  `(sequential
    (parallel
     (assign bb-s-offset-ahead (1+ bb-s-offset))
     (assign b-temp-3 obus))
4,887,235
	505	506
     (if (greater-or-equal-fixnum b-temp-3 bb-s-row-length)
	 (parallel
	  (lisp (format T "~&>>>Wrapping around on bb-s-offset from ~d."
			(low32 (tr `bb-s-offset-ahead))))
	  (assign bb-s-offset-ahead (b-constant 0)))
       (drop-through))))

(defmicro decr-wrap-s-offset-ahead ()
  `(parallel
     (assign bb-s-offset-ahead (1- bb-s-offset))
     (if (minus-fixnum obus)
	 (parallel
	  (lisp (format t "~&>>>Decr wrapping around on bb-s-offset"))
	  (assign bb-s-offset-ahead (1- bb-s-row-length)))
       (drop-through))))

(defmicro store-word (datum &rest options)
  `(store-contents (set-type ,datum dtp-fix) not-pointer . ,options))

;;---the goddamn simulator compiles
;;	(parallel (assign ...) (return))
;;into
;;	(prog ... (return nil) (setq ...))
(defmicro parallel-with-return (&body stm)
  `(,(if (eq *machine-version* 'sim) 'sequential 'parallel)
    ,@stm
    (return)))

(defvar *fp-offset-names* ())

(defmacro def-fp-offsets (&rest names)
  (loop for i upfrom 0
	for name in names
	append `((defatomicro ,name (amem (frame-pointer ,i)))
		 (defprop ,name ,i fp-offset)
		 (or (memq ',name *fp-offset-names*)
		     (push ',name *fp-offset-names*)))
	into foo
	finally (return `(progn 'compile ,@foo))))

;;decode fp offset numbers into symbols. Debugging only.
(defun dfp (&rest numbers)
  (loop for number in numbers
	collect (loop for name in *fp-offset-names*
		      when (equal (get name 'fp-offset) number)
		      return name
		      finally (return number))))

;; Define arguments/state for BITBLT instructions. Note that these must be
;; relative to FP. not to the top of the stack, since there might be a
;; saved bitblt-buffer on the stack it the instruction was interrupted.
(def-fp-offsets
  bb-arg-alu bb-arg-width bb-arg-height		;lisp arg
  bb-arg-from-array bb-arg-from-x bb-arg-from-y ;lisp arg
  bb-arg-to-array bb-arg-to-x bb-arg-to-y	;lisp arg
  bb-width					;ucode arg
  bb-s-data-addr				;ucode arg
  bb-s-row-offset				;ucode arg
  bb-s-offset					;ucode arg
  bb-s-bitpos					;ucode arg
  bb-s-row-length 				;ucode arg
  bb-d-data-addr				;ucoda arg
  bb-d-offset					;ucode arg
  bb-d-bitpos					;ucode org
  bb-event-count				;ucode arg
  bb-alu-operation				;ucode arg
  )

;;; Some temporaries.
(define-b-temps bb-constant	;Value to store or to X0R in
  		bb-s-word	;temp (source word)
		bb-s-row-addr	;start of current source row
		bb-d-row-addr	;start of current destination row
		bb-width-b	;copy of width on B side (sometimes)
		b-block-size)	;numoer of words in block

(defareg bb-constant-a)		;A-side copy of bb-constant
(defareg bb-identity)		;Background to dpb into when doing part word
(defareg bb-s-word2)		;temp (other source word)
(defareg bb-a-temp)
(defareg bb-s-offset-ahead)	;s-offset not finalized yet (if pclsr)
(defareg a-block-size)		;number- of words in block

;;; Bitblt-buffer hair
(eval-when (compile load eval)
(defconst n-bitblt-buffers 8))

#.`(progn 'compile		;B-memcry buffer for blccK-mode operations
     . ,(loop for i from 0 below n-bitblt-buffers
	      collect `(defbreg ,(fintern "BITBLT-BUFFER-~D" i))))
4,887,235
	507	508
(defmicro bitblt-buffer (i)
  (fintern "BITBLT-BUFFER-~D" i))

;We first compute the result n words at a time into the bitblt-buffer,
;and then store it into the destination (in one case the whole buffer
;is rotated by 1 to 31 bits as it is being stored).
;The bitblt-buffer is "active" while we are storming it into the destination.
;The bitblt buffer must be active while wo are modifying the destination,
;since the words copied into the buffer might overlapped with parts of
;the destination we have already modified.
;
;A pclsr while the bitblt-buffer is active will copy it into
;the stack, set first-part-done. and clear bitblt-buffer-active.
;A restart with first-part-done set will proceed normally until it comes time
;to store the bitblt-buffer. At that time, first-part-done is seen, the
;bitblt-buffer is restored from the stack (replacing the possibly-erroneous
;contents that were just computed), and execution then proceeds normally.
;
;The contents of the bitblt-buffer are assumed to have valid data type tags.
;For now, they could be forced to fixnum, but in the future we may have
;other instructions using this buffer and its save/restore mechanism.
;--- Still need to fix microcompiler to default cdr source from Bbus correctly ---

;Call here if we pclsr with the bitblt-buffer active
(defucode save-bitblt-buffer
  #.`(sequential
      ,(loop for i from 0 below n-bitblt-buffers
	     collect `(pushval-with-cdr (bitblt-buffer ,i))))
  (assign first-part-done (b-constant 1))
  (parallel
    (assign bitblt-buffer-active (b-constant 0))
    (return)))

;Call here when about to start storing the bitblt-buffer
;This is actually a micro so that the first instruction of the routine
;gets open-coded into the caller
;This is hairily bummed to make the normal case go in only one cycle
;(if the trap is not taken then the obus has -1 on it)
(defmicro activate-bitblt-buffer ()
  `(parallel
    (assign bitblt-buffer-active obus)
    (trap-if (bit-test frame-misc-data (b-constant (byte-mask first-part-done)))
	     activate-saved-bitblt-buffer)))

;We also need this closed-subroutine version
(defucode activate-bitblt-buffer
  (parallel
   (activate-bitblt-buffer)
   (return)))

(defucode activate-saved-bitblt-buffer
  (parallel
    (trap-save)			;Retry the assign,trap-if upon return
    #.`(sequential
	,@(loop for i from (1- n-bitblt-buffers) downto 0
		collect `(parallel
			  (assign (bitblt-buffer ,i) top-of-stack-a)
			  (decrement-stack-pointer)))))
  (parallel
    (assign first-part-done (b-constant 0))
    (return)))

;Call here when done storing the bitblt-buffer
(defucode deactivate-bitblt-buffer
  (parallel
    (assign bitblt-buffer-active (b-constant 0))
    (assign top-of-stack top-of-stack-a)	;Could have been bashed by activate.,.
    (return)))

(defmicro read-bb-s-word ()
  `(parallel
    (assign a-temp (+ bb-width-b bb-s-bitpos))
    (call read-bb-s-word1)))

;a-temp has the number of s bits needed relative to bit 0 of the first word
(defucode read-bb-s-word1
  (assign-vma-offset s)
  (parallel
    (assign byte-r (32- bb-s-bitpos))
    (start-memory read))
  (parallel
    (waiting-for-memory)
    (if (lesser-or-equal-fixnum a-temp (b-constant 32.))
	;;source is entirely within one word
	(parallel-with-return
	  (abus-array-data
	   (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))))
      ;; source is split across two words
      (sequential
        (abus-array-data
	  (assign bb-s-word (rotate memory-data byte-r)))
4,887,235
	509	510
	(incr-wrap-s-offset-ahead)
	(assign-vma-offset s-ahead)
	(parallel
	  (start-memory read)		;byte-r is already ok
	  )
	(parallel
	 (waiting-for-memory)
	 (assign byte-s (1- a-temp)))
	(abus-array-data
	  (assign bb-s-word (dpb memory-data byte-s byte-r bb-s-word)))
	(parallel-with-return
	 (assign bb-s-word (logxor bb-s-word bb-constant-a)))))))

;;Assumptions about setup:
;;bb-constant has:
;;  >>	for constant operations (0,-1): the constant;
;;  >>	for operations dependent only on source or destination (x, ~x, y, ~y);
;;      a 0 for x,y or -1 for ~x,~y;
;;  >> 	for operations dependent on both s and d; 0 for those using source directly,
:;      and -1 for those that want the source complemented.

(defucode bb-copy-stuff-to-b-side
  (assign bb-s-row-addr (+ bb-e-data-addr b-temp))
  (parallel-with-return
   (assign bb-d-row-addr bb-d-data-addr)))

(defmacro definst-bitblt (name source destination neither both)
  `(definst ,name no-operand
     (parallel (assign b-temp bb-s-row-offset)
	       (call bb-copy-stuff-to-b-side))
     (dispatch-after-this (parallel (ldb bb-alu-operation 4 0)
				    ;; Set up constant needed for- the most common case
				    (assign bb-constant (via-xbus (b-constant 0)))
				    (assign bb-constant-a (via-xbus (b-constant 0))))
			  (assign bb-width-b bb-width)
      ((0)			;0
       (goto ,neither))
      ((1)			;x*y
       (parallel (assign bb-identity (a-constant -1))
		 (jump ,both)))
      ((2)			;~x*y
       (assign bb-identity (a-constant -1))
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump ,both)))
      ((3) (return))		;y
      ((4)
       (parallel (assign bb-identity (a-constant -1))
		 (jump ,both)))
      ((5) (goto ,source))	;x
      ((6)			;x xor y
       (parallel (assign bb-identity (a-constant 0))
		 (jump ,both)))
      ((7)			;x+y
       (parallel (assign bb-identity (a-constant 0))
		 (jump ,both)))
      ((8.)			;~x*~y
       (assign bb-identity (a-constant -1))
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump,both)))
      ((9.)			;~x xor y
       (assign bb-identity (a-constant -1))
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump ,both)))
      ((10.)			;~x
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump ,source)))
      ((11.)			;~x+y
       (assign bb-identity (a-constant 0))
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump ,both)))
      ((12.)			;~y
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump .destination)))
      ((13.)			;x+~y actually, ~(~x*y)
       (assign bb-identity (a-constant -1))
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump ,both)))
      ((14.)			;~x+~y actually, ~(x*y)
       (parallel (assign bb-identity (a-constant -1))
		 (jump ,both)))
      ((15.)			;-1
       (parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		 (jump ,neither))))))

(definst-bitblt %bitblt-short-row
  ubitblt-short-row-source
  ubitblt-short-row-destination
  ubitblt-short-row-neither
  ubitblt-short-row-both)

4,887,235
	511	512
(definst-bitblt %bitblt-long-row
  ubitblt-long-row-source
  ubitblt-long-row-destination
  ubitblt-long-row-neither
  ubitblt-long-row-both)

(definst-bitblt %bitblt-long-row-backwards
  ubitblt-long-row-source-backwards
  ubitblt-long-row-destination
  ubitblt-long-row-neither		 ;direction immaterial
  ubitblt-long-row-backwards)

(defucode ubitblt-short-row-source
  (read-bb-s-word)
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (parallel
   (assign byte-s (- a-temp (b-constant 32.) 1))
   (if (lesser-or-equal-fixnum-unsigned a-temp (b-constant 32.))
       ;; destination is entirely within one word
       (parallel-with-d-access bb-d-offset
	 (assign byte-s (1- bb-width))
	 (assign byte-r bb-d-bitpos)
	 (parallel-with-return
	   (store-word (dpb bb-s-word byte-s byte-r memory-data))))
     ;; destination is cplit across two words
     ;; must access-check them both before modifying either
     (sequential
      ;; compute the high byte
      (parallel-with-d-access-check-write (1+ bb-d-offset)
	(assign byte-r bb-d-bitpos)
	(assign a-temp (ldb bb-s-word byte-s byte-r memory-data)))
      ;; compute and store the low byte
      (parallel-with-d-access bb-d-offset
	(assign byte-s (31- bb-d-bitpos))
	(store-word (dpb bb-s-word byte-s byte-r memory-data) block))
      ;; now store the hign byte. This cannot fault
      (parallel-with-return
       (store-word a-temp block))))))

(defucode ubitblt-short-row-destination
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (parallel
   (assign byte-s (- a-temp (b-constant 32.) 1))
   (if (lesser-or-equal-fixnum-unsigned a-temp (b-constant 32.))
       ;; destination is entirely within one word
       (parallel-with-d-access bb-d-offset
	 (assign byte-s (1- bb-width))
	 (assign byte-r bb-d-bitpos)
	 (parallel-with-return
	  (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data))))
     ;; destination is split across two words
     ;; must access-check them both before modifying either
     (sequential
      ;; compute the high byte
      (parallel-with-d-access-check-write (1+ bb-d-offset)
	(assign byte-r (a-constant 0))
	(assign a-temp (logxor (ldb bb-constant byte-s byte-r) memory-data)))
      ;; compute and store the low byte
      (parallel-with-d-access bb-d-offset
	(assign byte-s (31- bb-d-bitpos))
	(assign byte-r bb-d-bitpos)
	(store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data) block))
      ;; now store the high byte. This cannot fault
      (parallel-with-return
	(store-word a-temp block))))))

;; The alu operation is actually a constant
(defucode ubitblt-short-row-neither
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (if (lesser-or-equal-fixnum a-temp (b-constant 32.))
      ;; destination is entirely within one word
      (parallel-with-d-access bb-d-offset
	(assign byte-s (1- bb-width))
	(assign byte-r bb-d-bitpos)
	(parallel-with-return
	 (store-word (dpb bb-constant byte-s byte-r memory-data))))
    ;; destination is split across two words, but no pclcr problems since doing
    ;; the operation twice produces the same effect
    (sequential
     ;; store the low byte
     (parallel-with-d-access bb-d-offset
       (assign byte-s (31- bb-d-bitpos))
       (assign byte-r bb-d-bitpos)
	(store-word (dpb bb-constant byte-s byte-r memory-data)))
     ;; store the high byte
     (parallel-with-d-access (1+ bb-d-offset)
       (assign byte-s (1- a-temp))
       (assign byte-r (a-constant 0))
       (parallel-with-return
	 (store-word (dpb bb-constant byte-s byte-r memory-data)))))))

