4,887,235
	513	514
;; The alu operation depends upon both source and destination bits
(defucode ubitblt-short-row-both
  (read bb-s-word)
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (if (lesser-or-equal-fixnum a-temp (b-constant 32.))
      ;; destination is entirely within one word
      (sequential
       (assign byte-s (1- bb-width))
       (assign byte-r bb-d-bitpos)
       (parallel
	(assign-vma-offset d)
	(jump bb-byte-alu-operation-dispatch)))		;jcall
    ;; destination is split across two words
    (sequential
     ;; make sure we have write access to the high byte so no pclsr after etoring low
     (assign-vma-offset d 1)
     (start-memory read write)
     ;; store the low byte
     (assign byte-s (31- bb-d-bitpos))
     (assign byte-r bb-d-bitpos)
     (parallel
      (assign-vma-offset d)
      (call bb-byte-alu-operation-dispatch))
     ;; store the night byte
     (assign bb-s-word (rotate bb-s-word byte-r))
     (assign byte-s (1- a-temp))
     (assign byte-r (b-constant 0))
     (parallel
       (assign-vma-offset d 1)
       (jump bb-byte-alu-operation-dispatch)))))	;jcall

;(boole	fn x y ...)   if fn is "abcd" then
;          y    0	1	2	3	4	5	6	7
;       | 0  1  0	x*y	~x*y	y	x*~y	x	x#y	x+y
;   ----------
;     0 | a  c  8	9	l0	11	12	13	14	15
;   x   |	~(x+y)	~(x#y)	~x	~x+y	~y	x+~y	~x+~y	-1
;     1 | b  d

;;vma and byte regs have been set up already, for DPB.
;;trashes a-temp-2, b-temp-2. b-temp-3. but not a-temp and b-temp.
(defucode bb-byte-alu-operation-dispatch
  (dispatch-after-this (parallel (start-memory read) (ldb bb-alu-operation 4 0))
		       (parallel
			 (assign b-temp-3 (dpb bb-s-word byte-s byte-r bb-identity))
			 (waiting-for-memory))
	((1 2) ;;1 x*y logand ;;2 ~x*y	logand
	 (parallel-with-return
	  (parallel
	   (declare-memory-timing data-cycle)
	   (abus-array-data
	    (store-word (logand memory-data b-temp-3))))))
	((4 8.) ;;4 ~(~x+y) = x*~y andc2 ;;8 ~e(x+y) = ~x*~y andcb
	 (parallel
	  (declare-memory-timing data-cycle)
	  (abus-array-data
	   (assign a-temp-2 memory-data)))
	 (assign b-temp-2 (dpb (b-constant -1) byte-s byte-r 0))	;cant merge this...
	 (assign a-temp-2 (logxor a-temp-2 b-temp-2))			;...with this.
	 (parallel-with-return
	  (store-word (logand a-temp-2 b-temp-3))))
	((6 9.) ;;6 x#y logxor ;;9 ~(x~y)=.x#y logxor
	 (parallel-with-return
	  (parallel
	   (declare-memory-timing data-cycle)
	   (abus-array-data
	    (store-word (logxor b-temp-3 memory-data))))))
	((7 11.) ;;7 x+y logior	;;ll ~x+y logior
	 (parallel-with-return
	  (parallel
	   (declare-memory-timing data-cycle)
	   (abus-array-data
	    (store-word (logior b-temp-3 memory-data))))))
	((13. 14.) ;;13 x+~y = ~(~x*y) lognand	;;14 ~x+~y=~(x*y)
	 (parallel
	  (declare-memory-timing data-cycle)
	  (abus-array-data
	   (assign a-temp-2 (logand b-temp-3 memory-data))))
	 (parallel-with-return
	  (store-word (logxor (dpb (b-constant -1) byte-s byte-r 0) a-temp-2))))))

;; vma has been set up already
(defucode bb-word-alu-operation-dispatch	;commonly 3 cycles (plus 1 for the call)
(dispatch-after-this (parallel (start-memory read) (ldb bb-alu-operation 4 0))
		     (waiting-for-memory)	;---want to use this somehow...
   ((1 2)	;;1 x*y logand		;;2 ~x*y logand
    (parallel
     (declare-memory-timing data-cycle)
     (abus-array-data (store-word (logand bb-s-word memory-data)))
     (return)))
   ((4 8.)	;;4 x-y andcb		;;8 ~(x+y) ~x*~y andcb
4,887,235
	515	516
    (parallel
     (declare-memory-timing data-cycle)
     (abus-array-data (store-word (andc2 bb-s-word memory-data)))
     (return)))
   ((6 9.)	;;6 x#y logxor		;;9 ~(x#y)=x#y logxor
    (parallel
     (declare-memory-timing data-cycle)
     (abus-array-data (store-word (logxor bb-s-word memory-data)))
     (return)))
   ((7 11.)	;;7 x+y logior		;;11 ~x+y logior
    (parallel
     (declare-memory-timing data-cycle)
     (abus-array-data (store-word (logior bb-m-word memory-data)))
     (return)))
   ((13. 14.)	;;13 x+-y - .(.xsy)	;;14 ~x+~y-.(x*y)
    (parallel
     (declare-memory-timing data-cycle)
     (abus-array-data (store-word (lognand bb-s-word memory-data)))
     (return)))))

;;alu depends only on source bits
(defucode ubitblt-long-row-source
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (goto ubitblt-aligned-row-source)
	   ;;             SSSSSSSSSSSSSSSSSSSSSSSSSsssssss
	   ;;      dddddddddddddddddddddddddddddddd
	   (parallel-with-s-access bb-s-offset
	     (assign byte-r (32- bb-s-bitpos))
	     (parallel
	       (assign bb-s-word2 (logxor bb-constant (rotate memory-data byte-r)))
	       (lisp (trace-path #/c))
	       (jump ubitblt-d-aligned-row-source))))
     (if (equal-fixnum b-temp bb-s-bitpos)
	   ;;SSSSSSSSSSSSSSSSSSSSSSSSSsssssss
	   ;;DDDDDDDDDDDDDDDDDDDDDDDDDddddddd
	 (sequential
	  (parallel-with-s-access bb-s-offset
	    (assign b-temp (32- bb-d-bitpos))
	    (assign byte-r b-temp)
	    (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	  (parallel-with-d-access bb-d-offset
	    (assign byte-r bb-d-bitpos)
	    (assign byte-s (1- b-temp))
	    (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	  ;; First partial word done, we are now the aligned ease
	  (incr-wrap-s-offset)
	  (incr-d-offset)
	  (assign bb-width (- bb-width b-temp))
	  (assign bb-s-bitpos (b-constant 0))
	  (parallel
	   (assign bb-d-bitpos (b-constant 0))
	   (lisp (trace-path #/b))
	   (jump ubitblt-aligned-row-source)))
       (if (lesser-fixnum bb-5-bitpos b-temp)
	   ;;sssssssssSSSSSSSSSSSSSSSS.......
	   ;;         DDDDDDDDDDDDDODDdddddddddddddddd
	   ;;         <- 32-d.bitpos->
	   (sequential
	    (parallel-with-s-access bb-s-offset
	      (assign byte-r (32- bb-s-bitpos))
	      (assign b-temp (32- bb-d-bitpos))
	      (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	    ;;.......sssssssssSSSSSSSSSSSSSSSS
	    (parallel-with-d-access bb-d-offset
	      (assign byte-r bb-d-bitpos)
	      (assign byte-s (1- b-temp))
	      (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	    ;; First partial D word done, some S bits from first word remain
	    (incr-d-offset)
	    ;;rotate s-word further to right by 32-d.bitpos = left by -(32-d.bitpos)
	    ;;SSSSSSSSSSSSSSS........sssssssss
	    (assign bb-s-word2 (rotate bb-s-word byte-r))
	    (assign bb-s-bitpos (+ bb-s-bitpos b-temp))
	    (assign bb-width (- bb-width b-temp))
	    (parallel
	      (assign bb-d-bitpos (b-constant 0)))
	    (lisp (trace-path #/c))
	    (jump ubitblt-d-aligned-row-source))
	 (sequential
;;The hiah part of the first source word is not as long as the high part of the
;;first destination word. So extract the useful part of the first source word.
;;and deposit into it as much of the second source word as needed to fill out the rest
;;of the first destination word. Then position the rest of the second source word
;;appropriately for the inner loop.
		  ;;	                                   <- 32-s ->
	          ;;      ................................|SSSSSSSSSSssssssssssssssssssssss
	          ;;                      DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
	  (parllel-with-s-access bb-s-offset
4,887,235
	517	518
	    (assign byte-r (32- bb-s-bitpos))
	    (assign b-temp-2 bb-s-bitpos)
	    (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	  (incr-wrap-s-offset-ahead)
	  ;;                      <----- s-d ----> <- 32-s ->   (32-d)-(32-s)=s-d
	  ;;      ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111.....................
	  ;;                      DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
	  (parallel-with-s-access bb-s-offset-ahead
	    (assign byte-r (32- bb-s-bitpos))
	    (assign byte-s (- b-temp-2 bb-d-bitpos 1))
	    (assign bb-s-word2 (logxor bb-constant memory-data)))
	  (assign bb-s-word (dpb bb-s-word2 byte-o byte-r bb-s-word))
	  (parallel
	   (assign a-temp (32- bb-d-bitpos))
	   (assign b-temp obus))
	  (parallel-with-d-access bb-d-offset
	    (assign byte-r bb-d-bitpos)
	    (assign byte-s (1- a-temp))
	    (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	  ;; We have now done the first partial D word. Turn into the d-aligned
	  ;; case, with the source advanced by one word from whero it started.
	  (incr-d-offset)
	  (assign bb-s-offset bb-s-offset-ahead)
	  (assign bb-s-bitpos (- b-temp-2 bb-d-bitpos))
	  (assign byte-r (32- bb-s-bitpos))
	  (assign bb-s-word2 (rotate bb-s-word2 byte-r))
	  (assign bb-width (- bb-width b-temp))
	  (parallel
	   (assign bb-d-bitpos (b-constant 0))
	   (lisp (trace-path #/e))
	   (jump ubitblt-d-aligned-row-source))))))))

(defucode ubitblt-aligned-row-source	;28 cycles per 8 words
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
       (assign b-temp (+ bb-s-offset (b-constant 8.)))
       (if (lesser-fixnum bb-s-row-length b-temp)
	   (goto ubitblt-aligned-row-source-slow-loop)
	 (sequential
	  (parallel
	   (assign-vma-offset s)
	   (call ubitblt-block-read-8))
	  (parallel
	   (assign-vma-offset d)
	   (call ubitblt-block-write-8))
	  (parallel
	   (assign bb-s-offset (+ bb-s-offset b-block-size))
	   (jump ubitblt-aligned-row-source)))))
    ;;Frob with whats left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (*4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4)))
	 (if (lesser-fixnum bb-s-row-length b-temp)
	     (goto ubitblt-aligned-row-source-slow-loop)
	   (sequential
	    (parallel
	     (assign-vma-offset s)
	     (call ubitblt-block-read-4))
	    (parallel
	     (assign-vma-offset d)
	     (call ubitblt-block-write-4))
	    (parallel
	     (assign bb-s-offset (+ bb-s-offset b-block-size))
	     (jump ubitblt-aligned-row-source-slow-loop)))))
      (goto ubitblt-aligned-row-source-slow-loop))))

(defucode ubitblt-aligned-row-source-slow-loop		;10 cycles per word
  (parallel-with-s-access bb-s-offset
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-aligned-row-source-slow-loop-done)
    (waiting-for-memory)
    (assign bb-s-word (logxor bb-constant memory-data)))
  (assign-vma-offset d)					;1
  (store-word bb-s-word)				;1
  (assign bb-width (- bb-width (b-constant 32.)))	;1
  (incr-wrap-s-offset)					;2
  (parallel						;1
   (incr-d-offset)
   (lisp (trace-path #/,))
   (jump ubitblt-aligned-row-source-slow-loop)))

;Do last partial word, if any
(defucode ubitblt-aligned-row-source-slow-loop-done
  (if (plus-fixnum bb-width)
      (sequential
       (parallel-with-s-access bb-s-offset
	 (assign bb-s-word (logxor bb-constant memory-data)))
       (parallel-with-d-access bb-d-offset
	 (assign byte-r (a-constant 0))
	 (assign byte-s (1- bb-width))
	 (parallel-with-return
4,887,235
	519	520
	   (store-word (dpb bb-s-word byte-s byte-r memory-data))
	   (lisp (trace-path #/2)))))
    (parllel-with-return
     (lisp (trace-path #/1)))))

;bb-s-word2 has the pirtial previous source word whose cddress is in bb-e-offset,
;rotated into alignment with the destination
(defucode ubitblt-d-aligned-row-source
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
       (assign b-temp (+ bb-s-offset (b-constant 8.)))
       (if (lesser-or-equal-fixnum bb-s-row-length b-temp)
	   (goto ubitblt-d-aligned-row-source-slow-loop)
	 (sequcntial
	  (parallel
	   (assign-vma-offset s 1)
	   (call ubitblt-block-read-8))
	  (parallel
	   (assign-vma-offset d)
	   (call ubitblt-d-aligned-block-write-8))
	  (parallel
	   (assign bb-s-offset (+ bb-s-offset b-block-size))
	   (jump ubitblt-d-aligned-row-source)))))
    (if	(greater-or-equal-fixnum bb-width (b-constant (* 4. 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4)))
	 (if (lesser-or-equal-fixnum bb-s-row-length b-temp)
	     (goto ubitblt-d-aligned-row-source-slow-loop)
	   (sequential
	    (parallel
	     (assign-vma-offset s 1)
	     (call ubitblt-block-read-4))
	    (parallel
	     (assign-vma-offset d)
	     (call ubitblt-d-aligned-block-write-4))
	    (parallel
	     (assign bb-s-offset (+ bb-s-offset b-block-size))
	     (jump ubitblt-d-aligned-row-source)))))
      (goto ubitblt-d-aligned-row-source-slow-loop))))

;;Each pass through this loop stores exactly one d word. Each time through,
;;bb-s-word2 will have the bits to use for the lower part of the d word (already
;;rotated into position), and another s word will be fetched into bb-s-word.
;;Then s-word will get rotated when transferred into s-word2 in preparation for
;;next loop pass.
(defucode ubitblt-d-aligned-row-source-slow-loop	;13 cycles per word
  (incr-wrap-s-offset-ahead)				;2
  (parallel-with-s-access bb-s-offset-ahead		;4
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-d-aligned-row-source-done)
    (assign byte-s (1- bb-s-bitpos))
    (assign bb-s-word (logxor bb-constant memory-data)))
  (assign byte-r (- (b-constant 32.) bb-s-bitpos))	;1
  (assign-vma-offset d)					;1
  (store-word (dpb bb-s-word byte-s byte-r bb-s-word2))	;1
  (assign bb-width (- bb-width (b-constant 32.)))	;1
  (incr-d-offset) 					;1
  (assign bb-s-offset bb-s-offset-ahead)
  (parallel						;1
   (assign bb-s-word2 (rotate bb-s-word byte-r))
   (lisp (trace-path #/.))
   (jump ubitblt-d-aligned-row-source)))

(defucode ubitblt-d-aligned-row-source-done
  (if (plus-fixnum bb-width)
      (sequential
       (assign b-temp (32- bb-s-bitpos))	;how many bits are valid in bb-s-word2
       (if (lesser-or-equal-fixnum bb-width b-temp)
	;;we have enough s bits
	   (parallel-with-d-access bb-d-offset
	     (assign byte-s (1- bb-width))
	     (parallel
	      (assign byte-r (b-constant 0))
	      (assign bb-s-word bb-s-word2))
	     (parallel
	      (lisp Itrace-path #/4))
	     (parallel-with-return
	      (store-word (dpb bb-s-word byte-s byte-r memory-data)))))
       ;;need to get another source word
       (sequential
	(parallel-with-s-access bb-s-offset-ahead
	  (assign byte-r (32- bb-s-bitpos))
	  (assign byte-s (1- bb-s-bitpos))
	  (assign bb-s-word (logxor bb-constant memory-data)))
	(assign bb-s-word (dpb bb-s-word byte-s byte-r bb-s-word2))
	(lisp trace-path #/5))
       (parallel-with-d-access bb-d-offset
         (assign byte-s (1- bb-width))
	 (assign byte-r (a-constant 0))
	 (parallel-with-return
	  (store-word (dpb bb-s-word byte-s byte-r memory-data)))))))
4,887,235
	521	522
;;XXXbrad - this appears not to match previous page!
          (parallel
	   (lisp (trace-path #/3))
	   (return))))

;;alu depends only on destination bits
(defucode ubitblt-long-row-destination
  (if (plus-fixnum bb-d-bitpos)		;frob the first partial word
      (sequential
       (assign b-temp (32- bb-d-bitpos))
       (parallel-with-d-access bb-d-offset
	(assign byte-s (1- b-temp))
	(assign byte-r bb-d-bitpos)
	(store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data)))
       (incr-d-offset)
       (assign bb-width (- bb-width b-temp))
       (parallel
	(assign bb-d-bitpos (b-constant 0))
	(lisp (trace-path #/b))
	(jump ubitblt-long-row-destination-loop)))
    (machine-version-case
     ((sim) (parallel
	     (lisp (trace-path #/a))
	     (jump ubitblt-long-row-destination-loop)))
     (otherwise (goto ubitblt-long-row-destination-loop)))))

(defucode ubitblt-long-row-destination-loop	;25 cycles per 8 words
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
       (parallel
	(assign-vma-offset d)
	(call ubitblt-block-read-8))
       (parallel
	(assign-vma-offset d)
	(call-and-return-to ubitblt-block-write-8
			    ubitblt-long-row-destination-loop)))
    ;;Frob with whats left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (parallel
	  (assign-vma-offset d)
	  (call ubitblt-block-read-4))
	 (parallel
	  (assign-vma-offset d)
	  (call-and-return-to ubitblt-block-write-4
			      ubitblt-long-row-destination-slow-loop)))
      (goto ubitblt-long-row-destination-slow-loop))))

(defucode ubitblt-long-row-destination-slow-loop	;5 cycles per word (bus interference)
  (parallel-with-d-access-check-write bb-d-offset
    (parallel
     (assign bb-width (- bb-width (b-constant 32.)))
     (trap-if (minus-fixnum obus) ubitblt-long-row-destination-done)) ;aborts the assign
    (parallel
     (lisp (trace-path #/,))
     (waiting-for-memory)
     (incr-d-offset))
    (parallel
     (store-word (logxor bb-constant memory-data))
     (jump ubitblt-long-row-destination-slow-loop))))

(defucode ubitblt-long-row-destination-done
  (if (plus-fixnum bb-width)
      (parallel-with-d-access bb-d-offset
	(assign byte-s (1- bb-width))
	(assign byte-r (a-constant 0))
	(parallel-with-return
	 (lisp (trace-path #/2))
	 (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data))))
    (parallel
     (lisp (trace-path #/1))
     (return))))

(defmacro def-bitblt-block-read (name n)
  `(defucode ,name
     (parallel
      (assign a-block-size (b-constant ,n))	;Used later to advance offsets
      (assign b-block-size obus)
      (start-memory block read))		;start first word
     (parallel
      (waiting-for-memory)			;waiting for first word
      (start-memory block read))		;start second word
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     collect `(abus-array-data
		       (assign (bitblt-buffer ,i)
			       (set-type (logxor bb-constant memory-data) dtp-fix))
		       ,(selectq (- n-bitblt-buffers i)
				 (1 `(return))
				 (2 nil)
				 (otherwise `(start-memory block read)))))))

4,887,235
	523	524
(def-bitblt-block-read ubitblt-block-read-8 8)	;I suppose this when interned...
(def-bitblt-block-read ubitblt-block-read-4 4)	;... will subsume this.

(defmacro def-bitblt-block-write (name n)
  `(defucode ,name
     (activate-bitblt-buffer)
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     collect `(parallel
		       (store-word (bitblt-buffer ,i) block)
		       (lisp (trace-path #/.))))
     (parallel
      (assign bb-d-offset (+ bb-d-offset b-block-size))
      (call deactivate-bitblt-buffer))
     (parallel-with-return
      (assign bb-width (- bb-width (rotate b-block-size 5))) ;2~5 = bits-per-word
      )))

(def-bitblt-block-write ubitblt-block-write-8 8)
(def-bitblt-block-write ubitblt-block-write-4 4)

(defmacro def-d-aligned-block-write (name n)
  `(defucode ,name
     (assign byte-s (- bb-s-bitpos))
     (parallel
      (assign byte-r (- (b-constant 32.) bb-s-bitpos))
      (call activate-bitblt-buffer))
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     append `((parallel
		       (store-word (dpb (bitblt-buffer ,i) byte-s byte-r bb-s-word2) block)
		       (lisp (trace-path #/.)))
		      (assign bb-s-word2 (rotate (bitblt-buffer ,i) byte-r))))
     (parallel
      (assign bb-d-offset (+ bb-d-offset b-block-size))
      (call deactivation-bitblt-buffer))
     (parallel-with-return
      (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word
      )))

(def-d-aligned-block-write ubitblt-d-aligned-block-write-8 8.)
(def-d-aligned-block-write ubitblt-d-aligned-block-write-4 4.)

;;alu depends on neither source nor destination bits
(defucode ubitblt-long-row-neither
  (if (plus-fixnum bb-d-bitpos)
      (sequential
       (assign b-temp (32- bb-d-bitpos))
       (parallel-with-d-access bb-d-offset
	 (assign byte-r bb-d-bitpos)
	 (assign byte-s (2- b-temp))
	 (store-word (dpb bb-constant byte-s byte-r memory-data)))
       (incr-d-offset)
       (assign bb-width (- bb-width b-temp))
       (parallel
	(assign bb-d-bitpos (b-constant 0))
	(lisp (trace-path #/b))
	(jump ubitblt-long-row-neither-loop)))
    (parallel
     (lisp (trace-path #/a))
     (jump ubitblt-long-row-neither-loop))))

(defucode ubitblt-long-row-neither-loop
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      (sequential
       (parallel
	(assign-vma-offset d)
	(call store-block-bb-constant-8))
       (assign bb-d-offset (+ bb-d-offset (b-constant 8.)))
       (parallel
	(assign bb-width (- bb-width (b-constant (* 8. 32.))))
	(jump ubitblt-long-row-neither-loop)))
    (sequential
     (dispatch-after-next (parallel (assign b-block-size (ldb bb-width 3 5))
				    (ldb bb-width 3 5))
	((7) (parallel (assign-vma-offset d)
		       (call-and-return-to store-block-bb-constant-7
					   ubitblt-long-row-neither-finish)))
	((6) (parallel (assign-vma-offset d)
		       (call-and-return-to store-block-bb-constant-6
					   ubitblt-long-row-neither-finish)))
	((5) (parallel (assign-vma-offset d)
		       (call-and-return-to store-block-bb-constant-5
					   ubitblt-long-row-neither-finish)))
	((4) (parallel (assign-vma-offset d)
		       (call-and-return-to-store-block-bb-constant-4
			ubitblt-long-row-neither-finish)))
	((3) (parallel (assign-vma-offset d)
		       (call-and-return-to store-block-bb-constant-3
					   ubitblt-long-row-neither-finish)))
	((2) (parallel (assign-vma-offset d)
		       (call-and-return-to store-block-bb-constant-2
					   ubitblt-long-row-neither-finish)))
	((1) (assign-vma-offset d)
4,887,235
	525	526
	     (parallel
	      (lisp (trace-path #/.))
	      (store-word bb-constant)
	      (jump ubitblt-long-row-neither-finish))))
     (parallel
      (take-dispatch)
      (trap-if (zero-fixnum b-block-size) ubitblt-long-row-neither-finish)))))

(defucode ubitblt-long-row-neither-finish
  (assign bb-d-offset (+ bb-d-offset b-block-size))
  (assign bb-width (logand bb-width (b-constant #o37)))
  (if (plus-fixnum bb-width)
      (parallel-with-d-access bb-d-offset
	(assign byte-r (a-constant 0))
	(assign byte-s (1- bb-width))
	(parallel
	 (lisp (trace-path #/2))
	 (store-word (dpb bb-constant byte-s byte-r memory-data))
	 (return)))
    (parallel
     (lisp (trace-path #/1))
     (return))))

(defmacro store-block-bb-constant-routines (n)
  `(progn 'compile
	  ,@(loop with s = "STORE-BLOCK-BB-CONSTANT-~d"
		  for i from n downto 1
		  collect `(defucode ,(fintern s i)
			     (parallel
			      (store-word bb-constant block)
			      (lisp (trace-path #/,))
			      ,(if (> i 1)
				   `(jump ,(fintern s (1- i)))
				   `(return)))))))

(store-block-bb-constant-routines 8.)

;;alu depends both source and destination bits
(defucode ubitblt-long-row-both
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (goto ubitblt-aligned-row-both)
	   (parallel-with-s-access bb-s-offset
	     ;;    SSSSSSSSSSSSSSSSSSSSSSSSSSS.ssss
	     ;;ddddddddddddddddddddddddddddddd.
	     (assign byte-r (32- bb-s-bitpos))
	     (parallel
	      (assign bb-s-word (rotate memory-data byte-r))
	      (lisp (trace-path #/c))
	      (jump ubitblt-aligned-row-both))))
     (if (equal-fixnum bb-s-bitpos b-temp)
	 (sequential
	  (parallel-with-s-access bb-s-offset
	    ;;SSSSSSSSSSSSSSSSSSSSSSSSSS.ssssss
	    ;;dddddddddddddddddddddddddd.dddddd
	    (parallel
	     (assign byte-r (32- bb-s-bitpos))
	     (assign b-temp obus))
	    (assign byte-s (31- bb-s-titpos))
	    (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
	  (assign byte-r bb-s-bitpos)
	  (parallel
	   (assign-vma-offset d)
	   ;;ssssssssssssssssssssssssss.ssssss
	   ;;DDDDDDDDDDDDDDDDDDDDDDDDDD.dddddd
	   (call bb-byte-alu-operation-dispatch))
	  ;; First partial word stored, turn into aligned case
	  (incr-wrap-s-offset)
	  (incr-d-offset)
	  (assign bb-width (- bb-width b-temp))
	  (assign bb-s-bitpos (b-constant 0))
	  (parallel
	   (assign bb-d-bitpos (b-constant 0))
	   (lisp (trace-path #/b))
	   (jump ubitblt-aligned-row-both)))
       (if (lesser-fixnum bb-s-bitpos b-temp)
	   (goto ubitblt-long-row-both-s-longer)
	   (goto ubitblt-long-row-both-s-shorter))))))


(defucode ubitblt-long-row-both-s-longer
  (assign b-temp (32- bt-d-bitpos))
  (parallel-with-s-access bb-s-offset
    (assign byte-r (32- bb-s-bitpos))
    (assign byte-s (1- b-temp))
    (assign bb-s-word2 memory-data))
  ;;ssssSSSSSSSSSSSSSSSSSSSS........
  ;;    DDDDDDDDDDDDDDDDDDDDdddddddddddd
  ;;    <----- b-temp ----->
  (assign bb-s-word (logxor bb-constant (rotate bb-s-word2 byte-r)))
  ;;........ssssSSSSSSSSSSSSSSSSSSSS
  (parallel
4,887,235
	527	528
    (assign byte-r bb-d-bitpos)
    (assign b-temp-2 bb-d-bitpos))
  (parallel
   (assign-vma-offset d)
   ;;ssssssssssssssssssssssss.ssssssss
   ;;    DDDDDDDDDDDDDDDDDDDD.dddddddddddd
   (call bb-byte-alu-operation-dispatch))
  (incr-d-offset)
  ;;Remaining are (32-(s.bitpos+(32-d.bitpos))) = d.bitpos-s.bitpos
  ;;    <-- 32-d.bitpos ---> <-s.bitpos->
  ;;SSSSssssssssssssssssssss.ssssssss
  ;;    dddddddddddddddddddd.dddddddddddd
  (assign byte-r (- b-temp-2 bb-s-bitpos))
  (assign bb-s-bitpos (+ bb-s-bitpos b-temp))
  (assign bb-s-word (rotate bb-s-word2 byte-r))
  (assign bb-width (- bb-width t-temp))
  (parallel
   (assign bb-d-bitpos (b-constant 0))
   (lisp (trace-path #/d))
   (jump ubitblt-d-aligned-row-both)))

;Need two S words to do the first partial D word
(defucode ubitblt-long-row-both-s-shorter
  ;;    ssssssssssssssssssssssss.ssssssss
  ;;dddddddddddddddddddddddddddd.dddd
  (parallel-with-s-access bb-s-offset
    (assign byte-r (32- bb-s-bitpos))
    (assign byte-s (31- bb-s-bitpos))
    ;;    SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
    ;;ddddddaddddddddddddddddddddd.dddd
    (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
  (incr-wrap-s-offset-ahead)
  ;;   <--> s.bitpos-d.bitpos
  ;;...SSSS|sssssssssssmssssssssssss.ssssssss
  ;;   dddd dddddddddddddddddddddddd.dddd
  (parallel-with-s-access bb-s-offset-ahead
    (assign byte-s (- bb-s-bitpos b-temp 1))
    (assign byte-r (32- bb-s-bitpos))
    (assign bb-s-word2 (logxor bb-constant memory-data)))
  ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
  ;;   dddd dddddddddddddddddddddddd.dddd
  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
  (assign byte-r bb-d-bitpos)
  ;;...ssss|ssssssssssssssssssssssss.ssssssss
  ;;   DDDD DDDDDDDDDDDDDDDDDDDDDDDD.dddd
  (parallel
   (assign-vma-offset d)
   (call bb-byte-alu-operation-dispatch))
  (incr-d-offset)
  (assign bb-s-offset bb-s-offset-ahead)
  ;;...SSSssss|ssssssssssssssssssssssss.ssssssss
  ;;      dddd dddddddddddddddddddddddd.dddd
  (assign byte-r (- b-temp bb-s-bitpos))
  (assign bb-s-bitpos (- bb-s-bitpos b-temp))
  (assign b-temp (32- bb-d-bitpos))
  (assign bb-s-word (logxor (rotate bb-s-word2 byte-r) bb-constant))
  (assign bb-width (- bb-width b-temp))
  (parallel
   (assign bb-d-bitpos (b-constant 0))
   (lisp (trace-path #/e))
   (jump ubitblt-d-aligned-row-both)))

(defucode ubitblt-aligned-row-both
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
       (assign b-temp (+ bb-s-offset (b-constant 8.)))
       (if (lesser-fixnum bb-s-row-length b-temp)
	   (goto ubitblt-aligned-row-both-slow-loop)
	 (sequential
	  (parallel
	   (assign-vma-offset s)
	   (call ubitblt-block-read-8))
	  (parallel
	   (assign-vma-offset d)
	   (call-and-return-to ubitblt-block-alu-8 ubitblt-aligned-row-both)))))
    ;; Frob with whats left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4.)))
	 (if (lesser-fixnum bb-s-row-length b-temp)
	     (goto ubitblt-aligned-row-both-slow-loop)
	   (sequential
	    (parallel
	     (assign-vma-offset c)
	     (call ubitblt-block-read-4))
	    (parallel
	     (assign-vma-offset d)
	     (call-and-return-to ubitblt-block-alu-4
				 ubitblt-aligned-row-both-slow-loop)))))
      (goto ubitblt-aligned-row-both-slow-loop))))

4,887,235
	529	530
(defucode ubitblt-aligned-row-both-slow-loop	;12 cycles per word
  (parallel-with-s-access bb-s-offset		;4 cycles
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-aligned-row-both-slow-loop-done)
    (waiting-for-memory)
    (assign bb-s-word (logxor bb-constant memory-data)))
  (parallel					;1+3 cycles
   (assign-vma-offset d)
   (call bb-word-alu-operation-dispatch))
  (assign bb-width (- bb-width (b-constant 32.)))	;1 cycle
  (incr-wrap-s-offset)				;2 cycles
  (parallel
   (incr-d-offset)	;1 cycle
   (lisp (trace-path #/.))
   (jump ubitblt-aligned-row-both)))

(defucode ubitblt-aligned-row-both-slow-loop-done
  (if (plus-fixnum bb-width)
      (sequential
       (parallel-with-s-access bb-s-offset
	 (assign byte-r (b-constant 0))
	 (assign byte-s (1- bb-width))
	 (assign bb-s-word (logxor bb-constant memory-data)))
       (parallel
	(lisp (trace-path #/2))
	(assign-vma-offset d)
	(jump bb-byte-alu-operation-dispatch)))		;jcall
    (parallel-with-return
     (lisp (trace-path #/1)))))

(defucode ubitblt-block-alu-8
  (dispatch-after-this (ldb bb-alu-operation 4 0)
		       (parallel
			(assign a-block-size (a-constant 8.))
			(assign b-block-size (a-constant 0.))
			(start-memory block read))	;start first word
    ((1 2) (goto ubitblt-block-logand-8))		; x*y	~x*y
    ((4 8.) (goto ubitblt-block-andc2-8))		; x*~y  ~x*~y
    ((6 9.) (goto ubitblt-block-logxor-8))		; x xor y, ~x xor y
    ((7 11.) (goto ubitblt-block-logior-8))		; x+y, ~x+y
    ((13. 14.) (goto ubitblt-block-lognand-8))))	; ~(~x*y), ~(x*y)


(defucode ubitblt-block-alu-4
  (dispatch-after-this (ldb bb-alu-operation 4 0)
		       (parallel
			(assign a-block-size (a-constant 4.))
			(assign b-block-size (a-constant 4.))
			(start-memory block read))	;start first word
    ((1 2) (goto ubitblt-block-logand-4))		; x*y	~x*y
    ((4 8.) (goto ubitblt-block-andc2-4))		; x*~y  ~x*~y
    ((6 3.) (octo ubitblt-block-logxor-4))		; x xor y, ~x xor y
    ((7 11.) igoto ubitbit-block-logior-4))		; x+y, ~x+y
    ((13. 14.) (goto ubitblt-block-lognand-4))))	; ~(~x*y), ~(x*y)

(defmacro def-block-aluop (name n alu)
  (if (memq (get (caddr (microexpand `(,alu a-temp b-temp))) 'alu) weird-alu-functions)
      ;; Cannot simultaneously run ALU and storm into tne bitblt-buffer
      `(defucode ,name
	 (parallel
	  (waiting-for-memory)		;first word already started
	  (declare-memory-timing active-cycle))
	 ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
		 collect `(sequential
			   (abus-array-data
			    (assign b-temp (,alu (bitblt-buffer ,i) memory-data))
			    `(if (> (- n-bitblt-buffers i) 1)
				 `(start-memory block read))) ;start next word
			   (parallel
			    (assign (bitblt-buffer ,i) (set-type b-temp dtp-fix))
			    ,(if (= (- n-bitblt-buffers i) 1)
				 `(jump (fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n)))))))
    ;; Normal case
    `(defucode ,name
       (parallel
	(waiting-for-memory)		;first word alread started
	(declare-memory-timing active-cycle)
	(start-memory read block))	;start second word
       ,@e(loop	for i from (- n-bitblt-buffers n) below n-bitblt-buffers
		collect `(parallel
			  (abus-array-data
			   (assign (bitbit-buffer ,1) (set-type (,alu (bitblt-buffer ,i)
								      memory-data)
								dtp-fix)))
			  ,(selectq (- n-bitblt-buffers i)
				    (1 `(jump ,(fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n)))
				    (2 nil)
				    (otherwise `(start-memory block read))) ;start word after next
			  )))))

(def-block-aluop ubitblt-block-logand-8 8 logand)
(def-block-aluop ubitblt-block-logior-8 8 logior)
4,887,235
	531	532
(def-block-aluop ubitblt-block-logxor-8 8 logxor)
(def-block-aluop ubitblt-block-andc2-8 8 andc2)
(def-block-aluop ubitblt-block-lognand-8 8 lognand)

(def-block-aluop ubitblt-block-logand-4 4 logand)
(def-block-aluop ubitblt-block-loglor-4 4 logior)
(def-block-aluop ubitblt-block-logxor-4 4 logxor)
(def-block-aluop ubitblt-block-andc2-4 4 andc2)
(def-block-aluop ubitblt-blocK-lognand-4 4 lognand)

(defmacro def-block-alu-write (name n)
  `(defucode ,name
     (parallel
      (assign-vma-offset d)
      (call activate-bitblt-buffer))
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     collect `(parallel
		       (store-word (bitblt-buffer ,i) block)
		       (lisp (trace-path #/,))))
     (parallel
      (assign bb-d-offset (+ bb-d-offset b-block-size))
      (call dectivate-bitblt-buffer))
     (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word
     (parallel
      (assign bb-s-offset (+ bb-s-offset b-block-size))
      (return))))

(def-block-alu-write ubitblt-block-alu-write-8 8)
(def-block-alu-write ubitblt-block-alu-write-4 4)

;;Each time through the loop, s-word was fetched from memory like
;;          <----- s.bitpos ----->
;;ssssssssss......................
;;and then rotated so it looks like
;;......................ssssssssss
;;<----- s.bitpos ----->
;;
;:Each time, another s-word2 gets fetched and deposited into s-word like
;;          |<----- s.bitpos ----->
;;          |......................1111111111
;;2222222222 2222222222222222222222
;;
;;The rotation for the dpb equals the rotation for setup for next loop.

;;bb-s-word has the partial previous source word whose address is in bb-s-offset,
;;rotated into alignment with the destination, but not xored with bb-constant
(defucode ubitblt-d-aligned-row-both
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
       (assign b-temp (+ bb-s-offset (b-constant 8.)))
       (if (lesser-or-equal-fixnum bb-s-row-length b-temp)
	   (goto ubitblt-d-aligned-row-both-slow-loop)
	 (sequential
	  (parallel
	   (assign-vma-offset s 1)
	   (call ubitblt-rotated-block-read-8))
	  (parallel
	   (assign-vma-offset d)
	   (call-and-return-to ubitblt-block-alu-8 ubitblt-d-aligned-row-both)))))
    ;;Frob with whats left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4.)))
	 (if (lesser-or-equal-fixnum bb-s-row-length t-temp)
	     (goto ubitblt-d-aligned-row-both-slow-loop)
	   (sequential
	    (parallel
	     (assign-vma-offset s 1)
	     (call ubitblt-rotated-block-read-4))
	    (parallel
	     (assign-vma-offset d)
	     (call-and-return-to ubitblt-block-alu-4
				 ubitblt-d-aligned-row-both-slow-loop)))))
      (goto ubitblt-d-aligned-row-both-slow-loop))))

(defucode ubitblt-d-aligned-row-both-slow-loop		;17 cycles per word
  (incr-wrap-s-offset-ahead)		;2
  (parallel-with-s-access bb-s-offset-ahead
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-d-aligned-row-both-done)
    (assign byte-s (1- bb-s-bitpos))
    (assign bb-s-word2 memory-data))
  (assign byte-r (32- bb-s-bitpos))
  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))	;1
  (assign bb-s-word (logxor bb-constant-a bb-s-word))		;1
  (parallel						;1+3
   (assign-vma-offset d)
