4,887,235
	313	314
	(parallel-with-s-access bb-s-offset
	  (assign bb-s-word (logxor bb-constant memory-data)))
	(parallel-with-d-access bb-d-offset
	(assign byte-r (a-constant 0))
	(assign byte-s (1- bb-width))
	(parallel-with-return
	  (store-word (dpb bb-s-word byte-s byte-r memory-data))
	  (lisp (trace-path #/2)))))
    (parallel-with-return
      (lisp (trace-path #/1)))))

;bb-s-word2 has the partial previous source word whose address is in bb-s-offset.
;rotated into aliagnment with the destination
(defucode ubitblt-d-aligned-row-source
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer-
      (sequential
        (assign b-temp (+ bb-s-offset (b-constant 8.)))
	(if (lesser-or-equal-fixnum bb-s-row-length b-temp)
	    (goto ubitblt-d-aligned-row-source-slow-loop)
	  (sequential
	    (parallel
	      (assign-vma-offset s 1)
	      (call ubitblt-block-read-8))
	   (parallel
	     (assign-vma-offset d)
	     (call ubitblt-d-aligned-block-write-8))
	   (parallel
	     (assign bb-s-offset (+ bb-s-offset b-block-size))
	     (jump ubitblt-d-aligned-row-source)))))
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4. 32.)))
	(sequential
	  (assign b-temp (+ bb-s-offset (b-constant 4)))
	  (if (lesser-or-equal-fixnum bb-s-row-length b-temp)
	      (goto ubitblt-d-aligned-row-source-slow-loop)
	    (sequential
	      (parallel
	        (assign-vma-offset s 1)
		(call ubitblt-block-read-4))
	      (parallel
	        (assign-vma-offset d)
		(call ubitblt-d-aligned-block-write-4))
	      (parallel
	        (assign bb-s-offset (+ bb-s-offset b-block-size))
		(jump ubitblt-d-aligned-row-source)))))
      (goto ubitblt-d-aligned-row-source-slow-loop))))

;;Each pass through this loop stores exactly one d word. Each time through,
;:bb-s-word2 will have the bits to use for the lower part of the d word (already
;;rotated into position), and another s wore will be fetched into bb-s-word.
;;Then s-word will get rotated when transferred into s-word2 in preporation for
;;next loop pass.

(defucode ubitblt-d-aligned-row-source-slow-loop	;13 cycles per- word
  (incr-wrap-s-offset-ahead)				;2
  (parallel-with-s-access bb-s-offset-ahead		;4
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-d-aligned-row-source-done)
    (assign byte-s (1- bb-s-bitpos))
    (assign bb-s-word (logxor bb-constant memory-data)))
  (assign byte-r (- (b-constant 32.) bb-s-bitpos))	;1
  (assign-vma-offset d)					;1
  (store-word (dpb bb-s-word byte-s byte-r bb-s-word2))	;1
  (assign bb-width (- bb-width (b-constant 32.)))	;1
  (incr-d-offset)					;1
  (assign bb-s-offset bb-s-offset-ahead)		;1
  (parallel						;1
    (assign bb-s-word2 (rotate bb-s-word byte-r))
    (lisp (trace-path #/.))
    (jump ubitblt-d-aligned-row-source)))

(defucode ubitblt-d-aligned-row-source-done
  (if (plus-fixnum bb-width)
      (sequential
       (assign b-temp (32- bb-s-bitpos))	;how many bits are valid in bb-s-word2
       (if (lesser-or-equal-fixnum bb-width b-temp)
	   ;;we have enough s bits
	   (parallel-with-d-access bb-d-offset
	     (assign byte-s (1- bb-width))
	     (parallel
	       (assign byte-r (b-constant 0))
	       (assign bb-s-word bb-s-word2))
	     (parallel
	       (lisp (trace-path #/4))
	       (parallel-with-return
		(store-word (dpb bb-s-word byte-s byte-r memory-data)))))
	 ;;need to get another source word
	 (sequential
4,887,235
	315	316
	   (parallel-with-s-access bb-s-offset-ahead
	     (assign byte-r (32- bb-s-bitpos))
	     (assign byte-s (1- bb-s-bitpos))
	     (assign bb-s-word (logxor bb-constant memory-data)))
	   (assign bb-s-word (dpb bb-s-word byte-s byte-r bb-s-word2))
	   (lisp (trace-path #/5))
	   (parallel-with-d-access bb-d-offset
	     (assign byte-s (1- bb-width))
	     (assign byte-r (a-constant 0))
	     (parallel-with-return
	       (store-word (dpb bb-s-word byte-s byte-r memory-data)))))))
    (parallel
     (lisp (trace-path #/3))
     (return))))


;alu depends only on destination bits
(defucode ubitblt-long-row-destination
  (if (plus-fixnum bb-d-bitpos)
      (sequential			;frob the first partial word
       (assign b-temp (32- bb-d-bitpos))
       (parallel-with-d-access bb-d-offset
	 (assign byte-s (1- b-temp))
	 (assign byte-r bb-d-bitpos)
	 (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data)))
       (incr-d-offset)
       (assign bb-width (- bb-width b-temp))
       (parallel
	 (assign bb-d-bitpos (b-constant 0))
	 (lisp (trace-path #/b))
	 (jump ubitblt-long-row-destination-loop)))
    (machine-version-case
     ((sim) (parallel
	     (lisp (trace-path #/a))
	     (jump ubitblt-long-row-destination-loop)))
     (otherwise (goto ubitblt-long-row-destination-loop)))))

(defucode ubitblt-long-row-destination-loop	;25 cycles per 8 words
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
        (parallel
	  (assign-vma-offset d)
	  (call ubitblt-block-read-8))
	(parallel
	  (assign-vma-offset d)
	  (call-and-return-to ubitblt-block-write-8
			      ubitblt-long-row-destination-loop)))
      ;;Frob with what's left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (parallel
	  (assign-vma-offset d)
	  (call ubitblt-block-read-4))
	 (parallel
	  (assign-vma-offset d)
	  (call-and-return-to ubitblt-block-write-4
			      ubitblt-long-row-destination-slow-loop)))
      (goto ubitblt-long-row-destination-slow-loop))))

(defucode ubitblt-long-row-destination-slow-loop	;5 cycles per word (bus interference)
  (parallel-with-d-access-check-write bb-d-offset
    (parallel
      (assign bb-width (- bb-width (b-constant 32.)))
      (trap-if (minus-fixnum obus) ubitblt-long-row-destination-done)) ;aborts the assign
    (parallel
     (lisp (trace-path #/1))
     (waiting-for-memory)
     (incr-d-offset))
    (parallel
     (store-word (logxor bb-constant memory-data))
     (jump ubitblt-long-row-destination-slow-loop))))

(defucode ubitblt-long-row-destination-done
  (if (plus-fixnum bb-width)
      (parallel-with-d-access bb-d-offset
	(assign byte-s (1- bt-width))
	(assign byte-r (a-constant 0))
	(parallel-with-return
	  (lisp (trace-path #/2))
	  (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data))))
    (parallel
     (lisp (trace-path #/1))
     (return))))

(defmacro def-bitblt-block-read (name n)
  '(defucode ,name
     (parallel
       (assign a-block-size (b-constant ,n))	;Used later to advance offsets
       (assign b-block-size obus)
       (start-memory block read))		;start first word
4,887,235
	317	318
     (parallel
      (waiting-for-memory)		;waiting for first word
      (start-memory block read))	;start second word
     ,@(loop for i from (- n-bitblt-buffers n)	below n-bitblt-buffers
	     collect '(abus-array-data
		       (assign (bitblt-buffer ,1)
			       (set-type (logxor bb-constant memory-data) dtp-fix))
		       ,(selectq (- n-bitblt-buffers i)
			  (1 '(return))
			  (2 nil)
			  (otherwise '(start-memory block read)))))))

(def-bitblt-block-read ubitblt-block-read-8 8) ;I suppose this when interned...
(def-bitblt-block-read ubitblt-block-read-4 4) ;... Will subsume this.

(defmacro def-bitblt-block-write (name n)
  '(defucode ,name
     (activate-bitblt-buffer)
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     collect '(parallel
		       (store-word (bitblt-buffer ,i) block)
		       (lisp (trace-path #/.))))
     (parallel
       (assign bb-d-offset (+ bb-d-offset b-block-size))
       (call deactivate-bitblt-buffer))
     (parallel-with-return
      (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word
      )))

(def-bitblt-block-write ubitblt-block-write-8 8)
(def-bitblt-block-write ubitblt-block-write-4 4)

(defmacro def-d-aligned-block-write (name n)
  '(defucode ,name
     (assign byte-s (1- bb-s-bitpos))
     (parallel
      (assign byte-r (- (b-constant 32.) bb-s-bitpos))
      (call activate-bitblt-buffer))
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     append '((parallel
		       (store-word (dpb (bitblt-buffer ,i) byte-s byte-r bb-s-word2) block)
		       (lisp (trace-path #/.)))
		      (assign bt-s-word2 (rotate (bitblt-buffer ,i) byte-r))))
     (parallel
       (assign bb-d-offset (+ bb-d-offset b-block-size))
       (call deactivate-bitblt-buffer))
     (parallel-with-return
       (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word
       )))

(def-d-aligned-block-write ubitblt-d-aligned-block-write-8 8.)
(def-d-aligned-block-write ubittlt-d-aligned-block-write-4 4.)

;;alu depends on neither source nor destination bits
(defucode ubitblt-long-row-neither
  (if (plus-fixnum bb-d-bitpos)
      (sequential
       (assign b-temp (32- bb-d-bitpos))
       (parallel-with-d-access bb-d-offset
	 (assign byte-r bb-d-bitpos)
	 (assign byte-S (1- b-temp))
	 (store-word (cpb bt-constant byte-s byte-r memory-data)))
       (incr-d-offset)
       (assign bb-width (- bb-width b-temp))
       (parallel
	 (assign bb-d-bitpos (b-constant 0))
	 (lisp (trace-path #/b))
	 (jump ubitblt-long-row-neither-loop)))
    (parallel
     (lisp (trace-path #/a))
     (jump ubitblt-long-row-neither-loop))))

(defucode ubitblt-long-row-neither-loop
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      (sequential
        (parallel
	  (assign-vma-offset d)
	  (call store-block-bb-constant-8))
	(assign bb-d-offset (+ bb-d-offset (b-constant 8.)))
	(parallel
	  (assign bb-width (- bb-width (b-constant (* 8. 32.))))
	  (jump ubitblt-long-row-neither-loop)))
    (sequential
     (dispatch-after-next (parallel (assign b-block-size (ldb bb-width 3 5))
				    (ldb bt-width 3 5))
			  ((7) (parallel (assign-vma-offset d)
					 (call-and-return-to store-block-bb-constant-7
							     ubitblt-long-row-neither-finish)))
			  ((6) (parallel (assign-vma-offset d)
					 (call-and-return-to store-block-bb-constant-6
							     ubitblt-long-row-neither-finish)))
4,887,235
	319	320
			  ((5) (parallel (assign-vma-offset d)
					 (call-and-return-to store-block-bb-constant-5
							     ubitblt-long-row-neither-finish)))
			  ((4) (parallel (assign-vma-offset d)
					 (call-and-return-to store-block-bb-constant-4
							     ubitblt-long-row-neither-finish)))
			  ((3) (parallel (assign-vma-offset d)
					 (call-and-return-to store-block-bb-constant-3
							     ubitblt-long-row-neither-finish)))
			  ((2) (parallel (assign-vma-offset d)
					 (call-and-return-to store-block-bb-constant-2
							     ubitblt-long-row-neither-finish)))
			  ((1) (assign-vma-offset d)
			       (parallel
				 (lisp (trace-path #/.))
				 (store-word bb-constant)
				 (jump ubitblt-long-row-neither-finish))))
     (parallel
      (take-dispatch)
      (trap-if (zero-fixnum b-block-size) ubitblt-long-row-neither-finish)))))

(defucode ubitblt-long-row-neither-finish
  (assign bb-d-offset (+ bb-d-offset b-block-size))
  (assign bb-width (logand bb-width (b-constant #o37)))
  (if (plus-fixnum bb-width)
      (parallel-with-d-access bb-d-offset
	(assign byte-r (a-constant 0))
	(assign byte-s (1- to-width))
	(parallel
	  (lisp (trace-path #/2))
	  (store-word (dpb bb-constant byte-s byte-r memory-data))
	  (return)))
    (parallel
     (lisp (trace-path U/1))
     (return))))

(defmacro store-block-bb-constant-routines (n)
  `(progn 'compile
	  ,@(loop with s = "ST0RE-BLOCK-BB-CONSTANT-~d"
		  for i from n downto 1
		  collect '(defucode ,(fintern s i)
			     (parallel
			      (store-word bb-constant block)
			      (lisp (trace-path #/,))
			      ,(if (> i 1)
				   ,(jump ,(fintern s (1- i)))
				 '(return)))))))

(store-block-bb-constant-routines 8.)

;;alu depends both source and destination bits
(defucode ubitblt-long-row-both
  (parallel
    (assign b-temp bb-d-bitpos)
    (if (zero-fixnum bb-d-bitpos)
	(if (zero-fixnum bb-s-bitpos)
	    (goto ubitblt-aligned-row-both)
	  (parallel-with-s-access bb-s-offset
	    ;;    SSSSSSSSSSSSSSSSSSSSSSSSSSSSS.ssss
	    ;;ddddddddddadddddddddddddddddddddd.
	    (assign byte-r (32- bb-s-bitpos))
	    (parallel
	      (assign bb-s-word (rotate memory-data byte-r))
	      (lisp (trace-path #/c))
	      (jump ubitblt-d-aligned-row-both))))
      (if (equal-fixnum bb-s-bitpos b-temp)
	  (sequential
	   (parallel-with-s-access bb-s-offset
	     ;;SSSSSSSSSSSSSSSSSSSSSSSSSS.ssssss
	     ;;dddddddddddddddddddddddddd.dddddd
	     (parallel
	       (assign byte-r (32- bb-s-bitpos))
	       (assign b-temp obus))
	     (assign byte-s (31- bb-s-bitpos))
	     (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
	   (assign byte-r bb-s-bitpos)
	   (parallel
	    (assign-vma-offset d)
	    ;;ssssssssssssssssssssssssss.ssssss
	    ;;DDDDDDDDDDDDDDDDDDDDDDDDDD.dddddd
	    (call bb-byte-alu-operation-dispatch))
	   ;; First partial word stored, turn into aligned case
	   (incr-wrap-s-offset)
	   (incr-d-offset)
	   (assign bb-width (- bb-width b-temp))
	   (assign bb-s-bitpos (b-constant 0))
	   (parallel
	     (assign bb-d-bitpos (b-constant 0))
	     (lisp (trace-path #/b))
	     (jump ubitblt-aliqned-row-both)))
4,887,235
	321	322
	(if (lesser-fixnum bb-s-bitpos b-temp)
	    (goto ubitblt-long-row-both-s-longer)
	    (goto ubitblt-long-row-both-s-shorter))))))

(defucode ubitblt-long-row-both-s-longer
  (assign b-temp (32- bb-d-bitpos))
  (parallel-with-s-access bb-s-offset
    (assign type-r (32- bb-s-bitpos))
    (assign byte-s (1- b-temp))
    (assign bb-s-word2 memory-data))
  ;;ssssSSSSSSSSSSSSSSSSSSSS........
  ;;    DDDDDDDDDDDDDDDDDDDDdddddddddddd
  ;;    <----- b-temp ----->
  (assign bb-s-word (logxor bb-constant (rotate bb-s-word2 byte-r)))
  ;;........ssssSSSSSSSSSSSSSSSSSSSS
  (parallel
    (assign byte-r bb-d-bitpos)
    (assign b-temp-2 bb-d-bitpos))
  (parallel
   (assign-vma-offset d)
   ;;ssssssssssssssssssssssss.ssssssss
   ;;    DDDDDDDDDDDDDDDDDDDD.dddddddddddd
   (call bb-byte-alu-operation-dispatch))
  (incr-d-offset)
  ;;Remaining are (32-(s.bitpos+(32-d.bitpos))) = d.bitpos-s.bitpos
  ;;    <-- 32-d.bitpos ---> <-s.bitpos->
  ;;SSSSssssssssssssssssssss.ssssssss
  ;;    dddddddddddddddddddd.dddddddddddd
  (assign byte-r (- b-temp-2 bb-s-bitpos))
  (assign bb-s-bitpos (+ bb-s-bitpos b-temp))
  (assign to-s-word (rotate bb-s-word2 byte-r))
  (assign bb-width (- bb-width b-temp))
  (parallel
   (assign bt-d-bitpos (b-constant 0))
   (lisp (trace-path #/d))
   (jump ubitblt-d-aligned-row-both)))

;Need two S words to do the first partial S word
(defucode ubitblt-long-row-both-s-shorter
  ;;    ssssssssssssssssssssssss.ssssssss
  ;;dddddddddddddddddddddddddddd.dddd
  (parallel-with-s-access bb-s-offset
    (assign byte-r (32- bb-s-bitpos))
    (assign byte-s (31- bb-s-bitpos))
    ;;    SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
    ;;dddddddddddddddddddddddddddd.dddd
    (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
  (incr-wrap-s-offset-ahead)
  ;;   <--> s.bitpos-d.bitpos
  ;;...SSSS|ssssssssssssssssssssssss.ssssssss
  ;;dddddddddddddddddddooddddddddddd.dddd
  (parallel-with-s-access bb-s-offset-ahead
    (assign byte-s (- bb-s-bitpos b-temp 1))
    (assign byte-r (32- bb-s-bitpos))
    (assign bb-s-word2 (logxor bb-constant memory-data)))
  ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
  ;;   dddd dddddddddddddddddddddddd.dddd
  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
  (assign byte-r bb-d-bitpos)
  (assign byte-s (31- bb-d-bitpos))
  ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
  ;;   DDDD DDDDDDDDDDDDDDDDDDDDDDDD.dddd
  (parallel
    (assign-vma-offset d)
    (call bb-type-alu-operation-dispatch))
  (incr-d-offset)
  (assign bb-s-offset bb-s-offset-ahead)
  ;;...SSSssss|sssssssssssssssssssssssss.ssssssss
  ;;      dddd ddddddddddddddddddddddddd.dddd
(assign byte-r (- b-temp bb-s-bitpos))
(assign bb-s-bitpos (- bb-s-bitpos t-temp))
(assign b-temp (32- bb-d-bitpos))
(assign bb-s-word (logxor (rotate bb-s-word2 byte-r) bb-constant))
(assign bb-width (- bb-width b-temp))
(parallel
(assign bb-d-bitpos (b-constant 0))
(lisp (trace-path #/e))
(jump ubitblt-d-aligned-row-both)))


(defucode ubitblt-aligned-row-both
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer-
      (sequential
       (assign b-temp (+ bb-s-offset (b-constant 8.)))
       (if (lesser-fixnum bt-s-row-length t-temp)
	   (goto ubitblt-aligned-row-both-slow-loop)
	 (sequential
	  (parallel
4,887,235
	323	324
	(assign-vma-offset s)
	(call ubitblt-block-read-8))
	  (parallel
	   (assign-vma-offset d)
	   (call-and-return-to ubitblt-block-alu-8 ubitblt-aligned-row-both)))))
    ;;Frob with what's left. Too bad dispatch blocks are expensive.
    (if	(greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4.)))
	 (if (lesser-fixnum bb-s-row-length b-temp)
	     (goto ubitblt-aligned-row-both-slow-loop)
	   (sequential
	    (parallel
	      (assign-vma-offset s)
	      (call ubitblt-block-read-4))
	    (parallel
	      (assign-vma-offset d)
	      (call-and-return-to ubitblt-block-alu-4
				  ubitblt-aligned-row-both-slow-loop)))))
      (goto ubitblt-aligned-row-both-slow-loop))))

(defucode ubitblt-aligned-row-both-slow-loop	;12 cycles per word
  (parallel-with-s-access bb-s-offset		;4 cycles
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-aligned-row-both-slow-loop-done)
    (waiting-for-memory)
    (assign bb-s-word (logxor bb-constant memory-data)))
  (parallel
    (assign-vma-offset d)			;1+3 cycles
    (call bb-word-alu-operation-dispatch))
  (assign bb-width (- bb-width (b-constant 32.)))	;1 cycle
  (incr-wrap-s-offset)				;2 cycles
  (parallel					;1 cycle
    (incr-d-offset)
    (lisp (trace-path #/.))
    (jump ubitblt-aligned-row-both)))

(defucode ubitblt-aligned-row-both-slow-loop-done
  (if (plus-fixnum bb-width)
      (sequential
        (parallel-with-s-access bb-s-offset
	  (assign byte-r (b-constant 0))
	  (assign byte-s (1- bb-width))
	  (assign bb-s-word (logxor bb-constant memory-data)))
	(parallel
	 (lisp (trace-path #/2))
	 (assign-vma-offset d)
	 (jump bb-byte-alu-operation-dispatch)))	;jcall
    (parallel-with-return
     (lisp (trace-path #/1)))))

(defucode ubitblt-block-alu-8
  (dispatch-after-this (ldb bt-alu-operation 4 0)
		       (parallel
			(assign a-block-size (a-constant 8.))
			(assign b-block-size (a-constant 8.))
			(start-memory block read))	;start first word
    ((1 2) (goto ubitblt-block-logand-8))		; x*y	~x*y
    ((4 8.) (goto ubitblt-block-andc2-8))		; x*~y	~x*~y
    ((6 9.) (goto ubitblt-block-logxor-8))		; x xor y, ~x xor y
    ((7 11.) (goto ubitblt-block-logior-7))		; x+y, ~x+y
    ((13. 14.) (goto ubitblt-block-lognand-8))))	; ~(~x*y), ~(x*y)


(defucode ubitblt-block-alu-4
  (dispatch-after-this (ldb bb-alu-operation 4 0)
		       (parallel
			(assign a-block-size (a-constant 4.))
			(assign b-block-size (a-constant 4.))
			(start-memory block read))	;start first word
	((1 2)	(goto ubitblt-block-logand-4))		; x*y ~x*y
	((4 8.)	(goto ubitblt-block-andc2-4))		; x*~y ~x*~y
	((6 9.)	(goto ubitblt-block-logxor-4))		; x xor y, ~x xor y
	((7 11.) (goto ubitblt-block-logior-4))		; x+y, ~x+y
	((13. 14.) (goto ubitblt-block-lognand-4))))	; ~(~x*y), ~(x*y)

(defmacro def-block-aluop (name n alu)
  (if (memq (get (caddr (microexpand '(,alu a-temp b-temp))) 'alu) weird-alu-functions)
      ;; Cannot simultaneously run ALU and store into the bitblt-buffer
      '(defucode ,name
	 (parallel
	   (waiting-for-memory)			;first word already started
	   (declare-memory-timing active-cycle))
	 (loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	       collect '(sequential
			  (abus-array-data
			    (assign b-temp (,alu (bitblt-buffer ,i) memory-data))
			    ,(if (> (- n-bitblt-buffers i) 1)
				 '(start-memory block read))) ;start next word
4,887,235
	325	326
			 (parallel
			   (assign (bitblt-buffer ,i) (set-type b-temp dtp-fix))
			   ,(if (= (- n-bitblt-buffers i) 1)
				'(jump ,(fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n)))))))
    ;;Normal case
    '(defucode ,name
       (parallel
	(waiting-for-memory)			;first word already started
	(declare-memory-timing active-cycle)
	(start-memory read block))	;start second word
       ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	       collect '(parallel
			 (abus-array-data
			  (assign (bitblt-buffer ,i) (set-type (,alu (bitblt-buffer ,i)
								     memory-data)
							       dtp-fix)))			       
			 ,(selectq (- n-bitblt-buffers 1)
				   (1 '(jump ,(fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n)))
				   (2 nil)
				   (otherwise (start-memory block read))) ;start word after next
			 )))))

(def-block-aluop ubitblt-block-logand-8 8 logand)
(def-block-aluop ubitblt-block-logior-8 8 logior)
(def-block-aluop ubitblt-block-logxor-8 8 logxor)
(def-block-aluop ubitblt-block-andc2-8 8 andc2)
(def-block-aluop ubitblt-block-lognand-8 8 lognand)

(def-block-aluop ubitblt-block-logand-4 4 logand)
(def-block-aluop ubitblt-block-logior-4 4 logior)
(def-block-aluop ubitblt-block-logxor-4 4 logxor)
(def-block-aluop ubitblt-block-andc2-4 4 andc2)
(def-block-aluop ubitblt-block-lognand-4 4 lognand)

(defmacro def-block-alu-write (name n)
  '(defucode ,name
     (parallel
      (assign-vma-offset d)
      (call activate-bitblt-buffer))
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     collect '(parallel
		       (store-word (bitblt-buffer ,i) block)
		       (lisp (trace-path #/.))))
     (parallel
      (assign bb-d-offset (+ bb-d-offset b-block-size))
      (call deactivate-bitblt-buffer))
     (assign bb-width (- bb-width (rotate b-block-size 5)))	;2^5 = bits-per-word
     (parallel
      (assign bb-s-offset (+ bb-s-offset b-block-size))
      (return))))


(def-block-alu-write ubitblt-block-alu-write-8 8)
(def-block-alu-write ubitblt-block-alu-write-4 4)


;;Each time through the loop, s-word was fetched from memory like
;;          <----- s.bitpos ----->
;;ssssssssss......................
;;and then rotated so it looks like
;;......................ssssssssss
;;<----- s.bitpos ----->
;;
;:Each time, another s-word2 gets fetched and deposited into s-word like
;;          |<----- s.bitpos ----->
;;          |......................1111111111
;;2222222222 2222222222222222222222
;;
;;The rotation for the dpb equals the rotation for setup for next loop.

;bb-s-word has the partial previous source word whose address is in bb-s-offset.
;rotated into alignment with the destination, but not xored with bb-constant
(defucode ubitblt-d-aligned-row-both
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into tho buffer
      (sequential
        (assign b-temp (+ bb-s-offset (b-constant 8.)))
	(if (lesser-or-equal-fixnum bb-s-row-length b-temp)
	    (goto ubitblt-d-aligned-row-both-slow-loop)
	  (sequential
	    (parallel
	      (assign-vma-offset s 1)
	      (call ubitblt-rotated-block-read-8))
	    (parallel
	     (assign-vma-offset d)
	     (call-and-return-to ubitblt-block-alu-8 ubitblt-d-aligned-row-both)))))
    ;;Frot with what's left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4.)))
	 (if (lesser-or-equal-fixnum bb-s-row-length t-temp)
	     (goto ubitblt-d-aligned-row-both-slow-loop)
4,887,235
	327	328
	(sequential
	 (parallel
	  (assign-vma-offset s 1)
	  (call ubitblt-rotated-block-read-4))
	 (parallel
	  (assign-vma-offset d)
	  (call-and-return-to ubitblt-block-alu-4
			      ubitblt-d-aligned-row-both-slow-loop)))))
      (goto ubitblt-d-aligned-row-both-slow-loop))))

(defucode ubitblt-d-aligned-row-both-slow-loop		;17 cycles per word
  (incr-wrap-s-offset-ahead) 				;2
  (parallel-with-s-access bb-s-offset-ahead		;4
    (trap-if (lesser-fixnum bb-width (n-constant 32.))
	     ubitblt-d-aligned-row-both-done)
    (assign byte-s (1- bb-s-bitpos))
    (assign bb-s-word2 memory-data))
  (assign byte-r (32- bb-s-bitpos))			;1
  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bt-s-word))	;1
  (assign bt-s-word (logxor bb-constant-a bb-s-word))		;1
  (parallel							;1+3
    (assign-vma-offset d)
    (call bb-word-alu-operation-dispatch))
  (assign bb-width (- bb-width (b-constant 32.)))		;1
  (incr-d-offset)						;1
  (assign bb-s-offset bb-s-offset-ahead)			;1
  (parallel
    (assign bb-s-word (rotate bb-s-word2 byte-r))
    (lisp (trace-path #/.))
    (jump ubitblt-d-aligned-row-both)))

;;At entry, we have s-word fetched from memory like
;;          <------s.bitpos------>
;;ssssssssss......................
;;but then rotated so it looks like
;;......................ssssssssss
;;<------s.bitpos------>
;;
;;This is to be combined with d-word which looks like
;;....................dddddddddddd
;;                    <---width-->
(defucode ubitblt-d-aligned-row-both-done
  (assign bb-s-word (logxor b-constant-a bb-s-word))
  (if (plus-fixnum bb-width)
      (sequential
       (assign b-temp (32- bb-s-bitpos))
       (if (lesser-or-equal-fixnum bb-width b-temp)
	   ;;we have enouqh s bits
	   ;;<----s.bitpos---><--a.temp--->
	   ;;.................sssssssssssssss
	   ;;....................dddddddddddd
	   ;;                    <---width-->
	   (sequential
	     (assign byte-r (b-constant 0))
	     (assign byte-s (1- bb-width))
	     (parallel
	       (assign-vma-offset d)
	       (lisp (trace-path #/4))
	       (jump bb-byte-alu-operation-dispatch))) ;jcall
	 ;;need to get another source word
	 ;;<----s.bitpos---><----a.temp--->
	 ;;.................sssssssssssssss
	 ;;............dddddddddddddddddddd
	 ;;            <-------width------>
	 (sequential
	  (parallel-with-s-access bb-s-offset-ahead
	    (assign byte-r b-temp)
	    (assign byte-s (1- bb-s-bitpos))
	    (assign bb-s-word2 (logxor memory-data bb-constant)))
	  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
	  (assign byte-r (b-constant 0))
	  (assign byte-s (1- bb-width))
	  (parallel
	   (assign-vma-offset d)
	   (lisp (trace-path #/5))
	   (jump bb-byte-alu-operation-dispatch)))))	;jcall
    (parallel-with-return
     (lisp (trace-path #/3)))))

;;bb-s-word has the previous source word, rotated but not xored with bb-constant
;;3 cycles per word seems to be the best I can do (can't rotate while storing in bitblt-buffer)
;;If bb-s-word was xored already, it would take 4 cycles per word here
(defmacro def-bitblt-rotated-block-read (name n)
  '(defucode ,name
     (assign byte-s (1- bb-s-bitpos))
     (parallel
       (assign a-block-size (b-constant ,n))	;Used later to advance offsets
       (assign b-block-size obus)
       (start-memory block read))		;start first word
4,887,235
	329	330
     (parallel
       (waiting-for-memory)			;waiting for first word
       (assign byte-r (32- bb-s-bitpos)))
     ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers
	     append '((abus-array-data
		        (assign bb-s-word2 (dpb memory-data byte-s byte-r bb-s-word)))
		      (parallel
		        (declare-memory-timing data-cycle)	;MD holds
			(assign bb-s-word (rotate memory-data byte-r))
			,(and (> (- n-bitblt-buffers i) 1)
			      '(start-memory block read)))
		      (parallel
		       (assign (bitblt-buffer ,i)
			       (set-type (logxor bb-constant bb-s-word2) dtp-fix))
		       ,(if (= (- n-bitblt-buffers i) 1)
			    '(return)))))))

(def-bitblt-rotated-block-read ubitblt-rotated-block-read-8 8)
(def-bitblt-rotated-block-read ubitblt-rotated-block-read-4 4)

(defucode ubitblt-long-row-source-backwards
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (parallel
	     (assign bb-s-offset (1+ bb-s-offset))  ;the loop will decr first, before pclsr
	     (lisp (trace-path #/a))
	     (jump ubitblt-aligned-row-source-backwards))
	   (sequential
	     (parallel-with-s-access bb-s-offset
	       (assign byte-r (32- bb-s-bitpos))
	       (parallel
		 (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))
		 (lisp (trace-path #/c))
		 (jump ubitblt-d-aligned-row-source-backwards)))))
       (if (equal-fixnum b-temp bt-s-bitpos)
	   (sequential
	     (parallel-with-s-access bb-s-offset
	       (assign byte-s (1- bb-s-bitpos))
	       (assign bb-s-word (logxor memory-data bb-constant)))
	     (parallel-with-d-access-check-write bb-d-offset
	       (decr-d-offset)
	       (parallel
		 (assign byte-r (b-constant 0))
		 (assign bb-s-bitpos (b-constant 0)))
	       (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	     ;; Now we can turn into the aligned case
	     (assign bb-width (- bb-width t-temp))
	     (parallel
	       (assign bb-d-bitpos (b-constant 0))
	       (lisp (trace-path #/b))
	       (jump ubitblt-aligned-row-source-backwards)))
	   (if (greater-fixnum bb-s-bitpos t-temp)	;s > d, enough in the current word
	       (sequential
		 (parallel-with-s-access bb-s-offset
		   (assign byte-s (1- bb-d-bitpos))
		   (assign byte-r (- b-temp bb-s-bitpos))
		   (assign bb-s-word (logxor bb-constant memory-data)))
		 (parallel-with-d-access-check-write bb-d-offset
		   (assign bb-s-bitpos (- bb-s-bitpos b-temp))
		   (assign bb-d-bitpos (b-constant 0))
		   (store-word (ldb bb-s-word byte-s byte-r memory-data)))
		 (assign bb-s-word (rotate bb-s-word byte-r))
		 (assign bb-width (- bb-width b-temp))
		 (parallel
		   (decr-d-offset)
		   (lisp (trace-path #/d))
		   (jump ubitblt-d-aligned-row-source-backwards)))
	       (sequential			;s < d, need to fetch another word
		 (parallel-with-s-access bb-s-offset
		   (parallel
		     (assign byte-r (- b-temp bb-s-bitpos))
		     (assign a-temp (- b-temp bb-s-bitpos)))
		   (assign byte-s (1- a-temp))
		   (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
		 (decr-wrap-s-offset-ahead)
		 (parallel-with-s-access bb-s-offset-ahead
		   (assign bb-s-word2 (logxor bb-constant memory-data)))
		 (assign bb-s-word (ldb bb-s-word byte-s byte-r bb-s-word))
		 (parallel-with-d-access bb-d-offset
		   (assign byte-r (b-constant 0d8))
		   (assign byte-s (1- bb-d-bitpos))
		   (store-word (ldb bb-s-word byte-s byte-r memory-data)))
		 (assign bb-s-bitpos (32- a-temp))
		 (assign byte-r a-temp)
		 (assign bb-s-word (rotate bb-s-word2 byte-r))
		 (assign bb-s-offset bb-s-offset-ahead)
		 (assign bb-width (- bb-width b-temp))
		 (assign bb-d-bitpos (b-constant 0))
		 (parallel
4,887,235
	331	332
	           (decr-d-offset)
		   (lisp (trace-path #/e))
		   (jump ubitblt-d-aligned-row-source-backwards))))))))

;bb-s-offset is 1+ the "real" value at this point
(defucode ubitblt-aligned-row-source-backwards 	;9 cycles per ward
  (decr-wrap-s-offset)				;1
  (parallel-with-s-access bb-s-offset		;4
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-aligned-row-source-backwards-done)
    (waiting-for-memory)
    (assign bb-s-word (logxor bb-constant memory-data)))
  (assign-vma-offset d)				;1
  (store-word bb-s-word)			;1
  (assign bb-width (- bb-width (b-constant 32.))) ;1
  (parallel					;1
    (decr-d-offset)
    (lisp (trace-path #/,))
    (jump ubitblt-aligned-row-source-backwards)))

(defucode ubitblt-aligned-row-source-backwards-done
  (if (plus-fixnum bb-width)
      (sequential
        (parallel-with-s-accees bb-s-offset
	  (assign byte-s (1- bb-width))
	  (assign byte-r bb-width)
	  (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
	(parallel-with-d-access bb-d-offset
	  (assign byte-r (32- bb-width))
	  (parallel-with-return
	    (store-word (dpb bb-s-word byte-s byte-r memory-data))
	    (lisp (trace-path #/2)))))
      (parallel-with-return
        (lisp (trace-path #/1)))))

;;each time through the loop, bb-s-word has the low part of the previous word
;;rotated to be at the high end of the word. We use it as background to LDB the
;;high part of the next word into it.

;bb-s-offset is 1+ the "real" value at this point
;could bum one cycle by moving assignment to byte-s out of loop,
;but this should use block mode anyway
(defucode ubitblt-d-aligned-row-source-backwards	;11 cycles per word
  (decr-wrap-s-offset)				;1
  (parallel-with-s-access bb-s-offset		;4
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-d-aligned-row-source-backwards-done)
    (assign byte-r (32- bb-s-bitpos))
    (assign bb-s-word2 (logxor bb-constant memory-data)))
  (assign byte-s (31- bb-s-bitpos))		;1
  (assign-vma-offset d)				;1
  (store-word (ldb bb-s-word2 byte-s byte-r bb-s-word)) ;1
  (assign bb-width (- bb-width (b-constant 32.))) ;1
  (decr-d-offset)				;1
  (parallel					;1
    (assign bb-s-word (rotate bb-s-word2 byte-r))
    (lisp (trace-path #/.))
    (jump ubitblt-d-aligned-row-source-backwards)))

(defucode ubitblt-d-aligned-row-source-backwards-done
  (parallel
    (assign bb-width-b bb-width)
    (if (plus-fixnum bb-width)
	(if (greater-or-equal-fixnum bb-s-bitpos bb-width-b)
	    (parallel-with-d-access bb-d-offset
	      (assign byte-r (b-constant 0))
	      (assign byte-s (31- bb-width))
	      (parallel-with-return
	        (store-word (ldb memory-data byte-s byte-r bb-s-word))
		(lisp (trace-path #/4))))
	  (sequential
	    (parallel-with-s-access bb-s-offset
	      (assign byte-r bb-width)
	      (assign bb-s-word (rotate bb-s-word byte-r))
	      (assign bb-s-word2 (logxor bb-constant memory-data)))
	    (parallel
	      (assign byte-r (- bb-width-b bb-s-bitpos))
	      (assign a-temp obus))
	    (assign byte-s (1- a-temp))
	    (assign bb-s-word (ldb bb-s-word2 byte-s byte-r- bb-s-word))
	    (parallel-with-d-access bb-d-offset
	      (assign byte-s (1- bb-width))
	      (assign byte-r (32- bb-width))
	      (parallel-with-return
	        (store-word (dpb bb-s-word byte-s byte-r memory-data))
		(lisp (trace-path #/5))))))
      (parallel-with-return
        (lisp (trace-path #/3))))))
