:TITLE[BitBlt];

%Edit by Ed Fiala 21 April 1982: fix 64k boundary problems in initialization;
  bum 4 cycles in initialization.
Edit by Ed Fiala 11 March 1982: replace some Ats with DispTables; add
  refill at 377b; reformat; bum 3 mi, 4 cycles in initialization.
Edit by Ed Fiala 5 November 1981: Change interrupt exits.
Edit by Fiala 28 August 1981: Exit to P4Tail rather than MesaBBret;
  MIPend change.
Edit by Fiala 28 April 1981: Check IntPending instead of NWW.
Edit by Neely March 25, 1981  6:04 PM Fix another bug that causes BitBlt to
  touch one page past the source and cause an address fault.
Edit by Jim Frandeen March 11, 1981  10:23 AM Fix bug that causes BitBlt to
  touch one page past the source and cause an address fault.
Edit by Neely February 25, 1981  5:07 PM Make GrayBrick addressing consistent
  with PrincOps and Dandelion.
Edit by Neely February 10, 1981  7:51 AM Fix Bug @ bbFirstGrayWord
Edit by Johnsson February 9, 1981  7:02 PM  New MemStat for BitBlt
Edit by Jim Frandeen September 30, 1980  1:19 PM Remove use of bit 0 of NWW.
  Fix AR 5866: BitBlt goes too far if dstFunc=xor.
Edit by Jim Frandeen July 24, 1980  8:00 AM Fix right to left bug
Edit by Jim Frandeen July 21, 1980  12:59 PM Change use of AC2,AC3 to
  bbArgLo,bbArgHi to free these registers. Change to allow DestBit and
  SourceBit to be greater than 16D.
Edit by Jim Frandeen May 15, 1980  3:05 PM
%

*BitBltArg format. This is aligned on a sixteen word boundary in the MDS. Each line of bits is called an item.
*Word offsets and masks below are octal: width, height and flags are words
*8, 9 and 10 (decimal) of the table.  Bit numbers in the comments are also
*octal and count from the most significant bit of the 16-bit word, so bit 0
*has mask 100000 and bit 17 has mask 1.

Set[dstWordLo,0];	*Long pointer to destination item.
Set[dstWordHi,1];	*If direction is backward, this points to the last
			*item to be processed.
Set[dstBit,2];		*Bit within word of destination
Set[dstBpl,3];		*Destination bits per line: after each item, we add
			*this to the dest address to get the address of the
			*next item. If direction is backward, this is neg.
Set[srcWordLo,4];	*Long pointer to source item.
Set[srcWordHi,5];	*If direction is backward, this points to the last
			*item to be processed.
Set[srcBit,6];		*Bit within word of source
Set[srcBpl,7];		*Source bits per line: after each item, we add this
			*to the source address to get the address of the next
			*item. If direction is backward, this is neg.
Set[width,10];		*Width in bits of rectangle to be operated on.
			*Restricted to a max of 32,767.
Set[height,11];		*Height in bits of rectangle to move or number of
			*items to process. Restricted to a max of 32,767.
Set[flags,12];		*Flags are defined as follows:

MC[direction,100000];	*Bit 0: 0 = forward (left to right by increasing
			*memory addresses), 1 = backward (right to left by
			*decreasing memory addresses).
MC[disjoint,40000];	*Bit 1: 1 = source and destination are disjoint.
MC[disjointItems,20000];	*Bit 2: 1 = source and destination overlap,
			*but individual lines are disjoint.
MC[graySource,10000];	*Bit 3: 1 = source is Gray block.
MC[srcComplement,4000];	*Bit 4: 1 = source is complemented.
MC[dstFunc,3000];	*Bits 5-6: dest function: null, AND, OR, XOR

*The remaining bits are zero in BitBltFlags. We use these bits as follows:
*(The "justified value" of a type is the field right-justified, i.e. the
*value extracted by Dispatch[bbFlags,10,4]; it equals the corresponding
*InnerLoopTypeN dispatch value.)
MC[InnerLoopType,360];	*Bits 10-13: type of inner loop.
	MC[Type0,0];	*Inner Loop type 0 (justified value 0).
	MC[Type1,40];	*Inner Loop type 1 (justified value 2).
	MC[Type4,200];	*Inner Loop type 4 (justified value 10).
	MC[Type5,240];	*Inner Loop type 5 (justified value 12).
MC[RightToLeft,1];	*Bit 17: 1 = Right To Left, 0 = Left To Right.
			*If Flags is odd, direction is right to Left.
			*If Flags is odd, direction is right to Left.

%REGISTER USAGE:

bbArgLo,bbArgHi	This base register pair points to the BitBltArg table.

BitBltArgPtr	Points to the (hex aligned) BitBltArg table.

Dest		A quadword buffer that contains the next four words of
		destination data.

DestBpl		Contains destination bits per line. Initialized from DstBpl
		in the BitBltArg table. After each item, we add DestBpl to
		DestQLo and Hi and DestBit to get address of the next item.

DestQLo,Hi	Base register pair points to the next destination quadword.
		Initialized from dst BitAddress in BitBltArg table.

DestBitOffset	Offset in bits from DestQLo and Hi. Initialized from dst
		BitAddress in the BitBltArg table. Low order 6 bits are used
		to load DB to index the first bit in the first quadword. At
		the end of an item, we add DestBpl to the low order 6 bits
		to get the bit offset to the first quadword of the next item.

DestWordOffset	Offset in words from DestQLo and Hi to the current quadword
		of the item. Initialized to zero at the start of each item.
		Incremented by 4 each time another Dest quadword is fetched.

Flags		Contains Flags from BitBltArg table as well as other flags
		described above.

GrayWordIndex	Index into the next word in the gray block if the source is
		gray. We use the four low order bits of BitBltArgPtr. Since
		the BitBltArg table is hex aligned, these bits are zero the
		first time we are called.

GrayWord	Contains next word of the gray block if the source is gray.

ItemWidth	Contains width in bits of the item to be operated on.
		Initialized to width from the BitBltArg table.

ItemsRemaining	Number of items remaining to be processed. Initialized to
		height from the BitBltArg table. Decremented by one each time
		through ItemRefill. When zero, we are done.

NegItemWidth	Contains negative width in bits of the item to be operated on.
		Initialized to width from the BitBltArg table and made negative.

NegSDNonOverlap	Negative of the number of non overlap bits between the source
		and destination. Used only for right to left transfers.

SDNonOverlap	Number of non overlap bits between source and destination. A
		temporary used only to compute the transfer direction.

Source		A quadword buffer containing the next four source words.

SourceBitOffset	The bit offset from SourceQLo/Hi. Initialized from src
		BitAddress in the BitBltArg table. Low order 6 bits are used
		to load SB to index the first bit in the first quadword. At
		the end of an item, we add SourceBpl to the low order 6 bits
		to get the bit offset to the first quadword of the next item.

SourceBpl	Source bits/line. Initialized from SrcBpl in the BitBltArg
		table. After each item, we add SourceBpl to SourceQLo/Hi and
		SourceBit to get the address of the next item.

SourceQLo,Hi	Base register pair points to the next source quadword.
		Initialized from src BitAddress in the BitBltArg table.

SourceSubitemBitOffset	Offset in bits from SourceQLo/Hi. Used to address the
		next subitem when moving Right to Left.

SourceWordOffset	Offset in words from SourceQLo/Hi to the current
		quadword of the item. Initialized to zero at the start of each
		item. Incremented by 4 each time another Source quadword is fetched.

SubitemDestBitOffset	Offset in bits from DestQLo/Hi. Used to address the
		next subitem when moving Right to Left.

SubitemSourceBitOffset	Offset in bits from SourceQLo/Hi. Used to address the
		next subitem when moving Right to Left.

TouchPages	Set to zero if it is not necessary to touch pages before each
		item; otherwise it is set to negative ItemWidth.
%
*DISPATCH TABLES:
*Each Loca names a table position on page bbP1 (octal offsets 20, 40, 60).
*InnerLoopDisp holds the entry mi of each inner loop, placed by inner loop
*type with At[InnerLoopDisp,...]; bbT4 and bbT5 hold the refill dispatch
*targets of the type 4 and type 5 inner loops, placed with At[bbT4,...] and
*At[bbT5,...].
Loca[InnerLoopDisp,bbP1,20];
Loca[bbT4,bbP1,40];
Loca[bbT5,bbP1,60];

*DISPATCH VALUES FOR BBFA:
*These are the offsets within a refill table of the five possible
*continuations after the second mi of an inner loop.  Values 0-2 are not
*defined in this file.
Set[ItemRefill,3];
Set[SourceDestRefill,4];
Set[SourceRefill,5];
Set[DestRefill,6];
Set[NoRefill,7];

*DISPATCH VALUES FOR INNER LOOP TYPE:
*These equal the justified values of Type0, Type1, Type4 and Type5 in the
*InnerLoopType field of the flags word.
Set[InnerLoopType0,0];
Set[InnerLoopType1,2];
Set[InnerLoopType4,10];
Set[InnerLoopType5,12];

	*Refill mi placed at location 377 of page bbP1 (the "refill at 377b"
	*of the 11 March 1982 edit): fetch four words at displacement 4 from
	*PCB into IBuf and continue at MesaRefill.
	*NOTE(review): presumably this is where NextInst (see bbExit) lands
	*when the instruction buffer is empty - confirm against the Mesa
	*emulator sources.
	PFetch4[PCB,IBuf,4], GoToP[MesaRefill], At[LShift[bbP1,10],377];

*The mi that follow are assembled onto page bbP1.
OnPage[bbP1];

*Start here from MesaESC, or restart here after an interrupt or a page fault.
*bbArgHi contains MDShi. The low order four bits of BitBltArgPtr, used for
*the GrayWordIndex, must be zeroed.
*Timing to here: 14.5+4 = 18.5 cycles
MesaBitBlt:
*Strip the low four bits of the table pointer held in Stack0.
	T ← (Stack0) and not (17C);
	bbArgLo ← T;	*Long pointer to BitBLT table in bbArgLo,bbArgHi
	bbTouchPages ← 0C;	*Assume we don't need to touch pages.
	PFetch1[bbArgLo,bbFlags,flags], Task;	*Fetch flags.
*MemStat is set back to Normal on both ways out of this opcode (bbExit and
*the interrupt exit after bbWakeups).
	MemStat ← BitBltFixup;
*Fetch the rectangle width into ItemWidth.
	PFetch1[bbArgLo,bbItemWidth,width];

%Set inner loop type in the Flags word and load T with the value of SALUFOP:
	T[10] = MA' (the NOT of the value of MA in the table below)
	T[11] = MB. The MB branch condition is used to indicate "no source".
	T[12:17] = ALU Operation
Bit Blt functions from bits 3-6 of flags:

C = 0 => the destination bits are to be cleared before the operation.
MA = 1 => bit positions in the input word not covered by the source mask are
filled with ones.
T is the type of inner loop:
	0: functions 1 - 3 and 5 - 7
	1: functions 0 and 4 
	4: functions 10 and 14
	5: functions 11 - 13 and 15 - 17

CODE	Gray	Source	Dest	C  MA T		SALUFOP	  Dest ←
		Compl	Func

 0	0	0	0 0	0  0  1		R OR T	  Source
 1	0	0	0 1	1  1  0		R AND T	  Source AND Dest
 2	0	0	1 0	1  0  0		R OR T	  Source OR Dest
 3	0	0	1 1	1  0  0		R XOR T	  Source XOR Dest
 4	0	1	0 0	0  1  1		R OR T'	  Source'
 5	0	1	0 1	1  0  0		R AND T'  Source' AND Dest
 6	0	1	1 0	1  1  0		R OR T'	  Source' OR Dest
 7	0	1	1 1	1  1  0		R XOR T'  Source' XOR Dest
10	1	0	0 0	0  0  4		R OR T	  Gry
11	1	0	0 1	1  1  5		R AND T	  Gry AND Dest
12	1	0	1 0	1  0  5		R OR T	  Gry OR Dest
13	1	0	1 1	1  0  5		R XOR T	  Gry XOR Dest
14	1	1	0 0	0  1  4		R OR T'	  Gry'
15	1	1	0 1	1  0  5		R AND T'  Gry' AND Dest
16	1	1	1 0	1  1  5		R OR T'	  Gry' OR Dest
17	1	1	1 1	1  1  5		R XOR T'  Gry' XOR Dest
%

*Timing to here 18.5+20 = 38.5 cycles
*Dispatch on flag bits 3-6 (graySource, srcComplement, dstFunc) into the
*sixteen-entry table below, one entry per CODE of the function table in the
*preceding comment.  Each entry loads T with the SALUFOP value; entries for
*inner loop types other than 0 also record the type in bbFlags.  The four
*XOR entries leave through bbMustTouchPages, which sets bbTouchPages to
*negative ItemWidth.  The fetch of srcBpl into SourceBpl is issued in the
*same mi as the Disp.
*Entries 0 and 4 (plain Source and Source') set Type1 and then pick up the
*same T value as the corresponding OR entry by going to bbOR and bbORNot.
bbSetupFunction:
	Dispatch[bbFlags,3,4];
	PFetch1[bbArgLo,bbSourceBpl,srcBpl], Disp[.+1];
	bbFlags ← (bbFlags) or (Type1), GoTo[bbOR], DispTable[20];	*Source Type 1
	T ← 056C, GoTo[bbSetSALUFOP];		*Source AND Dest Type 0
bbOR:	T ← 204C, GoTo[bbSetSALUFOP];		*Source OR Dest Type 0
	T ← 263C, GoTo[bbMustTouchPages];	*Source XOR Dest Type 0
	bbFlags ← (bbFlags) or (Type1), GoTo[bbORNot];	*Source' Type 1
	T ← 227C, GoTo[bbSetSALUFOP];		*Source' AND Dest Type 0
bbORNot:
	T ← 074C, GoTo[bbSetSALUFOP];		*Source' OR Dest Type 0
	T ← 054C, GoTo[bbMustTouchPages];	*Source' XOR Dest Type 0
	T ← 304C, GoTo[bbType4];		*Gry Type 4
	T ← 156C, GoTo[bbType5];		*Gry AND Dest Type 5
	T ← 304C, GoTo[bbType5];		*Gry OR Dest Type 5
	T ← 363C, GoTo[bbType5MustTouchPages];	*Gry XOR Dest Type 5
	T ← 174C, GoTo[bbType4];		*Gry' Type 4
	T ← 327C, GoTo[bbType5];		*Gry' AND Dest Type 5
	T ← 174C, GoTo[bbType5];		*Gry' OR Dest Type 5
	T ← 154C, GoTo[bbType5MustTouchPages];	*Gry' XOR Dest Type 5

bbType4:
	bbFlags ← (bbFlags) or (Type4), GoTo[bbSetSALUFOP];	*Type 4: no source

bbType5:
	bbFlags ← (bbFlags) or (Type5), GoTo[bbSetSALUFOP];	*Type 5: no source
  
*Gray XOR functions: record Type5, then fall into bbMustTouchPages.
bbType5MustTouchPages:
	bbFlags ← (bbFlags) or (Type5);	*Type 5: no source
*XOR functions: load SALUF from T, then set TouchPages to negative ItemWidth
*so that pages are touched before each item.
bbMustTouchPages:
	SALUF ← T, T ← bbItemWidth;
	bbTouchPages ← (Zero) - T, GoTo[bbInitializeRegs];

*All other functions: load SALUF from T and fall through into
*bbInitializeRegs with TouchPages still zero.
bbSetSALUFOp:
	SALUF ← T;

*If this is the first time called, StkP is 1.  TOS points to the BitBltArg
*table. StkP is read complemented: 376 => StkP = 1, 367 => StkP = 10.
*Timing: 38.5 + 9 + (2 if not type 0) + (2 if must touch pages) cycles
*  = 47.5 to 51.5 cycles
bbInitializeRegs:
*The low byte of T is the complemented StkP and the high byte is forced to
*ones, so (10 xnor T) is zero exactly when StkP = 10, i.e. on a restart.
	T ← (SStkP&NStkP) or not (377C);
	RTemp ← 10C;
	LU ← (RTemp) xnor T;	*Test for Stack pointer = 10
	StkP ← RTemp, GoTo[bbContinueAfterInterrupt,ALU=0];
*Continue if this is the first call to BitBlt. Initialize registers in the
*stack.  Set StkP to 10 because we will use all of the Stack to save state.
*Fetch long pointer to destination into DestQLo and Hi.
	PFetch2[bbArgLo,bbDestQLo,dstWordLo];
*Init GrayWordIndex to the initial Y offset specified for the gray block.
	T ← LdF[bbSourceBpl,4,4], Task;
	bbGrayWordIndex ← (bbGrayWordIndex) or T;
*Fetch long pointer to source into SourceQLo and Hi. Fetch SourceBitOffset
*and fetch SourceBpl into ItemsRemaining.
	PFetch4[bbArgLo,bbSourceQLo,srcWordLo];
*Base register format as built here: the right byte of the high pointer word
*goes into the left byte, and that value plus one into the right byte.
	T ← RHMask[bbDestQHi];	*Convert DestQHi to base register format.
	bbDestQHi ← (LSh[bbDestQHi,10]) + T + 1;
*Fetch DestBitOffset and DestBpl; jump if the source is not gray (MB').
	PFetch2[bbArgLo,bbDestBitOffset,dstBit], GoTo[bbInit2,MB'];
*Point SourceQ at start of gray brick by subtracting initial yOffset from it.
	  T ← (bbGrayWordIndex) and (17C);
	  bbSourceQLo ← (bbSourceQLo) - T;
*Carry set means the subtraction did not borrow; otherwise decrement
*SourceQHi and recompute T at bbInit2.
	  T ← RHMask[bbSourceQHi], GoTo[bbInit3,Carry];	*Needed @ bbInit3.
	  bbSourceQHi ← (bbSourceQHi) - 1;
*Convert SourceQHi to base register format.
bbInit2:
	T ← RHMask[bbSourceQHi];
bbInit3:
	bbSourceQHi ← (LSh[bbSourceQHi,10]) + T + 1;
*Fetch height into ItemsRemaining.
	PFetch1[bbArgLo,bbItemsRemaining,height], Task;
*Prepare DestBitOffset for the first time through the loop.  DestBitOffset
*must point to the starting bit within the quadword.  DestBit can be more
*than 16D to start with, so we add any word portion to DestQLo.
	T ← RSh[bbDestBitOffset,4];
	bbDestQLo ← (bbDestQLo) + T;
*On carry out of DestQLo, add one to both bytes of DestQHi.
	T ← 60C, Skip[Carry'];
	  bbDestQHi ← (bbDestQHi) + (400C) + 1;
*T = (low two bits of DestQLo) * 16, the bit offset of the starting word
*within its quadword; add the bit within the word to it.
	T ← (LSh[bbDestQLo,4]) and T;
	bbDestBitOffset ← (LdF[bbDestBitOffset,14,4]) + T;
	LU ← bbItemsRemaining;	*Test for zero height.
*Prepare SourceBitOffset for the first time through the loop.  SourceBitOffset
*must point to the starting bit within the quadword.  SourceBit can be more
*than 16D to start with, so we add any word portion to SourceQLo.
	T ← RSh[bbSourceBitOffset,4], Skip[ALU#0];
	  GoTo[bbExit];		*Completion return if height is zero.
*If Source is gray (MB), go straight to bbCommonInit without converting
*SourceBitOffset to a bit offset within the quadword.
	bbSourceQLo ← (bbSourceQLo) + T, GoTo[bbCommonInit,MB];
	T ← 60C, Skip[Carry'];
	  bbSourceQHi ← (bbSourceQHi) + (400C) + 1;
	T ← (LSh[bbSourceQLo,4]) and T;
	bbSourceBitOffset ← (LdF[bbSourceBitOffset,14,4]) + T, GoTo[bbCommonInit];

*Come here if the opcode has been executed again after an interrupt.  Fetch
*data from BitBltArg table that was not saved in the stack.
bbContinueAfterInterrupt:
	PFetch1[bbArgLo,bbDestBpl,dstBpl];
*Both the first-call and the restart paths join here.
bbCommonInit:
	SB ← bbSourceBitOffset;
*Initialize the negative of the ItemWidth for setting MNBR.
	T ← bbItemWidth;
	bbNegItemWidth ← (Zero) - T, Skip[ALU#0];
	  GoTo[bbExit];	*Completion return - item width is zero
*bbTask is a bare Return, so the Call just comes back to the next mi.
*NOTE(review): presumably the call is there to permit a task switch - confirm.
	MNBR ← bbNegItemWidth, Call[bbTask];

*Determine direction of operation. If direction is backward, items are not
*disjoint, and rectangles are not disjoint, we must move right to left.
*When direction is backward, source and destination addresses point to the
*beginning of the last item to be transferred, and the source and destination
*bits per line are negative.
bbDetermineDirection:
*Flag bits 0-2 are direction, disjoint and disjointItems.  The value 4 means
*backward with neither disjoint bit set, the only case sent to bbBTRL.
	T ← 4C;
	LU ← (LdF[bbFlags,0,3]) xor T;
*T = ItemWidth is used by the first mi at bbBTRL.
	T ← bbItemWidth, GoTo[bbBTRL,ALU=0];
*Continue if Left to Right.
	bbDestWordOffset ← 0C, GoTo[bbPrepareInnerLoop2];

*For Bottom to Top, Right to Left initialization.  If non-overlap .ls. four
*words (64 bits), we can still move left to right. This works because we pick
*up the next source quadword before we store the dest quadword.
*The constant 100 in the comments below is octal (64 decimal bits).
*On entry T = ItemWidth.
bbBTRL:	bbTouchPages ← (Zero) - T;	*We must touch pages
	T ← LdF[bbSourceQLo,0,16];
*T = (dest words - source words)/4
	T ← (LdF[bbDestQLo,0,16]) - T, Task;
*SDNonOverlap = (dest words - source words)/4, i.e. the quadword difference
	bbSDNonOverlap ← T;
	T ← LdF[bbSourceBitOffset,12,6];
*T = dest bit offset - source bit offset
	T ← (LdF[bbDestBitOffset,12,6]) - T, Task;
*Convert the quadword difference to bits (shift left 6) and add the
*difference of the bit offsets within the quadwords.
	T ← bbSDNonOverlap ← (LSh[bbSDNonOverlap,6]) + T;
	LU ← LdF[bbSDNonOverlap,0,12];	*Test non-overlap < 100
	bbNegSDNonOverlap ← (Zero) - T, Skip[ALU#0];	*If non-overlap >= 100
*Left to Right if non-overlap < 100.
	  bbDestWordOffset ← 0C, GoTo[bbPrepareInnerLoop2];
	LU ← LdF[bbItemWidth,0,12];	*Test item width < 100
*Test ItemWidth - SDNonOverlap
	LU ← (bbItemWidth) - T, Skip[ALU#0];	*If ItemWidth >= 100
*Left to Right if ItemWidth < 100.
	  bbDestWordOffset ← 0C, GoTo[bbPrepareInnerLoop2];
	bbDestWordOffset ← 0C, Skip[ALU<0];
*Right to Left if ItemWidth >= SDNonOverlap
	  bbFlags ← (bbFlags) or (RightToLeft);
*Left to Right if ItemWidth < SDNonOverlap
	GoTo[bbPrepareInnerLoop2];

*Avoid touching pages except when necessary.  If we get a page fault, the
*opcode will be restarted.  We cannot be restarted if the function is XOR or
*XNOR or if we are moving right to left, in which case there is a Source-Dest
*overlap.
*If not necessary to touch pages, TouchPages = 0.
bbPrepareInnerLoop2:
	T ← (bbTouchPages) + 1, GoTo[bbFetchFirstDest,R>=0];
*Continue if it is necessary to touch pages.  T = -ItemWidth + 1.  Start with
*the last page of the scan line and finish with a PFetch4 of the first
*quadword. The initial displacement is
*[(ItemWidth - 1 + DestBitOffset) RSh 4] + non-page bits of the first word.
*The choice of touching or not touching is indicated in TouchPages which
*contains 0 when not touching or negative ItemWidth when touching.
*T ← DestBitOffset + ItemWidth-1.
	T ← (LdF[bbDestBitOffset,12,6]) - T, Call[bbShiftTRight4];
*Since DestBitOffset contains the bit offset within the quadword, we must
*mask off the low two bits of DestQLo as well as the page bits.  Then add in
*[(ItemWidth - 1 + DestBitOffset) RSH 4] which is saved in RTemp.  Any
*overflow into the high 8 bits are the page bits that we must touch.
	T ← (bbDestQLo) and (374C);
	T ← (RTemp) + T;
	T ← (LSh[AllOnes,10]) and T, Call[bbTouchDest];
*T will be zero if the item does not cross a page boundary.
*Continue here if the item crosses a page boundary. Decrement T by one page.
	T ← (LSh[AllOnes,10]) + T;
*bbTouchDest is both called and fallen into.  While the value computed by
*the preceding mi is nonzero it returns to the mi after the Call, which
*steps T down by one page and falls back in here; when that value is zero
*this is the last fetch and control continues at bbPrepareSource.
*NOTE(review): the PFetch4 has no explicit displacement; presumably T is
*used as the displacement - confirm.
bbTouchDest:
	PFetch4[bbDestQLo,bbDest], GoTo[bbPrepareSource,ALU=0];	*If last fetch
	Return;		*Fetch another quadword

*Subroutine: T ← RTemp ← (T RSh 4).  The shift is logical (zero fill); the
*negative-advance code ORs in the high bits itself when it needs a sign.
bbShiftTRight4:
	RTemp ← T;
	T ← RTemp ← RSh[RTemp,4], Return;

*No page touching needed for the destination: fetch the first dest quadword
*at displacement zero and fall into bbPrepareSource.
bbFetchFirstDest:
	PFetch4[bbDestQLo,bbDest,0];
*Zero SourceWordOffset for the new item.  With a gray source (MB) there is
*no source quadword to fetch, so go dispatch to the inner loop.
bbPrepareSource:
	bbSourceWordOffset ← 0C, GoTo[bbSourceIsGray,MB];	*If no source
*If not necessary to touch pages, TouchPages = 0.
	T ← (bbTouchPages) + 1, GoTo[bbFetchFirstSource,R>=0];
*Continue if it is necessary to touch pages.  T = negative ItemWidth + 1.
*Start with the last page of the scan line and finish with a PFetch4 of the
*first quadword.
*T ← SourceBitOffset + ItemWidth-1.
	T ← (LdF[bbSourceBitOffset,12,6]) - T, Call[bbShiftTRight4];
*Same displacement computation as for the destination: quadword-aligned
*word-in-page of SourceQLo plus the word count saved in RTemp, keeping only
*the page bits.
	T ← (bbSourceQLo) and (374C);
	T ← (RTemp) + T;
	T ← (LSh[AllOnes,10]) and T, Call[bbTouchSource];
*Continue here if the item crosses a page boundary.  Decrement T by one page.
	T ← (LSh[AllOnes,10]) + T;
*If last fetch
bbTouchSource:
	PFetch4[bbSourceQLo,bbSource], GoTo[bbPrepareDispToInnerLoop,ALU=0];
	Return;	*Fetch another quadword

*Gray source: set up the inner loop type dispatch and skip the right-to-left
*test made at bbPrepareDispToInnerLoop.
bbSourceIsGray:
	Dispatch[bbFlags,10,4], GoTo[bbDispToInnerLoop];

*No page touching needed for the source: fetch the first source quadword at
*displacement zero.
bbFetchFirstSource:
	PFetch4[bbSourceQLo,bbSource,0];
*Set up the dispatch on the inner loop type; if bbFlags is odd (RightToLeft)
*go initialize a right to left item instead.
bbPrepareDispToInnerLoop:
	Dispatch[bbFlags,10,4], GoTo[bbNewRightToLeftItem,R Odd];
*Load DB from DestBitOffset and enter the inner loop selected by the type.
bbDispToInnerLoop:
	DB ← bbDestBitOffset, Disp[bbInnerLoops];

*Start the inner loop for items that have a gray Source. Fetch the next word of the gray block.
*The gray inner loops call this with T = GrayWordIndex and 17.
*NOTE(review): the PFetch1 has no explicit displacement; presumably T is
*used as the displacement from SourceQLo - confirm.
bbStartGrayInnerLoop:
	PFetch1[bbSourceQLo,bbGrayWord];
*Common start for all inner loops, called from the first mi of each loop:
*clear the low two bits of DestQLo, do a BBFBX, and return to the mi after
*the Call (the first BBFA of the loop).
bbStartInnerLoop:
	bbDestQLo ← (bbDestQLo) and not (3C);
	BBFBX, Return;

*Initialize to move right to left.  We must move right to left when the Dest
*is on the same line as the Source, the Dest is to the right of the Source,
*and the amount of non overlap (SDNonOverlap) is more than 100B bits.  The
*hardware is not designed to do this, so it is not very efficient.  On the
*1st iteration, we add SDNonOverlap bits to SourceBitOffset and DestBitOffset
*to get the starting bit address.  On each iteration except for the last, we
*move SDNonOverlap bits.
*Set SubitemSourceBitOffset and SubitemDestBitOffset to point to the end of
*the item.
bbNewRightToLeftItem:
	T ← bbItemWidth, Task;
	T ← (LdF[bbSourceBitOffset,12,6]) + T;
	bbSubitemSourceBitOffset ← T;
	T ← bbItemWidth, Task;
	T ← (LdF[bbDestBitOffset,12,6]) + T;
*The Skip steps over the first mi of bbContinueRightToLeftItem; T already
*holds SubitemDestBitOffset.
	bbSubitemDestBitOffset ← T, Skip;
*Come here on ItemRefill when we were moving an item right to left.
*Calculate the number of bits left to transfer.  If number of bits remaining
*> SDNonOverlap, transfer SDNonOverlap bits; otherwise transfer number of
*bits remaining. If zero, we are finished with this item.
bbContinueRightToLeftItem:
	T ← bbSubitemDestBitOffset;
	T ← (LdF[bbDestBitOffset,12,6]) - T;
	RTemp ← T;	*T = RTemp = negative number of bits remaining.
	LU ← (bbNegSDNonOverlap) - T, Skip[ALU#0];	*If bits remaining # 0
*Last subitem.
	  bbItemsRemaining ← (bbItemsRemaining) - 1, GoTo[bbCommonItemRefill];
*Assume number of bits remaining to be moved; skip if SDNonOverlap > no. bits
*remaining.
	MNBR ← RTemp, Skip[ALU<0];
*Continue if no. bits remaining > than SDNonOverlap.  Move SDNonOverlap bits.
	  T ← MNBR ← bbNegSDNonOverlap;
*Align both base pointers to a quadword boundary.
	bbSourceQLo ← (bbSourceQLo) and not (3C);
	bbDestQLo ← (bbDestQLo) and not (3C);
*Decrement SubitemSourceBitOffset and SubitemDestBitOffset to point to the
*next Right to Left bits to transfer.  T contains the negative number to add
*to the offsets.
	bbSubitemDestBitOffset ← (bbSubitemDestBitOffset) + T;
	T ← bbSubitemSourceBitOffset ← (bbSubitemSourceBitOffset) + T, Call[bbShiftTRight4];
*T = SubitemSourceBitOffset RSh 4, the word offset of the subitem's source.
	bbSourceWordOffset ← T;
*Set SourceWordOffset and DestWordOffset to point to the correct starting
*quadword so that we can use the normal inner loops to fetch the next Source
*and Dest quadwords.
	PFetch4[bbSourceQLo,bbSource];
	SB ← bbSubitemSourceBitOffset;
	DB ← bbSubitemDestBitOffset;
	T ← RSh[bbSubitemDestBitOffset,4];
	PFetch4[bbDestQLo,bbDest];
	Dispatch[bbFlags,10,4];
	bbDestWordOffset ← T, Disp[bbInnerLoops];

%The mi with BBFA uses the source mask to mask the source word after it has
been aligned with the destination.  This mi does four operations:

(1) The T input on the right side is disabled by special logic. Instead, H2
input is loaded in bit positions not covered by the Source Mask. If MA=1,
these bits are filled with ones; if MA=0, these bit positions are filled with
zeroes.

(2) The word placed in T will contain the source bits correctly aligned with
the destination, filled with ones if MA=1, or with zeroes if MA=0.

(3) APC is set with a dispatch value for loop control.

(4) SB, DB, and MNBR are updated by the number of bits to be transferred by
this iteration.

The 2nd mi of the inner loop (containing BBFB or BBFBX) does three operations:

(1) It combines the destination bits with the source bits in T depending on
the operation previously loaded into SALUFOp.

(2) It updates SBX from SB, DBX from DB, and it calculates MWX for the next
iteration.

(3) It dispatches to the next instruction depending on the value loaded into
APC by the previous BBFA instruction:
	ItemRefill if MNBR is about to become zero.
	SourceDestRefill if SB and DB are exhausted
	SourceRefill if SB is exhausted
	DestRefill if DB is exhausted
	Continue if another loop can be performed
%

*Refill helpers reached from the inner loop dispatch tables.  The callers
*load T with DestWordOffset (or the updated SourceWordOffset) in the
*dispatching mi.
*NOTE(review): the PStore4/PFetch4 below have no explicit displacement;
*presumably T is used as the displacement - confirm.

*Dest exhausted: store the current dest quadword, then fetch the next one.
bbStoreDest:
	PStore4[bbDestQLo,bbDest], GoTo[bbFetchDest];

*Source exhausted: fetch the next source quadword.
*Return to the first instruction (BBFA) of the inner loop.
bbFetchSource:
	PFetch4[bbSourceQLo,bbSource], Return;

*Note: Doing PStore4 first allows both PFetch4s to be launched before
*transport for either occurs. If the PFetch4 for the source were done first,
*the PFetch4 for the dest could not be launched before transport for both
*preceding references had finished.
bbStoreDestFetchBoth:
	PStore4[bbDestQLo,bbDest];
*Advance SourceWordOffset by one quadword (4 words) and fetch it.
	T ← bbSourceWordOffset ← (bbSourceWordOffset) + (4C);
	PFetch4[bbSourceQLo,bbSource];
*Advance DestWordOffset by one quadword (4 words) and fetch it.
bbFetchDest:
	T ← bbDestWordOffset ← (bbDestWordOffset) + (4C);
*Return to the first instruction (BBFA) of the inner loop.
	PFetch4[bbDestQLo,bbDest], Return;

*This is the inner loop for functions 1 - 3 and 5 - 7: Source AND Dest,
*Source OR Dest, Source XOR Dest, NotSource AND Dest, NotSource OR Dest,
*NotSource XOR Dest.
*Set DestBit and DB to point to the starting bit in the dest quadword.
*We add T (the bit starting word in the quadword * 16) to DestBit. Set
*SourceBit and SB to point to the starting bit in the source quadword.
*NOTE(review): Form-4 presumably quadword-aligns SourceQLo, as
*bbStartInnerLoop does for DestQLo with "and not (3C)" - confirm.
bbInnerLoops:
	bbSourceQLo ← Form-4[bbSourceQLo], Call[bbStartInnerLoop], At[InnerLoopDisp,InnerLoopType0];
*First mi of the loop; the refill helpers return here.
	T ← BBFA[SB[bbSource]] OR T;
bbInnerLoopType0B:
	DB[bbDest] ← BBFBX[DB[bbDest]] SALUFOp T, Disp[.+1];
*The five mi below are the refill dispatch targets, in the order ItemRefill,
*SourceDestRefill, SourceRefill, DestRefill, NoRefill.
	T ← bbDestWordOffset, GoTo[bbItemRefill], DispTable[5,17,3];
	T ← bbDestWordOffset, GoTo[bbStoreDestFetchBoth];
	T ← bbSourceWordOffset ← (bbSourceWordOffset) + (4C), GoTo[bbFetchSource];
	T ← bbDestWordOffset, GoTo[bbStoreDest];
	T ← BBFA[SB[bbSource]] OR T, GoTo[bbInnerLoopType0B];


*Inner loop for functions 0 (Dest ← Source) and 4 (Dest ← notSource).
*Same shape as the type 0 loop, but the second mi uses BBFB rather than
*BBFBX.
	bbSourceQLo ← Form-4[bbSourceQLo], Call[bbStartInnerLoop], At[InnerLoopDisp,InnerLoopType1];
*First mi of the loop; the refill helpers return here.
	T ← BBFA[SB[bbSource]] OR T;
bbInnerLoopType1B:
	DB[bbDest] ← BBFB[DB[bbDest]] SALUFOp T, Disp[.+1];
*Refill dispatch targets, in the order ItemRefill, SourceDestRefill,
*SourceRefill, DestRefill, NoRefill.
	T ← bbDestWordOffset, GoTo[bbItemRefill], DispTable[5,17,3];
	T ← bbDestWordOffset, GoTo[bbStoreDestFetchBoth];
	T ← bbSourceWordOffset ← (bbSourceWordOffset) + (4C), GoTo[bbFetchSource];
	T ← bbDestWordOffset, GoTo[bbStoreDest];
	T ← BBFA[SB[bbSource]] OR T, GoTo[bbInnerLoopType1B];


*This is the inner loop for functions 10 (Gray) and 14 (notGray).
*Set DestBit and DB to point to the starting bit in the dest quadword.
*We add T (the bit starting word in the quadword * 16) to DestBit.
*Set SB equal DB so that source and dest will be exhausted at the same time.
*T = GrayWordIndex and 17 is passed to bbStartGrayInnerLoop, which fetches
*the gray word for this item.
	T ← (bbGrayWordIndex) and (17C), Call[bbStartGrayInnerLoop], At[InnerLoopDisp,InnerLoopType4];
*First mi of the loop; the refill helpers return here.
	T ← BBFA[bbGrayWord];
bbInnerLoopType4B:
	DB[bbDest] ← BBFB[DB[bbDest]] SALUFOp T, Disp[.+1];
*Refill dispatch targets, placed explicitly in table bbT4.  There is no
*source quadword to refill, so SourceDestRefill is handled like DestRefill
*and SourceRefill like NoRefill.
	T ← bbDestWordOffset, GoTo[bbItemRefill], At[bbT4,ItemRefill];
	T ← bbDestWordOffset, GoTo[bbStoreDest], At[bbT4,SourceDestRefill];
	T ← BBFA[bbGrayWord], GoTo[bbInnerLoopType4B], At[bbT4,SourceRefill];
	T ← bbDestWordOffset, GoTo[bbStoreDest], At[bbT4,DestRefill];
	T ← BBFA[bbGrayWord], GoTo[bbInnerLoopType4B], At[bbT4,NoRefill];


*This is the inner loop for functions 11 - 13 (Gry AND Dest, Gry OR Dest,
*Gry XOR Dest) and 15 - 17 (notGry AND Dest, notGry OR Dest, notGry XOR Dest).
*Same shape as the type 4 loop, but the second mi uses BBFBX rather than
*BBFB and the refill targets live in table bbT5.
	T ← (bbGrayWordIndex) and (17C), Call[bbStartGrayInnerLoop], At[InnerLoopDisp,InnerLoopType5];
*First mi of the loop; the refill helpers return here.
	T ← BBFA[bbGrayWord];
bbInnerLoopType5B:
	DB[bbDest] ← BBFBX[DB[bbDest]] SALUFOp T, Disp[.+1];
*No source quadword to refill: SourceDestRefill is handled like DestRefill
*and SourceRefill like NoRefill.
	T ← bbDestWordOffset, GoTo[bbItemRefill], At[bbT5,ItemRefill];
	T ← bbDestWordOffset, GoTo[bbStoreDest], At[bbT5,SourceDestRefill];
	T ← BBFA[bbGrayWord], GoTo[bbInnerLoopType5B], At[bbT5,SourceRefill];
	T ← bbDestWordOffset, GoTo[bbStoreDest], At[bbT5,DestRefill];
	T ← BBFA[bbGrayWord], GoTo[bbInnerLoopType5B], At[bbT5,NoRefill];

*Called from bbItemRefill.  If bbFlags is odd (RightToLeft set) continue
*with the next right to left subitem; otherwise return to the caller.
bbTestForRightToLeft:
	bbFlags, GoTo[bbContinueRightToLeftItem,R Odd];	*If Right to Left item.
*bbTask is also called on its own from other places as a do-nothing
*subroutine.
bbTask:	Return;

*Come here from all inner loops when the current item is exhausted. Store the
*last dest quadword. If this is the last item, exit.
*On entry T = DestWordOffset, loaded by the dispatching mi.
bbItemRefill:
	PStore4[bbDestQLo,bbDest], Call[bbTestForRightToLeft];
	bbItemsRemaining ← (bbItemsRemaining) - 1;
*Come here when the last Right to Left subitem has been processed. Subtract
*one from the number of items left to process.
*Reload MNBR for the next item and exit if ItemsRemaining just became zero.
bbCommonItemRefill:
	MNBR ← bbNegItemWidth, GoTo[bbExit,ALU=0];
	T ← bbDestBpl, GoTo[bbAdvancePositive,R>=0];
*Advance Dest and Source in the negative direction. Add Dest Bits Per Line to
*DestBit to get the number of bits to the next item. Then add the number of
*words to the next item to DestQLo and Hi.
	bbDestBitOffset ← (LdF[bbDestBitOffset,12,6]) + T;
	T ← RSh[bbDestBitOffset,4], GoTo[bbAdvanceDestNegative,ALU<0];
*The new bit offset is not negative, so the word displacement in T is added
*as a positive number.
	bbDestQLo ← (bbDestQLo) + T;
	T ← bbSourceBpl, Skip[Carry'];
	  bbDestQHi ← (bbDestQHi) + (400C) + 1;
	bbSourceBitOffset ← (LdF[bbSourceBitOffset,12,6]) + T, GoTo[bbAdvanceSource2];

*The new bit offset is negative.  RSh zero-fills, so OR ones into the top
*four bits of T to make the word displacement negative; when the add does
*not carry, DestQLo has borrowed and both bytes of DestQHi are decremented.
bbAdvanceDestNegative:
	T ← (LSh[AllOnes,14]) or T;
	bbDestQLo ← (bbDestQLo) + T;
	T ← bbSourceBpl, Skip[Carry];
	  bbDestQHi ← (bbDestQHi) - (400C) - 1;
	bbSourceBitOffset ← (LdF[bbSourceBitOffset,12,6]) + T;

*Same treatment for the source: branch to the positive case if the new
*SourceBitOffset is not negative, else sign-extend and add with borrow.
bbAdvanceSource2:
	T ← RSh[bbSourceBitOffset,4], GoTo[bbAdvanceSourcePositive,ALU>=0];
	T ← (LSh[AllOnes,14]) or T;
	bbSourceQLo ← (bbSourceQLo) + T;
	SB ← bbSourceBitOffset, Skip[Carry];
	  bbSourceQHi ← (bbSourceQHi) - (400C) - 1;
*Before starting the next item, check for a pending interrupt: go to
*bbWakeups if IntPending, otherwise to bbNoWakeups.  xfWDC is put through
*the ALU here so that bbWakeups can test it for zero.
bbTestWakeUpDisable:
	LU ← xfWDC, DblGoTo[bbWakeups,bbNoWakeups,IntPending];

*Advance Dest and Source to the next item when DestBpl is not negative.
*On entry T = DestBpl.
bbAdvancePositive:
	bbDestBitOffset ← (LdF[bbDestBitOffset,12,6]) + T;
	T ← RSh[bbDestBitOffset,4], Call[bbTask];
*With a gray source (MB) the source pointer is not advanced; step the gray
*word index instead.
	bbDestQLo ← (bbDestQLo) + T, GoTo[bbIncrementGrayIndex,MB];
*On carry out of DestQLo, add one to both bytes of DestQHi.
	T ← bbSourceBpl, Skip[Carry'];
	  bbDestQHi ← (bbDestQHi) + (400C) + 1;
	bbSourceBitOffset ← (LdF[bbSourceBitOffset,12,6]) + T;
	T ← RSh[bbSourceBitOffset,4];
*Also entered from bbAdvanceSource2 with T = SourceBitOffset RSh 4.
bbAdvanceSourcePositive:
	bbSourceQLo ← (bbSourceQLo) + T;
	SB ← bbSourceBitOffset, Skip[Carry'];
	  bbSourceQHi ← (bbSourceQHi) + (400C) + 1;
*If an interrupt is pending, skip bbNoWakeups and go test xfWDC at bbWakeups.
	LU ← xfWDC, Skip[IntPending];
bbNoWakeups:	*No interrupt requests
	  bbDestWordOffset ← 0C, GoTo[bbPrepareInnerLoop2];
*Carry on with the next item if xfWDC is nonzero; otherwise take the
*interrupt.
bbWakeups:	*One or more int. requests
	bbDestWordOffset ← 0C, GoTo[bbPrepareInnerLoop2,ALU#0];
*IntPending true implies NWW .ne. 0 and xfWDC .eq. 0 that interrupt will
*take; control will return by restarting the opcode.
	LoadPage[opPage0];
	MemStat ← Normal, GoToP[NopInt];

*If no source, increment GrayWordIndex to point to the next gray word.
*Entered from bbAdvancePositive right after DestQLo was advanced, so the
*first mi also finishes that add: on carry, both bytes of DestQHi are
*incremented.
bbIncrementGrayIndex:
	T ← LdF[bbSourceBpl,14,4], Skip[Carry'];	*Gray height minus one
	  bbDestQHi ← (bbDestQHi) + (400C) + 1;
	SB ← bbSourceBitOffset;
*Compare the current index (low four bits of GrayWordIndex) with the gray
*height minus one; if they are equal, wrap back to the first gray word.
	LU ← (LdF[bbGrayWordIndex,14,4]) - T;
	T ← bbDestBpl, Skip[ALU#0];
*First gray word
	  bbGrayWordIndex ← (bbGrayWordIndex) and not (17C), GoTo[bbTestWakeUpDisable];
*Next gray word
	bbGrayWordIndex ← (bbGrayWordIndex) + 1, GoTo[bbTestWakeUpDisable];

*Completion exit, taken when height or width is zero or when the last item
*has been processed: restore MemStat to Normal, set StkP from RZero, and go
*on to the next instruction from IBuf.
*NOTE(review): RZero is presumably a register holding zero, which would
*leave the stack empty on exit - confirm.
bbExit:
	MemStat ← Normal;
	LU ← NextInst[IBuf];
	StkP ← RZero, NIRet;

:END[BitBlt];