/*
 *			D I F F
 */

/*)BUILD	$(TKBOPTIONS) = {
			TASK	= ...DIF
		}
*/

#ifdef	DOCUMENTATION
title	diff	Differential File Comparison
index		Differential File Comparison

synopsis

	diff [option] file1 file2

description

	Diff compares two files, showing what must be changed to make
	them identical. Either file1 or file2 (but not both) may refer
	to directories.  If that is the case, a file in the directory
	whose name is the same as the other file argument will be used.
	The standard input may be used for one of the files by replacing
	the argument by "-".  Except for the standard input, both files
	must be on disk devices.
	.s
	Options:
	.lm +8
	.s.i -8;-b	Remove trailing whitespace (blanks and tabs)
	and compress all other strings of whitespace to a single blank.
	.s.i -8;-c	Print some context -- matching lines before
	and after the non-match section.  Mark non-matched sections
	with "|".
	.s.i -8;-i	Ignore lower/upper case distinctions.
	.s.i -8;-e	Output is in an "editor script" format which
	is compatible with the Unix 'ed' editor.
	.s.lm -8
	All information needed to compare the files is maintained in main
	memory. This means that very large files (or fairly large files with
	many differences) will cause the program to abort with an "out
	of space" message.  Main memory requirements (in words) are
	approximately:
	.s
		2 * (length of file1 + length of file2)
	.br
		+ 3 * (number of changes)
	.s
	(Where "length" is the number of lines of data in each file.)
	.s
	The algorithm reads each file twice, once to build hash tables and once
	to check for fortuitous matches (two lines that are in fact different,
	but which have the same hash value).  CPU time requirements include
	sorting the hash tables and randomly searching memory tables for
	equivalence classes. For example, on a time-shared VAX-11/780,
	comparing two 1000 line files required about 30 seconds (elapsed
	clock time) and about 10,000 bytes of working storage.  About 90
	per-cent of the time was taken up by file I/O.

diagnostics

	.lm +8
	.s.i -8;Warning, bad option 'x'
	.s
	The option is ignored.
	.s.i -8;Usage ...
	.s
	Two input files were not specified.
	.s.i -8;Can't open input file "filename".  Can't continue.
	.s.i -8;Out of space
	.s
	The program ran out of memory while comparing the two files.
	.s.i -8;Can't read line nnn at xxx in file[A/B]
	.s
	This indicates an I/O error when seeking to the specific line.
	It should not happen.
	.s.i -8;Spurious match, output is not optimal.
	.s
	Two lines that were different yielded the same hash value.  This is
	harmless except that the difference output is not the minimum set of
	differences between the two files.  For example, instead of the output:
	.s
		lines 1 to 5 were changed to ...
	.s
	the program will print
	.s
		lines 1 to 3 were changed to ...
	.br
		lines 4 to 5 were changed to ...
	.s
	The program uses a CRC16 hash code.  The likelihood of this error is
	quite small.
	.lm -8

author

	The diff algorithm was developed by J. W. Hunt and M. D. McIlroy,
	using a central algorithm defined by H. S. Stone.
	It was published in:
	.s.lm +4.nf
	Hunt, J. W., and McIlroy, M. D.,
	An Algorithm for Differential File Comparison,
	Computing Science Technical Report #41,
	Bell Laboratories, Murray Hill, NJ  07974
	.s.lm -4.f

bugs

	On RSX and DECUS C on VMS systems, diff may fail if the both
	files are not "variable-length, implied carriage control"
	format.  The scopy program can be used to convert files
	to this format if problems arise.

	When compiled under VAX C, diff handles STREAM_LF files
	properly (in addition to the canonical variable-length implied
	carriage control files).  Other variations should work, but
	have not been tested.

	When compiled under VAX C, diff is quite slow for unknown reasons
	which ought to be investigated.  On the other hand, it has access to
	effectively unlimited memory.

	Output in a form suitable for ed - the -e option - seems rather
	pointless; the analogue on DEC systems is SLP (SUMSLP on VMS).
	It would be simple to provide SLP-compatible output.  The question
	is, why bother - since the various DEC file comparison utilities
	already produce it.

#endif

/*
 * Diff maintains all information needed to compare the two files in main
 * memory.  This means that very large files (or fairly large files with
 * many differences) will cause the program to abort with an "out of space"
 * error.  Main memory requirements (in words) are approximately:
 *
 *	2 * (length of file1 + length of file2) + (3 * number of changes)
 *
 * The diff algorithm reads each file twice (once to build hash tables and
 * a second time to check for fortuitous matches), then reads the differences
 * by seeking randomly within the files.  CPU time requirements include
 * sorting the two hash vectors and randomly searching memory tables for
 * equivalence classes.  For example, running in Vax compatibility
 * mode, two 1000 line files with a fair number of differences took
 * about 25 seconds (elapsed wall clock time) for processing.  Most of this
 * time was spent in the file read routines.  This test required slightly
 * more than 6000 words of memory for internal tables.
 *
 * The diff algorithm was developed by J. W. Hunt and M. D. McIlroy,
 * using a central algorithm defined by H. S. Stone.  The algorithm
 * was described in:
 *
 *	Hunt, J. W., and McIlroy, M. D.,
 *	An Algorithm for Differential File Comparison,
 *	Computing Science Technical Report #41,
 *	Bell Laboratories, Murray Hill, NJ  07974
 *	
 * The following description is summarized from that document.  While
 * it has been slightly modified to correspond to the program source, the
 * algorithm is essentially identical.
 *
 * 1.	Read the input files, building two vectors containing the
 *	line number (serial) and hash value (hash) of each line.
 *	Data for fileA will be in a vector pointed to by fileA[],
 *	while data for fileB will be pointed to by fileB[]. The
 *	lengths (number of lines) of the files will be represented
 *	by lenA and lenB respectively.  [This is slightly different
 *	from the published algorithm.]
 *
 * 2.	Note initial and final sequences that have identical
 *	hash values to shorten subsequent processing.  Note that
 *	the "jackpot" phase (step 9.) will examine all lines in
 *	the file.  Next, sort each file using hash as the primary
 *	key and serial as the secondary key.
 *
 * 3.	Construct an array of equivalence classes (member[1..lenB])
 *	where each element contains the line number in fileB and a
 *	flag which is True if the indicated line is the first member
 *	of an equivalence class.  (An equivalence class is a set of
 *	lines which all hash to the same value.  The lines themselves
 *	are not necessarily identical.)
 *
 * 4.	Construct an array, class[1..lenA], where each element, I, is set to
 *	the index of a line, J, in fileB if member[J] is the first
 * 	element in an equivalence class and the hash code of line[I] in
 *	fileA is the same as the hash code of line[J] in fileB.  Class[I]
 *	is set to zero if no such line exists.
 *
 *	If non-zero, class[I] now points in member[] to the beginning of
 *	the class of lines in fileB equivalent to line[I] in fileA.
 *
 * The next two steps implement the longest common subsequence algorithm.
 *
 * 5.	Structure CANDIDATE { a, b, previous }, where a and b are line
 * 	numbers and previous a reference to a candidate, will store
 *	candidate lists as they are constructed.
 *
 *	Vector clist[] stores references to candidates.  It is dimensioned
 *	(0..min(lenA, lenB) + 1)
 *
 *	Initialize
 *		clist[0] = CANDIDATE {   0,   0, -1 };
 *		clist[1] = CANDIDATE { A+1, B+1, -1 };
 *		ktop = 1;
 *
 *	clist[1] is a fence beyond the last usefully filled element
 *	and -1 is an out-of-range clist index. Ktop is the index of the
 *	fence.  Note, because of the memory allocation used, clist[]
 *	is actually composed of two vectors, clist[] containing the
 *	candidate reference, and klist[] containing pointers to clist.
 *
 * 6.	For (A = 1 to lenA) {
 *		I = class[A];	-- the index in member[]:  beginning of
 *				-- the class of lines in fileB equivalent
 *				-- to this line in fileA.
 *		if (I is non-zero) {
 *			Merge each member into the candidate list
 *			as discussed below.
 *		}
 *
 * Unravel the chain of candidates, getting a vector of common subsequences:
 *
 * 7.	Set all elements of match[0..lenA] to zero.
 *
 * 8.	clist[ktop-1] points to the subsequence chain head.  For each element
 *	in the chain, let A and B be the line number entries.  Then, set
 *
 *		match[A] = B;
 *
 *	The non-zero elements of match[]  now pick out a longest common
 *	subsequence chain, possibly including spurious matches due to
 *	hash coincidences.  The pairings between the two files are:
 *
 *	if (match[A] is non-zero) {
 *		line A in fileA matches line match[A] in fileB;
 *	}
 *
 * Now, read each line of fileA and fileB to check for jackpots:
 *
 * 9.	for (A = 1 to lenA) {
 *		if (match[A] is nonzero) {
 *			if (fileA[A] is not identical to fileB[match[A]])
 *				match[A] = 0;	-- Hash congruence
 *		}
 *	}
 *
 * Ignoring "squish" complications, the merge step may be defined as follows:
 *
 *	Entry:
 *		clist[]		Candidate pointer array
 *		ktop		Fence beyond last filled index
 *		A		Current index in fileA
 *		member[]	Equivalence vector
 *		I		The index in member[] of the first element
 *				  of the class of lines in fileB that are
 *				  equivalent to line[A] in fileA.
 *
 * 1.	Let clist[R] be "an r-candidate" and C a reference to
 *	the last candidate found, which will always be an r-candidate.
 *	clist[R] will be updated with this reference once the previous
 *	value of clist[R] is no longer needed.  Initialize:
 *
 *		R = 0;
 *		C = clist[0];
 *
 * 2.	Do steps 3 through 6 repeatedly:
 *
 *   3.	set B to the line number in member[I];
 *	search clist[R..ktop] for an element, clist[S], such that
 *
 *		clist[S-1].b < B and clist[S].b >= B
 *
 *	Note that clist[] is ordered on clist[].b so that binary
 *	search will work.  The search algorithm used requires the
 *	two "fence" entries described above.
 *
 *	If such an element is found, perform steps 4. and 5.
 *
 *	4. Extend the longest common subsequence chain:
 *
 *		If (clist[S].b > B) {
 *			clist[R] = C;
 *			R = S;
 *			C = candidate(A, B, clist[S - 1]);
 *		}
 *
 *	5. Extend the number of subsequences, moving the fence:
 *
 *		If (S == ktop) {
 *			clist[ktop + 1] = clist[ktop]
 *			ktop = ktop + 1;
 *			break out of step 2's loop;
 *		}
 *
 *   6.	I = I + 1;
 *	if (member[I] is the first element in another class) {
 *		break out of step 2's loop;
 *	}
 *	else {
 *		continue at step 2.
 *	}
 *
 * 7.	clist[R] = C;
 *	exit merge subroutine.
 *
 * To illustrate vector contents, here is a sample run:
 *
 * File A:
 *	line 1
 *	line 2
 *	line 3
 *	line 4
 *	line 5 gets deleted
 *	line 6
 *
 * File B:
 *	line 1
 *	line 1.5 inserted
 *	line 2
 *	line 3 changed
 *	line 4
 *	line 6
 *
 * (For clarity, the "squish" step is omitted from the following)
 *
 * On entry to equiv() (after readin and sorting), the file[] vector is
 * as follows (the first entry in each pair is the line number, the
 * second is the hash value.  Entries are sorted on hash value):
 *
 * FileA[] (1..lines in fileA):
 *   line   hash
 *	3 042400  6 043300  5 050026  1 102201  2 102701  4 103501
 * FileB[] (1..lines in fileB):
 *	6 043300  2 045600  1 102201  3 102701  5 103501  4 147166
 *
 *
 * After Equiv has processed file[]:
 *
 * FileA[] (1..lines in fileA):
 *   line value
 *	3 0	6 1	5 0	1 3	2 4	4 5
 * Member[] (0..lines in fileB)
 *	0	-6	-2	-1	-3	-5	-4
 *
 *
 * After unsort() has unwound fileB:
 *
 * Class[] (1 .. lines in fileA):
 *	3	 4	0	5	0	1
 *
 * Within unravel(), match is built in the following order:
 *
 *	match[6] := 6
 *	match[4] := 5
 *	match[2] := 3
 *	match[1] := 1
 *
 * Match[] (0 .. lines in fileA):
 *
 *	 0	1	3	0	5	0	6
 *
 * Output is as follows:
 *
 *	1a2
 *	> line 1.5 inserted
 *	3c4
 *	< line 3
 *	---
 *	> line 3 changed
 *	5d5
 *	< line 5 gets deleted
 *
 *
 */

#include <stdio.h>
#include <ctype.h>
#ifdef vms
#include		<ssdef.h>
#include		<stsdef.h>
#define	IO_SUCCESS	(SS$_NORMAL | STS$M_INHIB_MSG)
#define	IO_ERROR	SS$_ABORT
#endif
/*
 * Note: IO_SUCCESS and IO_ERROR are defined in the Decus C stdio.h file
 */
#ifndef	IO_SUCCESS
#define	IO_SUCCESS	0
#endif
#ifndef	IO_ERROR
#define	IO_ERROR	1
#endif
#define	EOS		0
#define	TEMPFILE	"diff.tmp"
#define	TRUE		1
#define	FALSE		0

#ifdef	pdp11
#define	short	int
#endif

extern long	ftell();

typedef struct candidate {
	int	b;			/* Line in fileB		*/
	int	a;			/* Line in fileA		*/
	int	link;			/* Previous candidate		*/
} CANDIDATE;

typedef struct line {
	unsigned short	hash;		/* Hash value etc.		*/
	short		serial;		/* Line number			*/
} LINE;

LINE	*file[2];			/* Hash/line for total file	*/
#define	fileA	file[0]
#define	fileB	file[1]

LINE	*sfile[2];			/* Hash/line after prefix	*/
#define	sfileA	sfile[0]
#define	sfileB	sfile[1]

int	len[2];				/* Actual lines in each file	*/
#define	lenA	len[0]
#define	lenB	len[1]

int	slen[2];			/* Squished lengths		*/
#define	slenA	slen[0]
#define	slenB	slen[1]

int	prefix;				/* Identical lines at start	*/
int	suffix;				/* Identical lenes at end	*/

FILE	*infd[2] = { NULL, NULL };	/* Input file identifiers	*/
FILE	*tempfd;			/* Temp for input redirection	*/

/*
 * The following vectors overlay the area defined by fileA
 */

int		*class;			/* Unsorted line numbers	*/
int		*klist;			/* Index of element in clist	*/
CANDIDATE	*clist;			/* Storage pool for candidates	*/
int		clength	= 0;		/* Number of active candidates	*/

int		*match;			/* Longest subsequence		*/
long		*oldseek;		/* Seek position in file A	*/

/*
 * The following vectors overlay the area defined by fileB
 */

int		*member;		/* Concatenated equiv. classes	*/
long		*newseek;		/* Seek position in file B	*/
char		*textb;			/* Input from file2 for check	*/

/*
 * Global variables
 */

int		eflag	= FALSE;	/* Edit script requested	*/
int		bflag	= FALSE;	/* Blank supress requested	*/
int		cflag	= FALSE;	/* Context printout		*/
int		iflag	= FALSE;	/* Ignore case requested	*/
char		text[257];		/* Input line from file1	*/
#ifndef	vms
char		*free_space;		/* For storage allocation	*/
#endif
extern char	*myalloc();		/* Storage allocator		*/
extern char	*compact();		/* Storage compactor		*/

#ifdef	DEBUG
#define	TIMING
#endif
#ifdef	TIMING
extern long	time();
extern char	*$$mend;
long		totaltime;
long		sectiontime;
char		*mstart;
#endif

main(argc, argv)
int		argc;
char		**argv;
/*
 * Diff main program
 */
{
	register int	i;
	register char	*ap;

#ifdef	TIMING
	sectiontime = time(&totaltime);
#endif
#ifdef vms
	argc = getredirection(argc,argv);
#endif
	while (argc > 1 && *(ap = argv[1]) == '-' && *++ap != EOS) {
		while (*ap != EOS) {
			switch (tolower(*ap++)) {
			case 'b':
					bflag++;
					break;

			case 'c':
					cflag++;
					break;

			case 'e':
					eflag++;
					break;

			case 'i':
					iflag++;
					break;

			default:
					fprintf(stderr,
						"Warning, bad option '%c'\n",
						ap[-1]);
					break;
			}
		}
		argc--;
		argv++;
	}

	if (argc != 3)
		error("Usage: diff [-options] file1 file2");
	if (cflag && eflag) {
		fprintf(stderr,
			"Warning, -c and -e are incompatible, -c supressed.\n");
		cflag = FALSE;
	}
	argv++;
	for (i = 0; i <= 1; i++) {
		if (argv[i][0] == '-' && argv[i][1] == EOS) {
			infd[i] = stdin;
			if ((tempfd = fopen(TEMPFILE, "w")) == NULL)
				cant(TEMPFILE, "work", 1);
		}
		else {
			infd[i] = fopen(argv[i], "r");
		}
	}
	if (infd[0] == NULL && infd[1] == NULL) {
		cant(argv[0], "input", 0);
		cant(argv[1], "input", 1);
	}
	else if (infd[1] == NULL)
		opendir(1, &argv[1], infd[0]);
	else if (infd[0] == NULL)
		opendir(0, &argv[0], infd[1]);
#ifndef	vms
	free_space = malloc(1);
#endif
	/*
	 * Read input, building hash tables.
	 */
	input(0);
	input(1);
	squish();
#ifdef	DEBUG
	printf("before sort\n");
	for (i = 1; i <= slenA; i++)
		printf("sfileA[%d] = %6d %06o\n",
			i, sfileA[i].serial, sfileA[i].hash);
	for (i = 1; i <= slenB; i++)
		printf("sfileB[%d] = %6d %06o\n",
			i, sfileB[i].serial, sfileB[i].hash);
#endif
	sort(sfileA, slenA);
	sort(sfileB, slenB);
#ifdef	TIMING
	ptime("input");
#endif
#ifdef	DEBUG
	printf("after sort\n");
	for (i = 1; i <= slenA; i++)
		printf("sfileA[%d] = %6d %06o\n",
			i, sfileA[i].serial, sfileB[i].hash);
	for (i = 1; i <= slenB; i++)
		printf("sfileB[%d] = %6d %06o\n",
			i, sfileB[i].serial, sfileB[i].hash);
#endif
	/*
	 * Build equivalence classes.
	 */
	member = (int *)fileB;
	equiv();
	member = (int *)compact((char *)member, (slenB + 2) * sizeof (int),
			"squeezing member vector");
	/*
	 * Reorder equivalence classes into array class[]
	 */
	class = (int *)fileA;
	unsort();
	class = (int *)compact((char *)class, (slenA + 2) * sizeof (int),
			"compacting class vector");
#ifdef	TIMING
	ptime("equiv/unsort");
#endif
	/*
 	 * Find longest subsequences
	 */
	klist = (int *)myalloc((slenA + 2) * sizeof (int), "klist");
	clist = (CANDIDATE *)myalloc(sizeof (CANDIDATE), "clist");
	i = subseq();
	free((char *)member);
	free((char *)class);
	match = (int *)myalloc((lenA + 2) * sizeof (int), "match");
	unravel(klist[i]);
	free(clist);
	free(klist);
#ifdef	TIMING
	ptime("subsequence/unravel");
#endif
	/*
	 * Check for fortuitous matches and output differences
	 */
	oldseek = myalloc((lenA + 2) * sizeof (* oldseek), "oldseek");
	newseek = myalloc((lenB + 2) * sizeof (* newseek), "newseek");
	textb = myalloc(sizeof text, "textbuffer");
	if (check(argv[0], argv[1]))
		fprintf(stderr, "Spurious match, output is not optimal\n");
#ifdef	TIMING
	ptime("check");
#endif
	output();
#ifdef	TIMING
	ptime("output");
	printf("%ld seconds required\n", sectiontime - totaltime);
#endif
	if (tempfd != NULL) {
		fclose(tempfd);
		delete(TEMPFILE);
	}
}
input(which)
int		which;			/* 0 or 1 to redefine infd[]	*/
/*
 * Read the file, building hash table
 */
{
	register LINE		*lentry;
	register int		linect;
	register short		hashval;
	FILE			*fd;
	unsigned short		hash();

	linect = 0;
	lentry = (LINE *)myalloc(sizeof(LINE) * 3, "line");
	fd = infd[which];
	while (!getline(fd, text)) {
		lentry = (LINE *)compact((char *)lentry,
			(++linect + 3) * sizeof(LINE),
			"extending line vector");
		lentry[linect].hash = hash(text);
	}
	/*
	 * If input was from stdin ("-" command), finish off the temp file.
	 */
	if (fd == stdin) {
		fclose(tempfd);
		tempfd = infd[which] = fopen(TEMPFILE, "r");
	}
	len[which] = linect;
	file[which] = lentry;
}
squish()
/*
 * Look for initial and trailing sequences that have identical hash values.
 * Don't bother building them into the candidate vector.
 */
{
	register int	i;
	register LINE	*ap;
	register LINE	*bp;
	int		j;
	int		k;

	/*
	 * prefix -> first line (from start) that doesn't hash identically
	 */
	i = 0; ap = &fileA[1]; bp = &fileB[1];
	while (i < lenA && i < lenB && ap->hash == bp->hash) {
		i++; ap++; bp++;
	}
	prefix = i;
	/*
 	 * suffix -> first line (from end) that doesn't hash identically
	 */
	j = lenA - i;
	k = lenB - i;
	ap = &fileA[lenA];
	bp = &fileB[lenB];
	i = 0;
	while (i < j && i < k && ap->hash == bp->hash) {
		i++; ap--; bp--;
	}
	suffix = i;
	/*
	 * Tuck the counts away
	 */
	for (k = 0; k <= 1; k++) {
		sfile[k] = file[k] + prefix;
		j = slen[k] = len[k] - prefix - suffix;

		for (i = 0, ap = sfile[k]; i <= slen[k]; i++, ap++) {
			ap->serial = i;
		}
	}
}
sort(vector, vecsize)
LINE		*vector;		/* What to sort			*/
int		vecsize;		/* How many to sort		*/
/*
 * Sort hash entries
 */
{
	register int	j;
	register LINE	*aim;
	register LINE	*ai;
	int		mid;	
	int		k;
	LINE		work;

	for (j = 1; j <= vecsize; j *= 2);
	mid = (j - 1);
	while ((mid /= 2) != 0) {
		k = vecsize - mid;
		for (j = 1; j <= k; j++) {
			for (ai = &vector[j]; ai > vector; ai -= mid) {
				aim = &ai[mid];
				if (aim < ai)
					break;	/* ?? Why ??		*/
				if (aim->hash > ai->hash ||
						aim->hash == ai->hash &&
						aim->serial > ai->serial)
					break;
				work.hash = ai->hash;
				ai->hash = aim->hash;
				aim->hash = work.hash;
				work.serial = ai->serial;
				ai->serial = aim->serial;
				aim->serial = work.serial;
			}
		}
	}
}
equiv()
/*
 * Build equivalence class vector
 */
{
	register LINE	*ap;
	register union {
		LINE	*bp;
		int	*mp;
	} r;
	register int	j;
	LINE		*atop;

#ifdef	DEBUG
	printf("equiv entry\n");
	for (j = 1; j <= slenA; j++)
		printf("sfileA[%d] = %6d %06o\n",
				j, sfileA[j].serial, sfileA[j].hash);
	for (j = 1; j <= slenB; j++)
		printf("sfileB[%d] = %6d %06o\n",
				j, sfileB[j].serial, sfileB[j].hash);
#endif
	j = 1;
	ap = &sfileA[1];
	r.bp = &sfileB[1];
	atop = &sfileA[slenA];
	while (ap <= atop && j <= slenB) {
		if (ap->hash < r.bp->hash) {
			ap->hash = 0;
			ap++;
		}
		else if (ap->hash == r.bp->hash) {
			ap->hash = j;
			ap++;
		}
		else {
			r.bp++;
			j++;
		}
	}
	while (ap <= atop) {
		ap->hash = 0;
		ap++;
	}
	sfileB[slenB + 1].hash = 0;
#ifdef	DEBUG
	printf("equiv exit\n");
	for (j = 1; j <= slenA; j++)
		printf("sfileA[%d] = %6d %06o\n",
				j, sfileA[j].serial, sfileA[j].hash);
	for (j = 1; j <= slenB; j++)
		printf("sfileB[%d] = %6d %06o\n",
				j, sfileB[j].serial, sfileB[j].hash);
#endif
	ap = &sfileB[0];
	atop = &sfileB[slenB];
	r.mp = &member[0];
	while (++ap <= atop) {
		r.mp++;
		*r.mp = -(ap->serial);
		while (ap[1].hash == ap->hash) {
			ap++;
			r.mp++;
			*r.mp = ap->serial;
		}
	}
	r.mp[1] = -1;
#ifdef	DEBUG
	for (j = 0; j <= slenB; j++)
		printf("member[%d] = %d\n", j, member[j]);
#endif
}
unsort()
/*
 * Build class vector
 */
{
	register int	*temp;
	register int	*tp;
	register union {
		LINE	*ap;
		int	*cp;
	} u;
	LINE		*evec;
	int		*eclass;
#ifdef	DEBUG
	int		i;
#endif

	temp = (int *)myalloc((slenA + 1) * sizeof(int), "unsort scratch");
	u.ap = &sfileA[1];
	evec = &sfileA[slenA];
	while (u.ap <= evec) {
#ifdef	DEBUG
		printf("temp[%2d] := %06o\n", u.ap->serial, u.ap->hash);
#endif
		temp[u.ap->serial] = u.ap->hash;
		u.ap++;
	}
	/*
	 * Copy into class vector and free work space
	 */
	u.cp = &class[1];
	eclass = &class[slenA];
	tp = &temp[1];
	while (u.cp <= eclass)
		*u.cp++ = *tp++;
	free((char *) temp);
#ifdef	DEBUG
	printf("unsort exit\n");
	for (i = 1; i <= slenA; i++)
		printf("class[%d] = %d %06o\n", i, class[i], class[i]);
#endif
}
subseq()
/*
 * Generate maximum common subsequence chain in clist[]
 */
{
	int			a;
	register int		ktop;
	register int		b;
	register int		s;
	int			r;
	int			i;
	int			cand;

	klist[0] = newcand(0, 0, -1);
	klist[1] = newcand(slenA + 1, slenB + 1, -1);
	ktop = 1;				/* -> guard entry	*/
	for (a = 1; a <= slenA; a++) {
		/*
		 * For each non-zero element in fileA ...
		 */
		if ((i = class[a]) == 0)
			continue;
		cand = klist[0];		/* No candidate now	*/
		r = 0;				/* Current r-candidate	*/
		do {
#ifdef	DEBUG
			printf("a = %d, i = %d, b = %d\n", a, i, member[i]);
#endif
			/*
			 * Perform the merge algorithm
			 */
			if ((b = member[i]) < 0)
				b = -b;
#ifdef	DEBUG
			printf("search(%d, %d, %d) -> %d\n",
					r, ktop, b, search(r, ktop, b));
#endif
			if ((s = search(r, ktop, b)) != 0) {
				if (clist[klist[s]].b > b) {
					klist[r] = cand;
					r = s;
					cand = newcand(a, b, klist[s-1]);
#ifdef	DEBUG
					dumpklist(ktop, "klist[s-1]->b > b");
#endif
				}
				if (s >= ktop) {
					klist[ktop + 1] = klist[ktop];
					ktop++;
#ifdef	DEBUG
					klist[r] = cand;
					dumpklist(ktop, "extend");
#endif
					break;
				}
			}
		} while (member[++i] > 0);
		klist[r] = cand;
	}
#ifdef	DEBUG
	printf("last entry = %d\n", ktop - 1);
#endif
	return(ktop - 1);			/* Last entry found	*/
}
int
newcand(a, b, pred)
int		a;		/* Line in fileA			*/
int		b;		/* Line in fileB			*/
int		pred;		/* Link to predecessor, index in cand[]	*/
{
	register CANDIDATE	*new;

	clength++;
	clist = (CANDIDATE *)compact((char *)clist,
			clength * sizeof (CANDIDATE),
			"extending clist");
	new = &clist[clength - 1];
	new->a = a;
	new->b = b;
	new->link = pred;
	return(clength - 1);
}
search(low, high, b)
register int	low;
int		high;
int		b;
/*
 * Search klist[low..top] (inclusive) for b.  If klist[low]->b >= b,
 * return zero.  Else return s such that klist[s-1]->b < b and
 * klist[s]->b >= b.  Note that the algorithm presupposes the two
 * preset "fence" elements, (0, 0) and (slenA, slenB).
 */
{
	register int		temp;
	register int		mid;

	if (clist[klist[low]].b >= b)
		return(0);
	while ((mid = (low + high) / 2) > low) {
		if ((temp = clist[klist[mid]].b) > b)
			high = mid;
		else if (temp < b)
			low = mid;
		else {
			return(mid);
		}
	}
	return(mid + 1);
}
unravel(k)
register int	k;
{
	register int		i;
	register CANDIDATE	*cp;
	int			first_trailer;
	int			difference;

	first_trailer = lenA - suffix;
	difference = lenB - lenA;
#ifdef	DEBUG
	printf("first trailer = %d, difference = %d\n",
		first_trailer, difference);
#endif
	for (i = 0; i <= lenA; i++) {
		match[i] = (i <= prefix) ? i
			: (i > first_trailer) ? i + difference
			: 0;
	}
#ifdef	DEBUG
	printf("unravel\n");
#endif
	while (k != -1) {
		cp = &clist[k];
#ifdef	DEBUG
		if (k < 0 || k >= clength)
			error("Illegal link -> %d", k);
		printf("match[%d] := %d\n", cp->a + prefix, cp->b + prefix);
#endif
		match[cp->a + prefix] = cp->b + prefix;
		k = cp->link;
	}
}
check(fileAname, fileBname)
char		*fileAname;
char		*fileBname;
/*
 * Check for hash matches (jackpots) and collect random access indices to
 * the two files.
 */
{
	register int	a;		/* Current line in file A	*/
	register int	b;		/* Current line in file B	*/
	int		jackpot;

/*
 * The VAX C ftell() returns the address of the CURRENT record, not the
 * next one (as in DECUS C or, effectively, other C's).  Hence, the values
 * are "off by one" in the array.  OFFSET compensates for this.
 */
#ifdef vms
#define OFFSET (-1)
#else
#define OFFSET 0
#endif

	b = 1;
	rewind(infd[0]);
	rewind(infd[1]);
/*
 * See above; these would be over-written on VMS anyway.
 */
#ifndef vms
	oldseek[0] = ftell(infd[0]);
	newseek[0] = ftell(infd[1]);
#endif

	jackpot = 0;
#ifdef	DEBUG
	printf("match vector\n");
	for (a = 0; a <= lenA; a++)
		printf("match[%d] = %d\n", a, match[a]);
#endif
	for (a = 1; a <= lenA; a++) {
		if (match[a] == 0) {
			getline(infd[0], text);
			oldseek[a+OFFSET] = ftell(infd[0]);
			continue;	
		}
		while (b < match[a]) {
			getline(infd[1], textb);
			newseek[b+OFFSET] = ftell(infd[1]);
			b++;
		}
		getline(infd[0], text);
		getline(infd[1], textb);
		if (!streq(text, textb)) {
			fprintf(stderr,	"Spurious match:\n");
			fprintf(stderr, "line %d in %s, \"%s\"\n",
				a, fileAname, text);
			fprintf(stderr, "line %d in %s, \"%s\"\n",
				b, fileBname, textb);
			match[a] = 0;
			jackpot++;
		}
		oldseek[a+OFFSET] = ftell(infd[0]);
		newseek[b+OFFSET] = ftell(infd[1]);
		b++;
	}
	for (; b <= lenB; b++) {
		getline(infd[1], textb);
		newseek[b+OFFSET] = ftell(infd[1]);
	}
/*
 * The logical converse to the code up above, for NON-VMS systems, to
 * store away an fseek() pointer at the beginning of the file.  For VMS,
 * we need one at EOF...
 */
#ifdef vms
	getline(infd[0],text);			/* Will hit EOF...	*/
	oldseek[lenA] = ftell(infd[0]);
	getline(infd[1],textb);			/* Will hit EOF...	*/
	newseek[lenB] = ftell(infd[1]);
#endif

	return(jackpot);
}
output()
{
	register int	astart;
	register int	aend;
	int		bstart;
	register int	bend;

	rewind(infd[0]);
	rewind(infd[1]);
	match[0] = 0;
	match[lenA+1] = lenB + 1;
	if (!eflag) {
		/*
		 * Normal printout
		 */
		for (astart = 1; astart <= lenA; astart = aend + 1) {
			/*
			 * New subsequence, skip over matching stuff
			 */
			while (astart <= lenA
				  && match[astart] == (match[astart - 1] + 1))
				astart++;
			/*
			 * Found a difference, setup range and print it
			 */
			bstart = match[astart - 1] + 1;
			aend = astart - 1;
			while (aend < lenA && match[aend + 1] == 0)
				aend++;
			bend = match[aend + 1] - 1;
			match[aend] = bend;
			change(astart, aend, bstart, bend);
		}
	}
	else {
		/*
		 * Edit script output -- differences are output "backwards"
		 * for the benefit of a line-oriented editor.
		 */
		for (aend = lenA; aend >= 1; aend = astart - 1) {
			while (aend >= 1
				  && match[aend] == (match[aend + 1] - 1)
				  && match[aend] != 0)
				aend--;
			bend = match[aend + 1] - 1;
			astart = aend + 1;
			while (astart > 1 && match[astart - 1] == 0)
				astart--;
			bstart = match[astart - 1] + 1;
			match[astart] = bstart;
			change(astart, aend, bstart, bend);
		}
	}
	if (lenA == 0)
		change(1, 0, 1, lenB);
}
change(astart, aend, bstart, bend)
int		astart;
int		aend;
int		bstart;
int		bend;
/*
 * Output a change entry: fileA[astart..aend] changed to fileB[bstart..bend]
 */
{
	/*
	 * This catches a "dummy" last entry
	 */
	if (astart > aend && bstart > bend)
		return;
	range(astart, aend);
	putchar((astart > aend) ? 'a'
			: (bstart > bend) ? 'd' : 'c');
	if (!eflag)
		range(bstart, bend);
	putchar('\n');
	if (!eflag) {
		fetch(oldseek, astart, aend, lenA, infd[0], "< ");
		if (astart <= aend && bstart <= bend)
			printf("---\n");
	}
	fetch(newseek, bstart, bend, lenB, infd[1], (!eflag) ? "> " : "");
	if (eflag && bstart <= bend)
		printf(".\n");
}
range(from, to)
int		from;
int		to;
/*
 * Print a range
 */
{
	printf("%d", (to > from) ? from : to);
	if (to >= from)
		printf(",%d", to);
}
fetch(seekvec, start, end, trueend, fd, pfx)
long		*seekvec;
register int	start;
register int	end;
int		trueend;
FILE		*fd;
char		*pfx;
/*
 * Print the appropriate text
 */
{
	register int	i;
	int		first;
	int		last;

	first = last = FALSE;
	if (cflag) {
		if (start > end)
			return;
		if (start > 1) {
			start--;
			first++;
		}
		if (end < trueend-1) {
			end++;
			last++;
		}
	}
	if (fseek(fd, seekvec[start - 1], 0) != 0) {
		printf("?Can't read line %d at %08lx (hex) in file%c\n",
			start, seekvec[start - 1],
			(fd == infd[0]) ? 'A' : 'B');
	}
	else {
		for (i = start; i <= end; i++) {
			if (fgetss(text, sizeof text, fd) == NULL) {
				printf("** Unexpected end of file\n");
				break;
			}
#ifdef DEBUG
			printf("%5d: %s%s\n", i, pfx, text);
#else
			if (cflag) {
				if (first || (last && i == end)) {
					putchar(' ');
					first = FALSE;
				}
				else	putchar('|');
			}
			else	printf("%s", pfx);
			printf("%s\n", text);
#endif
		}
	}	
}
getline(fd, buffer)
FILE		*fd;
char		*buffer;
/*
 * Input routine, read one line to buffer[], return TRUE on eof, else FALSE.
 * The terminating newline is always removed.  If "-b" was given, trailing
 * whitespace (blanks and tabs) are removed and strings of blanks and
 * tabs are replaced by a single blank.  Getline() does all hacking for
 * redirected input files.
 */
{
	register char	*top;
	register char	*fromp;
	register char	c;

	if (fgetss(buffer, sizeof text, fd) == NULL) {
		*buffer = EOS;
		return(TRUE);
	}
	if (fd == stdin)
		fputss(buffer, tempfd);
	if (bflag || iflag) {
		top = buffer;
		fromp = buffer;
		while ((c = *fromp++) != EOS) {
			if (bflag && (c == ' ' || c == '\t')) {
				c = ' ';
				while (*fromp == ' ' || *fromp == '\t')
					fromp++;
			}
			if (iflag)
				c = tolower(c);
			*top++ = c;
		}
		if (bflag && top[-1] == ' ')
			top--;
		*top = EOS;
	}
	return(FALSE);
}
static unsigned short crc16a[] = {
	0000000,	0140301,	0140601,	0000500,
	0141401,	0001700,	0001200,	0141101,
	0143001,	0003300,	0003600,	0143501,
	0002400,	0142701,	0142201,	0002100,	
};
static unsigned short crc16b[] = {
	0000000,	0146001,	0154001,	0012000,
	0170001,	0036000,	0024000,	0162001,
	0120001,	0066000,	0074000,	0132001,
	0050000,	0116001,	0104001,	0043000,
};

unsigned short
hash(buffer)
char		*buffer;
/*
 * Return the CRC16 hash code for the buffer
 * Algorithm from Stu Wecker (Digital memo 130-959-002-00).
 */
{
	register unsigned short	crc;
	register char		*tp;
	register short 		temp;

	crc = 0;
	for (tp = buffer; *tp != EOS;) {
		temp = *tp++ ^ crc;	/* XOR crc with new char	*/
		crc = (crc >> 8)
			^ crc16a[(temp & 0017)]
			^ crc16b[(temp & 0360) >> 4];
	}
#ifdef	DEBUG_ALL
	printf("%06o: %s\n", (crc == 0) ? 1 : crc, buffer);
#endif
	return((crc == 0) ? 1 : crc);
}		
opendir(which, arg, okfd)
int		which;		/* Which file to open (0 or 1)		*/
char		**arg;		/* File name argument, &argv[which]	*/
FILE		*okfd;		/* File name (already open)		*/
{
	register char		*tp;
	register int		c;
	register char		*newname;

	fgetname(okfd, text);
	/*
	 * Skip over device name
	 */
	for (tp = text; (c = *tp) && c != ':'; tp++);
	if (c)	tp++;
	else	tp = text;
	/*
	 * Skip over [UIC] or [PPN] if present
	 */
	if (*tp == '[' || *tp == '(') {
		while ((c = *tp++) && c != ']' && c != ')');
		if (c == 0) {
			fprintf(stderr, "?Bug: bad file name \"%s\"\n",
					text);
			tp--;
		}
	}
	strcpy(text, tp);
	/*
	 * Don't include version
	 */
	for (tp = text; (c = *tp) && c != ';'; tp++);
	*tp = 0;
	/*
	 * Now, text has the file name, tp - text, its length,
	 * and *arg the (possible) directory name.  Create a new
	 * file name for opening.
	 */
	if ((newname = malloc(tp - text + strlen(*arg) + 1)) == NULL)
		error("Out of space at start");
	concat(newname, *arg, text, NULL);
	if ((infd[which] = fopen(newname, "r")) == NULL)
		cant(*arg, "constructed input", 1);
	else
		*arg = newname;
}
char *
myalloc(amount, why)
int		amount;
char		*why;
/*
 * Allocate or crash.
 */
{
	register char	*pointer;

	if ((pointer = malloc((unsigned) amount)) == NULL)
		noroom(why);
	return (pointer);
}

char *
compact(pointer, new_amount, why)
char		*pointer;
int		new_amount;
char		*why;
/*
 * Reallocate pointer, compacting storage
 */
{
	register	*new_pointer;
	extern char	*realloc();

	/*
	 * This routine is heavily dependent on C storage allocator hacks
	 */
#ifndef	vms
	free(pointer);			/* Do not change		*/
	free(free_space);		/* The order of this code.	*/
	free_space = malloc(1);		/* This code doesn't work on	*/
#endif
	if ((new_pointer =		/* Vax-11 C			*/
			realloc(pointer, (unsigned) new_amount)) == NULL)
		noroom(why);
#ifdef	DEBUG
	if (new_pointer != pointer) {
		fprintf(stderr, "moved from %06o to %06o\n",
			pointer, new_pointer);
	}
/*	rdump(new_pointer, why);
*/
#endif
	return (new_pointer);
}

noroom(why)
char		*why;
{
	fprintf(stderr, "?DIFF-F-out of room when %s\n", why);
	exit(IO_ERROR);
}

#ifdef	DEBUG
rdump(pointer, why)
int		*pointer;
char		*why;
/*
 * Dump memory block
 */
{
	int	*last;
	int	count;

	last = (int **)pointer[-1];
	fprintf(stderr, "dump %s of %06o -> %06o, %d words",
			why, pointer, last, last - pointer);
	last = (int *)(((int) last) & ~1);
	for (count = 0; pointer < last; ++count) {
		if ((count & 07) == 0) {
			fprintf(stderr, "\n%06o", pointer);
		}
		fprintf(stderr, "\t%06o", *pointer);
		pointer++;
	}
	fprintf(stderr, "\n");
}
#endif
cant(filename, what, fatalflag)
char		*filename;
char		*what;
int		fatalflag;
/*
 * Can't open file message
 */
{
	fprintf(stderr, "Can't open %s file \"%s\"\n", what, filename);
	if (fatalflag)
		error("Can't continue");
}
#ifdef	DEBUG
dump(d_linep, d_len, d_which)
LINE	*d_linep;
{
	register int i;
	
	printf("Dump of file%c, %d elements\n", "AB"[d_which], d_len);
	printf("linep @ %06o\n", d_linep);
	for (i = 0; i <= d_len; i++) {
		printf("%3d  %6d  %06o\n", i,
				d_linep[i].serial, d_linep[i].hash);
	}
}

dumpklist(kmax, why)
int	kmax;
char	*why;
/*
 * Dump klist
 */
{
	register int		i;
	register CANDIDATE	*cp;
	register int		count;

	printf("\nklist[0..%d] %s, clength = %d\n", kmax, why, clength);
	for (i = 0; i <= kmax; i++) {
		cp = &clist[klist[i]];
		printf("%2d %2d", i, klist[i]);
		if (cp >= &clist[0] && cp < &clist[clength])
			printf(" (%2d %2d -> %2d)\n", cp->a, cp->b, cp->link);
		else if (klist[i] == -1)
			printf(" End of chain\n");
		else	printf(" illegal klist element\n");
	}
	for (i = 0; i <= kmax; i++) {
		count = -1;
		for (cp = klist[i]; cp > &clist[0]; cp = &cp->link) {
			if (++count >= 6) {
				printf("\n    ");
				count = 0;
			}
			printf(" (%2d: %2d,%2d -> %d)",
				cp-clist, cp->a, cp->b, cp->link);
		}
		printf("\n");
	}
	printf("*\n");
}
#endif
#ifdef	TIMING
ptime(why)
char		*why;
/*
 * Dump time buffer
 */
{
	long	ttemp;

	ttemp = time(NULL);
	printf("%ld seconds for %s\n",
		ttemp - sectiontime, why);
	sectiontime = ttemp;
}
#endif
                                                                                                                                                                                                                         