/*
 * Kwik index generator.
 */

/*)BUILD	$(FILES)	= { kwik sorts }
		$(STACK)	= 4000
		$(TKBOPTIONS)	= {
			TASK = ...KWI
		}
*/

#ifdef	DOCUMENTATION

title	kwik	Keyword in Context Index
index		Keyword in Context Index

synopsis

	kwik [options] [file ...]

description

	Kwik constructs a keyword in context (kwik) index using the data in
	the named files, writing the resulting index to the standard output.
	The standard input is read if no files are specified;
	kwik may be used as
	a filter.  The following options are defined:
	.lm +16
	.s.i -16;-s		The kwik index normally excludes common
	(stop) words. Specifing the '-s' option empties the stop list,
	thus including the following words:
	.s.nf
		a	by	in	to
		an	for	of	the
		and	from	on	with
	.s.f
	.s.i -16;-r		Make the index in reverse alphabetic order.
	.s.i -16;-t offset	This is used to build index tables.  The input
	is entered in the following format:
	.s
		name<TAB>index text
	.s
	The kwik index will be output with the name in the
	left hand column.  The kwik'ed
	text then follows.  The '-t' option takes a mandatory argument:
	the column at which the first byte of the kwik'ed text
	should be placed.  For example, the index for the Decus library
	documentation was produced by the following command:
	.s.nf
	    kwik -t 16 -w 64 <infile >outfile
	.f
	.s.i -16;-x file__name	The named file contains a user-specified
	exclusion (stopword) list.  The '-x' option may be repeated if
	multiple exclusion lists are needed. Note that the order of the
	'-s' and '-x' options is important:
	.lm +4
	.s.i-4;kwik -x file
	.s
	The file contains an exclusion list, one word per line.  Append
	the contents of the file to the default list.
	.s.i -4;kwik -s -x file
	.s
	Replace the default stoplist by the contents of the named file.
	.s.i -4;kwik -x file -s
	.s
	After reading the exclusion file, the entire stop list
	is erased.  (This is not a useful procedure.)
	.lm -4
	.s.i -16;-w width	The output line width is normally 80
	characters. The '-w' option changes it to a user-specified value.
	.lm -16

diagnostics

	.lm +8
	.s.i -8;Usage ...
	.s
	Illegal options cause an extensive "help" message to be printed.
	.s.i -8;Bad (width | offset) ...
	.s
	An illegal or out-of-range number was given as a parameter to a '-w'
	or '-t' option.
	.s.i -8;Can't open exclude file
	.s.i -8;Illegal exclusion
	.s
	An exclude file text line began with a non-printing character.
	The line is ignored.
	.s.i -8;Out of space in saveexclude
	.s
	The program ran out of main memory when building the stop list.
	.s.i -8;No index for ...
	.s
	Kwik was invoked in '-t' mode.  Unfortunately, the indicated line
	was not in the format "name#<TAB>#text".  The line is ignored.
	.s.i -8;Cannot create temp file
	.s
	The sorts subroutine could not create its file.
	.s.i -8;Out of space
	.s
	The sorts subroutine filled the available disk space.
	.lm -8

author

	David Conroy, Martin Minow

implementation

	Kwik is linked together with sorts.obj.
	.s
	The program must be linked with a large enough
	stack space so that the sorts routine can operate correctly.  The
	following seems to work correctly:
	.s.nf
	    tkb
	    kwik=kwik,sorts,c:c/lb
	    /
	    stack = 2048
	    //
	.f

bugs
	If an index line is too long to fit (in the kwik'ed part),
	the program overlays text on the line, logging the break showing
	where the beginning of the line was.

#endif

#include <stdio.h>

#define	FOLD	'\t'
#define	NBUF	128
#define	FALSE	0
#define	TRUE	1
#define	EOS	0
extern	int	sort_r;			/* Reverse kwik			*/

static int	width	= 80;
static int	sflag	= 1;		/* Stop list (on)		*/
static char	*stopfile = "";		/* User's stop list		*/
static int	tflag	= 0;		/* Table format flag		*/
static int	offset	= 0;		/* Table format offset		*/
static char	inbuf[NBUF];
static char	outbuf[NBUF];
static char	tbuf[NBUF];

/*
 * This table contains '1' for bytes that may begin indexed words.
 * This may be defined as the regular expression "[0-9A-Za-z]"
 */
static char	ok[] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/*   0.. 15	*/
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/*  16.. 31	*/
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/*  32.. 47	*/
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,	/*  48.. 63 0-9	*/
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/*  64.. 79 A-O	*/
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,	/*  80.. 95 P-Z	*/
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/*  96..111 a-o	*/
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0	/* 112..127 p-z	*/
};

/*
 * Stop words are stored in a sorted bucket table.
 */

struct stoplist {
	struct stoplist	*next;
	char		*word;
};

/*
 * Hack for the Decus compiler
 */

static struct stoplist st_and	= { NULL,	"and" };
static struct stoplist st_an	= { &st_and,	"an" };
static struct stoplist st_a	= { &st_an,	"a" };
#define astop	st_a

static struct stoplist st_by	= { NULL,	"by" };
#define bstop	st_by

static struct stoplist st_from	= { NULL,	"from" };
static struct stoplist st_for	= { &st_from,	"for" };
#define fstop	st_for

static struct stoplist st_in	= { NULL,	"in" };
#define istop	st_in

static struct stoplist st_on	= { NULL,	"on" };
static struct stoplist st_of	= { &st_on,	"of" };
#define	ostop	st_of

static struct stoplist st_to	= { NULL,	"to" };
static struct stoplist st_the	= { &st_to,	"the" };
#define tstop	st_the

static struct stoplist st_with	= { NULL,	"with" };
#define wstop	st_with

#define NSTOP	128-' '

static struct stoplist *stoplist[NSTOP] = {
/*  32.. 39 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  40.. 47 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  48.. 55 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  56.. 63 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  64.. 71 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  72.. 79 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  80.. 87 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  88.. 95 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  96..103 */	NULL,	&astop,	&bstop,	NULL,	NULL,	NULL,	&fstop,	NULL,
/* 104..111 */	NULL,	&istop,	NULL,	NULL,	NULL,	NULL,	NULL,	&ostop,
/* 112..119 */	NULL,	NULL,	NULL,	NULL,	&tstop,	NULL,	NULL,	&wstop,
/* 120..127 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
};

main(argc, argv)
char *argv[];
{
	register char *cp;
	register c, i;
	FILE *fp;
	int nf;

	nf = argc-1;
	for (i=1; i<argc; ++i) {
		cp = argv[i];
		if (*cp == '-') {
			--nf;
			argv[i] = NULL;
			++cp;
			while ((c = *cp++) != EOS) {
				switch (tolower(c)) {

				case 'r':
					++sort_r;
					break;

				case 'w':
					if (++i >= argc)
						usage();
					width = getv(argv[i],2,NBUF,"width");
					--nf;
					argv[i] = NULL;
					break;

				case 's':
					sflag = 0;
					break;

				case 't':
					if (++i >= argc)
						usage();
					tflag++;
					offset = getv(argv[i],1,NBUF,"offset");
					--nf;
					argv[i] = NULL;
					break;

				case 'x':
					if (++i >= argc)
						usage();
					stopfile = argv[i];
					--nf;
					argv[i] = NULL;
					break;

				default:
					usage();
				}
			}
		}
	}
	getexclude();			/* Initialize stoplist		*/
	width -= offset;		/* Offset is for index		*/
	offset -= 1;			/* One byte follows the index	*/
	if (nf <= 0)
		rotate(stdin);
	else {
		for (i=1; i < argc; ++i) {
			if ((cp = argv[i]) == NULL)
				continue;
			if ((fp = freopen(cp, "r", stdin)) == NULL) {
				fprintf(stderr, 
					"kwik: %s: cannot open. Continuing\n",
					cp);
				continue;
			}
			rotate(fp);
		}
	}
	sorta(NULL);	
	unrotate();
}

usage()
{
	fprintf(stderr, "-KWIK-parameter error, usage is\n");
	fprintf(stderr, "\tkwik\t[-r]\t\tReverse kwik\n");
	fprintf(stderr, "\t\t[-s]\t\tUse standard stoplist\n");
	fprintf(stderr, "\t\t[-w width]\tOutput line width\n");
	fprintf(stderr, "\t\t[-t offset]\tInput in index format\n");
	fprintf(stderr, "\t\t[-x exlude]\tExclude these words from index\n");
	fprintf(stderr, "\t\t[file ...]\tFiles to be processed\n");
	error("?KWIK-E-Cannot proceed");
}

getv(arg, min, max, who)
char	*arg;		/* What to convert				*/
int	min;		/* Minimun acceptable value			*/
int	max;		/* Maximum acceptable value			*/
char	*who;		/* For error printout				*/
{
	register int	result;

	result = atoi(arg);
	if (result < min || result > max) {
		error("?KWIK-E-bad %s %d, minimum = %d, maximum = %d\n",
			who, result, min, max);
	}
	return(result);
}

getexclude()
/*
 * Store words to exclude
 */
{
	FILE	*fp;
	register struct stoplist **stopp;

	/*
	 * If -s wasn't given, erase current stoplist
	 */
	if (!sflag) {
		for (stopp = stoplist; stopp < &stoplist[NSTOP];)
			*stopp++ = NULL;
	}
	if (*stopfile == EOS)
		return;
	if ((fp = fopen(stopfile, "r")) == NULL)
		error("?KWIK-E-can't open exclude file \"%s\"\n", stopfile);
	while (fgetss(inbuf, sizeof inbuf, fp) != NULL) {
		saveexclude(inbuf);
	}
	fclose(fp);
}

saveexclude(what)
char		*what;		/* What to save				*/
{
	register char			*p;
	register struct stoplist	*stp;
	struct stoplist			**stopp;
	register int			c;
	struct stoplist			*newstop;

	/*
	 * Force line to lowercase
	 */
	for (p = what; (c = *p) != EOS;)
		*p++ = tolower(c);
	stopp = &stoplist[*what - ' '];
	if (stopp < &stoplist || stopp >= &stoplist[NSTOP])
		error("Illegal exclusion \"%s\"\n", what);
	if ((newstop = malloc(sizeof (struct stoplist))) == NULL
			|| (p = malloc(strlen(what) + 1)) == NULL)
		error("Out of space in saveexclude");
	strcpy((newstop->word = p), what);
	if ((stp = *stopp) == NULL) {
		*stopp = newstop;
		newstop->next = NULL;
	}
	else {
		while(strcmp(stp->word, what) <= 0 && stp->next != NULL)
			stp = stp->next;
		newstop->next = stp->next;
		stp->next = newstop;
	}
	
}

testexclude(what)
register char	*what;		/* Is it excluded?			*/
/*
 * Return true if <what> is in the exclude table.  Note: <what> is
 * guaranteed to be in lowercase and what[0] is guaranteed to be
 * a reasonable printing character.
 */
{
	register struct stoplist	*stp;
	register int			test;
	
	test = *what - ' ';
	if (test < 0 || test >= NSTOP)
		error("Bug: illegal testexclude \"%s\"\n", what);
	stp = stoplist[test];
	while (stp != NULL) {
		if ((test = strcmp(stp->word, what)) > 0)
			return(FALSE);
		else if (test == 0)
			return(TRUE);
		else stp = stp->next;
	}
	return(FALSE);
}

rotate(fp)
FILE *fp;
{
	register char	*p;
	register char	*tp;
	register char	*inp;

	while (fgetss(inbuf, sizeof inbuf, fp) != NULL) {
		inp = inbuf;
		/*
		 * If index mode, get the index entry
		 */
		if (tflag) {
			for (tp = tbuf; *inp != EOS && *inp != '\t';)
				*tp++ = *inp++;
			*tp = EOS;
			if (*inp == EOS)
				fprintf(stderr,
					"%KWIK-W-no index for \"%s\"\n",
					inbuf);
			else	inp++;			
		}
		/*
		 * Erase junk from the rest of the line
		 */
		for (p = inp; *p != EOS; p++) {
			if (*p < ' ')
				*p = ' ';
		}
		/*
		 * Skip to a word, output it, skip to the end of the word
		 */
		for (p = inp;;) {
			while (!ok[*p] && *p != EOS)
				p++;
			if (*p == EOS)
				break;
			stuff(p, inp);
			while (*p > ' ')
				p++;
		}
	}
}

stuff(fold_point, start)
char		*fold_point;	/* Where to rotate from			*/
char		*start;		/* Start of the text (if indexing)	*/
/*
 * Stuff this entry (assuming it isn't excluded)
 */
{
	register char	*p;
	register char	*bp;
	register c;

	p = fold_point;
	bp = outbuf;
	/*
	 * Get the sort argument, test against the exclusion buffer
	 */
	while ((c = *p++) > ' ')
		*bp++ = tolower(c);
	*bp = EOS;
	if (testexclude(outbuf))
		return;
	*bp++ = FOLD;
	/*
	 * Copy the input from the rotate point to the end of the line
	 */
	bp = cpystr(bp, fold_point);
	*bp++ = FOLD;
	/*
	 * Copy the rest of the input (from the start to the rotate point)
	 */
	p = start;
	while (p < fold_point)
		*bp++ = *p++;
	/*
	 * If indexing, append the index entry
	 */
	if (tflag) {
		*bp++ = FOLD;
		bp = cpystr(bp, tbuf);
	}
	*bp = EOS;
	sorta(outbuf);
}

unrotate()
{
	register char	*in;
	register char	*out;
	register	c;
	char		*start;
	char		*rest;
	char		*bufend;
	char		*middle;
	char		*next();

	bufend = &outbuf[width];
	middle = &outbuf[(width - 1) / 2];	/* Fold here		*/
	while (sorto(inbuf) != NULL) {
		for (in = inbuf; (c = *in++) != EOS && c != FOLD;);
		if (c == EOS)
			error("Bug: no first fold");
		/*
		 * Partition the text line
		 */
		start = in;			/* start -> after fold	*/
		while ((c = *in) != EOS && c != FOLD)
			in++;
		if (c == EOS)
			error("Bug: missing second fold");
		*in++ = EOS;			/* Terminate right side	*/
		rest = in;
		while ((c = *in) != EOS && c != FOLD)
			in++;
		/*
		 * Output the index
		 */
		if (tflag) {
			if (c == EOS)
				error("Bug: missing third fold");
			else {
				*in++ = EOS;	/* Terminate left side	*/
				printf("%-?s ", offset, in);	
			}
		}
		else {
			if (c != EOS)
				error("Bug: extra fold");
		}
		/*
		 * Partition the line.  At this point:
		 *	start ->	line after the fold
		 *	rest ->		line from start to fold
		 * Clear the line and stuff the text into it
		 */
		out = &outbuf;
		while (out < bufend)
			*out++ = ' ';
		/*
		 * Copy from "start" to the right half of the output buffer
		 * This algorithm was taken from the Lawerence Livermore
		 * tool kit.
		 */
		out = middle;
		for (in = start; (c = *in) != EOS; in++) {
			if (in > start && in[-1] == ' ') {
				if (next(1, start, in, out) >= bufend)
					out = outbuf;

			}
			if (out >= bufend)
				out = outbuf;
			*out++ = c;
		}
		/*
		 * Copy from the end of the text to the middle (backwards)
		 */
		out = middle;
		for (in = rest; *in != EOS; in++);
		while (--in >= rest) {
			out--;
			if (in[1] == ' ') {
				if (next(-1, rest, in, out) < outbuf)
					out = bufend - 1;
			}
			if (out < outbuf)
				out = bufend - 1;
			*out = *in;
		}
		/*
		 * Delete trailing blanks
		 */
		for (out = bufend; *--out == ' ' && *out >= outbuf;);
		out[1] = EOS;
		printf("%s\n", outbuf);
	}
}

char *
next(increment, edge, in, out)
int		increment;	/* Which direction (+1 | -1)		*/
char		*edge;		/* Lower limit for in			*/
register char	*in;		/* From pointer				*/
register char	*out;		/* Output pointer			*/
{
	register int	c;

	for (; in >= edge; in += increment) {
		if (*in == ' ' || *in == EOS)
			break;
		out += increment;
	}
	return(out);
}
