/*
 *		l z c m p 2 . c
 *
 * Actually do compression.
 */

#include	"lz.h"

static int	offset;
long int	in_count = 1;		/* length of input		*/
long int	bytes_out;		/* length of compressed output	*/
long int	out_count = 0;		/* # of codes output (debug)	*/
flag		clear_flg = FALSE;
/*
 * These global parameters are written to the compressed file.
 * The decompressor needs them.
 */
extern short	maxbits;		/* settable max # bits/code	*/
extern short	block_compress;
extern code_int	maxmaxcode;

/*
 * Set by the mainline code and/or static.
 */

/*
 * Local variables to the compressor.
 */
code_int	maxcode;
int		n_bits;
double		ratio;
count_int	checkpoint = CHECK_GAP;

#ifdef decus
/*
 * dsect is a Decus C specific command that tells the linker/task-builder
 * to put the following data in a specific program section.  It is
 * not necessary, but lets me see how much space various HSIZE etc.
 * options require.
 */
dsect "xxhtab";
#endif
count_int	htab[HSIZE];
U_short		codetab[HSIZE];
extern code_int	hsize;
code_int	free_ent = 0;
flag		using_fast;		/* Not using hashing		*/
#ifdef DEBUG
code_int	largecode = 0;
int		largebits = 0;
int		clearcount = 0;
#endif

#if IS_CACHE
/*
 * Define parameters for the "character hog" check.  This optimizes
 * hash table lookup for the "most popular" character.  This improves
 * performance for raster images at a modest increase in memory.
 * (which real small machines can't afford).
 */
#ifdef decus
dsect "xxcach";
#endif

#define HOG_CHECK ((count_int) 2000)	/* Number to read before check	*/
#define MAX_CACHE ((count_int) 1<<BITS) /* Next line is this value too	*/
U_short		hashcache[1<<BITS];	/* common hash short circuit cache */
count_int	cfreq[256];		/* character counts		*/

#ifndef vax
char		chog;			/* Most common char from input	*/
#define CHOG	' '			/* Assume space is commonest	*/
#else 
int		chog;			/* char arith is slow on VAX	*/
#define CHOG	(int) ' '		/* Assume space is commonest	*/
#endif
flag		cstat_flg = FALSE;	/* on after determining char hog */

#endif

#if USERMEM > 0
/*
 * Machines with LOTS of memory don't bother to hash the codes.
 */
short		ftable[ (1 << FBITS) * 256];
count_int	fcodemem[1 << FBITS];
extern long	fsize;			/* File size			*/
#endif

/*
 * compress stdin to stdout
 *
 * Algorithm:  on large machines, for maxbits <= FBITS, use fast direct table
 * lookup on the prefix code / next character combination.  For smaller code
 * size, use open addressing modular division double hashing (no chaining), ala
 * Knuth vol. 3, sec. 6.4 Algorithm D, along with G. Knott's relatively-prime
 * secondary probe.  Do block compression with an adaptive reset, whereby the
 * code table is cleared when the compression ratio decreases, but after the
 * table fills.  The variable-length output codes are re-sized at this point,
 * and a special CLEAR code is generated for the decompressor.  For the
 * megamemory version, the sparse array is cleared indirectly through a
 * "shadow" output code history.  Late additions: for the hashing code,
 * construct the table according to file size for noticeable speed improvement
 * on small files.  Also detect and cache codes associated with the most
 * common character to bypass hash calculation on these codes (a characteristic
 * of highly-compressable raster images).  Please direct questions about this
 * implementation to ames!jaw.
 */

int
compress()
/*
 * Compression mainline.  TRUE if successful.
 */
{
	register long		fcode;
	register code_int	i;
	register int		c;		/* Current input char	*/
	register code_int	ent;
	register int		disp;
	register code_int	hsize_reg;
	code_int		initial_probe;	/* Hash full tester	*/

	i = 0;
	offset = 0;
	bytes_out = 0;
	out_count = 0;
	clear_flg = FALSE;
	using_fast = FALSE;	/* Not using direct addressing		*/
	ratio = 0.0;
	in_count = 1;
	checkpoint = CHECK_GAP;
	n_bits = INIT_BITS;
	maxcode = MAXCODE(n_bits);
	free_ent = ((block_compress) ? FIRST : 256);
#ifdef DEBUG
	largecode = free_ent;
	largebits = n_bits;
	clearcount = 0;
#endif
	ent = GET();

#if USERMEM > 0
	/*
	 * Maybe use direct addressing for big files and huge systems.
	 */
	if (maxbits <= FBITS && (fsize >= 30000)) {
	    using_fast = TRUE;
	    while ((c = GET()) != EOF) {
		in_count++;
		fcode = (long) (((long) c << maxbits) + ent);
		/*
		 * test for code in "string" table
		 */
		if (ftable[fcode] != 0)
		    ent = ftable[fcode];
		else {
		    output ((code_int) ent);
		    out_count++;
		    ent = c;
		    if (free_ent >= maxmaxcode) {	
			if ((count_int) in_count < checkpoint
			 || block_compress == 0) 
			    continue;
			else {
			    clear();
			    i = 0;
			}
		    }
		    else {
			/*
			 * put code in table and
			 * memorize for block compression.
			 */
			ftable[fcode] = (short) free_ent++;
			fcodemem[i++] = fcode;
#ifdef DEBUG
			if (free_ent > largecode)
			    largecode = free_ent;
#endif
		    }
		}
	    }
	    goto finish;
	}
	/*
	 * Strictly speaking, the following is an "else" clause.
	 */
#endif
#if IS_CACHE
	chog = CHOG;		/* assumed character for the hog	*/
	cstat_flg = FALSE;
#endif
	hsize_reg = hsize;
	cl_hash(hsize_reg);		/* clear hash tables		*/
	while ((c = GET()) != (unsigned) EOF) {
	    in_count++;
#if IS_CACHE
	    if (!cstat_flg) {
		cfreq[c]++; 	/* gather frequencies at start of input	*/
		if ((count_int) in_count > HOG_CHECK) {
		    cstat_flg = TRUE;
		    chog = hogtally();	/* compute char hog		*/
		    if (chog != CHOG) 	/* fixup for wrong assumption	*/
			creset((count_int) free_ent);
		}
	    }
	    if (c == chog) {
		if ((i = hashcache[ent])) {	/* cache -> code	*/
		    ent = i;
		    continue;
		}
	    }
#endif
	    fcode = (long) (((long) c << maxbits) + ent);
#ifdef SHORT_INT
	    /*
	     * Avoid long modulus
	     */
	    i = (((c + 12347) * ent) & 077777) % HSIZE;
#else
	    i = fcode % hsize_reg;	/* division hashing		*/
#endif
	    if (htab[i] == fcode) {
		ent = codetab[i];
		continue;
	    }
	    else if ((long) htab[i] < 0) /* empty slot			*/
		goto nomatch;
	    disp = hsize_reg - i;	/* secondary hash (G. Knott)	*/
	    if (i == 0)
		disp = 1;
	    initial_probe = i;		/* Remember for full test	*/
probe:
	    if ((i -= disp) < 0)
		i += hsize_reg;
	    if (htab[i] == fcode) {
		ent = codetab[i];
		continue;
	    }
	    if ((long) htab[i] > 0)  {
		if (i == initial_probe) {
		    fprintf(stderr, "Hash table full, get help\n");
		    abort();
		}
		goto probe;
	    }
nomatch:
	    output((code_int) ent);
	    out_count++;
#ifdef interdata
	    if ((unsigned) free_ent < (unsigned) maxmaxcode) {
#else
	    if (free_ent < maxmaxcode) {
#endif
#if IS_CACHE
		if (c == chog)			/* code -> cache	*/
		    hashcache[ent] = free_ent;
#endif
		codetab[i] = free_ent++;	/* code -> hashtable	*/
#ifdef DEBUG
		if (free_ent > largecode)
		    largecode = free_ent;
#endif
		htab[i] = fcode;
	    }
	    else if ((count_int) in_count >= checkpoint
		  && block_compress != 0) {
		clear();
	    }
	    ent = c;
	}
finish:
	/*
	 * Put out the final code.
	 */
	output((code_int) ent);
	out_count++;
	if (block_compress != 0) {
	    /*
	     * On RT11 and similar operating systems, the last
	     * physical data block is null-filled to some defined
	     * size.  Code in lzdcm2.c looks for two CLEAR's in
	     * a row and exits if seen.  The -1 following may also
	     * be an exit code.
	     */
	    sendclear();
	    sendclear();
	}
	output((code_int) -1);			/* EOF signal		*/
	/*
	 * Print out stats on stderr
	 */
	if (!quiet) {
#ifdef DEBUG
	    fprintf(stderr, "%ld chars in, %ld codes (%ld bytes) out, ",
		in_count, out_count, bytes_out);
	    divout("compression ratio", in_count, bytes_out, "\n");
	    divout(" Compression as in compact",
		(in_count - bytes_out) * 100, in_count, "%\n");
	    fprintf(stderr, " Largest code was %d (%d bits),",
		largecode - 1, largebits);
	    fprintf(stderr, " %d clear code%s sent.\n",
		clearcount, (clearcount == 1) ? "" : "s");
#else
	    divout("Compression",
		(in_count - bytes_out) * 100, in_count, "%\n");
#endif
	}
	if (bytes_out > in_count) {
	    fprintf(stderr,
		"Note: output file (%ld bytes) > input (%ld bytes)\n",
		bytes_out, in_count);
	    return (FALSE);		/* Tell the caller		*/
	}
	return (TRUE);
}

static
divout(leader, numer, denom, trailer)
char		*leader;
long		numer;
long		denom;
char		*trailer;
/*
 * Print numer / denom without loading floating printout routines.
 */
{
	numer = (numer * 100) / denom;
	fprintf(stderr, "%s: %ld.%02ld%s",
	    leader, numer / 100, numer % 100, trailer);
}

/*
 * Output a given code.
 */
static char_type buf[BITS];

#if !vax_asm
char_type lmask[9] = {0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80, 0x00};
char_type rmask[9] = {0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF};
#endif

#ifdef DEBUG
static int	col = 0;
#endif

output(code)
code_int  code;
/*
 * Output the given code.
 * Inputs:
 * 	code:	A n_bits-bit integer.  If == -1, then EOF.  This assumes
 *		that n_bits =< (long)wordsize - 1.
 * Outputs:
 * 	Outputs code to the file.
 * Assumptions:
 *	Chars are 8 bits long.
 * Algorithm:
 * 	Maintain a BITS character long buffer (so that 8 codes will
 * fit in it exactly).  Use the VAX insv instruction to insert each
 * code in turn.  When the buffer fills up empty it and start over.
 */
{
    /*
     * On the VAX (Unix), it is important to have the register declarations
     * in exactly the order given, or the asm will break.
     */
    register int	r_off, bits;
    register char_type	*bp;

	r_off = offset;
	bits = n_bits;
	bp = buf;

	if (code >= 0) {
	    /*
	     * Not at EOF, add the code
	     */
#ifdef DEBUG
	    if (debug > 1) {
		fprintf(stderr, "%5d%c", code,
		    (col += 6) >= 74 ? (col = 0, '\n') : ' ');
	    }
#endif
#if vax_asm
	    /*
	     * VAX DEPENDENT!! Implementation on other machines may be
	     * difficult.
	     *
	     * Translation: Insert BITS bits from the argument starting at
	     * offset bits from the beginning of buf.
	     */
	    0;					/* C compiler bug ??	*/
	    asm("insv	4(ap),r11,r10,(r9)");
#else
	    /*
	     * WARNING: byte/bit numbering on the vax is simulated
	     * by the following code
	     *
	     * Get to the first byte.
	     */
	    bp += (r_off >> 3);
	    r_off &= 7;
	    /*
	     * Since code is always >= 8 bits, only need to mask the first
	     * hunk on the left.
	     */
	    *bp = (*bp & rmask[r_off]) | (code << r_off) & lmask[r_off];
	    bp++;
	    bits -= (8 - r_off);
	    code >>= 8 - r_off;
	    /*
	     * Get any 8 bit parts in the middle ( <= 1 for up to 16 bits).
	     */
	    if (bits >= 8) {
		*bp++ = code;
		code >>= 8;
		bits -= 8;
	    }
	    if (bits != 0)				/* Last bits. */
		*bp = code;
#endif
	    offset += n_bits;
	    if (offset == (n_bits << 3)) {
		bytes_out += n_bits;
		PUTBUF(buf, n_bits);
		offset = 0;
	    }
	    /*
	     * If the next entry is going to be too big for the code size,
	     * then increase it, if possible.
	     */
	    if (free_ent > maxcode || clear_flg) {
		/*
		 * Write the whole buffer, because the input side won't
		 * discover the size increase until after it has read it.
		 */
		if (offset > 0) {
		    PUTBUF(buf, n_bits);
		    bytes_out += n_bits;
		}
		offset = 0;
		if (clear_flg) {
		    n_bits = INIT_BITS;
		    maxcode = MAXCODE(n_bits);
		    clear_flg = FALSE;
		}
		else {
		    n_bits++;
		    if (n_bits == maxbits)
			maxcode = maxmaxcode;
		    else
			maxcode = MAXCODE(n_bits);
#ifdef DEBUG
		    if (n_bits > largebits)
			largebits = n_bits;
#endif
		}
#ifdef DEBUG
		if (debug != 0) {
		    fprintf(stderr, "\nChange to %d bits\n", n_bits);
		    col = 0;
		}
#endif
	    }
	}
	else {
	    /*
	     * At EOF, write the rest of the buffer.
	     */
	    if (offset > 0) {
		PUTBUF(buf, ((offset + 7) / 8));
		bytes_out += (offset + 7) / 8;
	    }
	    offset = 0;
	    LZ_FLUSH();			/* Flush output buffer		*/
#ifdef DEBUG
	    if (debug > 1)
		fprintf(stderr, "\n");
#endif
	}
}

clear()
/*
 * Check the compression ration to see whether it is going up
 * or staying the same.  If it is going down, the internal
 * statistics of the file have changed, so clear out our
 * tables and start over.  Inform the decompressor of the
 * change by sending out a CLEAR code.
 */
{
	register code_int	i;
	register count_int	*p, *endp;
	register U_short	*q;

#ifdef DEBUG
	if (debug != 0) {
    	    fprintf (stderr, "clear: count: %ld", in_count);
	    divout(" ratio", in_count, bytes_out, "\n");
	}
#endif
	checkpoint = in_count + CHECK_GAP;
	if ((double) in_count / (double) bytes_out > ratio)
	    ratio = (double) in_count / (double) bytes_out;
	else {
	    /*
	     * Compression ratio has gone down, start again.
	     * The most likely reason for this is that the
	     * characteristics of the file have changed.
	     */
	    ratio = 0.0;
#if USERMEM > 0
	    if (using_fast) { 			/* sparse array clear	*/
		for (i = (1 << maxbits) - 1; i >= 0; i--)
		    ftable[fcodemem[i]] = 0;	/* indirect thru "shadow" */
	    }
	    else 
#endif
	    {
		/*
		 * Clear hash table
		 */
		endp = &htab[hsize];
		for (p = &htab[0], q = &codetab[0]; p < endp;) {
		    *p++ = -1;
		    *q++ = 0;
		}
#if IS_CACHE
		creset(MAX_CACHE);
#endif
	    }
	    sendclear();
#ifdef DEBUG
	    clearcount++;
#endif
	}
}

sendclear()
/*
 * Put a clear code into the output stream.
 */
{
	free_ent = FIRST;
	clear_flg = TRUE;
	output((code_int) CLEAR);
	out_count++;
}

#if IS_CACHE
static flag nfiles = FALSE;

creset(n)
register count_int n;		/* clear at least this many entries	*/
/*
 * clear hash cache
 */
{
	register count_int i;
	register U_short *hash_p;
	register U_short zero;

	if (!nfiles) {		/* No clear needed if first time	*/
	    nfiles = TRUE;
	    return;
	}
	zero = 0;
	n = (n + 15) & (~15);		/* Was & (-16)			*/
	hash_p = hashcache + n;
	for (i = n; i > 0; i -=16) {
	    *(hash_p-16) = zero;
	    *(hash_p-15) = zero;
	    *(hash_p-14) = zero;
	    *(hash_p-13) = zero;
	    *(hash_p-12) = zero;
	    *(hash_p-11) = zero;
	    *(hash_p-10) = zero;
	    *(hash_p-9) = zero;
	    *(hash_p-8) = zero;
	    *(hash_p-7) = zero;
	    *(hash_p-6) = zero;
	    *(hash_p-5) = zero;
	    *(hash_p-4) = zero;
	    *(hash_p-3) = zero;
	    *(hash_p-2) = zero;
	    *(hash_p-1) = zero;
	    hash_p -= 16;
	}
}

hogtally()	/* compute character code hog */
{
	register int i, most;

	for (i = most = 0; i < 256; i++) {
	    if (cfreq[i] >= cfreq[most])
		most = i;
	}
	return (most);
}
#endif

cl_hash(h_size)
register code_int h_size;
{
	register count_int *htab_p;
	register int i;
	register long m1;

	htab_p = htab + h_size;
	m1 = -1;
#if IS_CACHE
	/* clear hashcache */
#define	min(a,b)	((a > b) ? b : a)
	creset(min((count_int) h_size, MAX_CACHE));
#endif
	i = h_size - 16;
	do {
	    *(htab_p-16) = m1;
	    *(htab_p-15) = m1;
	    *(htab_p-14) = m1;
	    *(htab_p-13) = m1;
	    *(htab_p-12) = m1;
	    *(htab_p-11) = m1;
	    *(htab_p-10) = m1;
	    *(htab_p-9) = m1;
	    *(htab_p-8) = m1;
	    *(htab_p-7) = m1;
	    *(htab_p-6) = m1;
	    *(htab_p-5) = m1;
	    *(htab_p-4) = m1;
	    *(htab_p-3) = m1;
	    *(htab_p-2) = m1;
	    *(htab_p-1) = m1;
	    htab_p -= 16;
	} while ((i -= 16) >= 0);
    	for (i += 16; i > 0; i--)
	    *--htab_p = m1;
}
