{$I+,Y-}
program trigram;

{
  Computer Detection of Typographical Errors
		Trigram Statistics

  		Author : C. E. Chew
  		Date   : April 1985
}



const
  wordsize = 20;			{maximum size of words}
  wordsperline = 3;

  trigra = "TRIGRA";
  version = 'TRIGRA  2.0  April 1985';

  debug = false;

  nul = chr(0);
  hyphen = '-';
  apostrophe = '''';
  space = ' ';

  disize = 541;
  trisize = 2741;

  radix = 28;				{character compression radix}
  radixc0 = '`';			{dummy character}

  tfudge = 20257;
  dfudge = 20259;

  inpx = 2;
  outx = 3;



type
  relation = (lt, le, eq, ge, gt, ne);
  word = array [1..wordsize+1] of char;	{allow room for delimiters and nul}
  string = word;

  xgram = integer;

  gramrec = record
              gram : xgram;
              occu : integer
            end;

  freqptr = @freqrec;
  freqrec = record
              gram : array [0..maxint] of gramrec
            end;



var
  digram, trigram : freqptr;
  dsize, tsize : integer;
  dthreshold, tthreshold :integer;
  dlongest, dnumber, dtotal, tlongest, tnumber, ttotal : integer;



procedure rerun; external;

procedure fatal(s,m:string); external;



function initialise:boolean;

var
  i : integer;
  ch : char;

begin
  dlongest := 0;
  dnumber := 0;
  dtotal := 0;
  tlongest := 0;
  tnumber := 0;
  ttotal := 0;
  initialise := false;
  new(digram, disize);
  new(trigram, trisize);
  with digram@ do
    for i := 0 to disize-1 do
      gram[i].occu := 0;
  with trigram@ do
    for i := 0 to trisize do
      gram[i].occu := 0;

  if argc = 4 then
    initialise := true
end;



procedure insertgram(g : xgram; var t : freqptr; s, fudge : integer;
                     var longest, number, total : integer);
var
  h, q, r, k : integer;

begin
  h := g * fudge;
  if h < 0 then
    h := h - 32768;
  q := h div s;
  if q = 0 then
    q := 1;
  r := h mod s;
  k := 0;
  repeat
    k := k + 1;
    with t@.gram[r] do begin
      if occu = 0 then begin
	gram := g;
	occu := 1;
	longest := max(longest, k);
	number := number + 1;
	total := total + k;
	k := 0
      end
      else if gram = g then begin
	occu := occu + 1;
	k := 0
      end
      else begin
	r := r - q;
	if r < 0 then
	  r := r + s
      end
    end
  until (k = 0) or (k > s);
  if k <> 0 then
    fatal(trigra, "Hash table overflow")
end;



procedure scaninput;

var
  inp : text;
  di, tri, i, c, cl : integer;
  p : freqptr;
  w : word;


procedure getword(var w : word);

var
  ch : char;
  i : integer;


procedure getchar(var ch : char);

begin
  if eof(inp) then
    ch := nul
  else begin
    read(inp, ch);
    if (ch >= 'A') and (ch <= 'Z') then
      ch := chr(ord(ch) + (ord('a')-ord('A')))
    else if ch < space then
      ch := space
  end
end;


begin
  repeat
    getchar(ch)
  until ((ch >= 'a') and (ch <= 'z')) or (ch = nul);

  i := 1;
  while (ch >= 'a') and (ch <= '{') do begin
    w[i] := ch;
    i := i + 1;
    if i > wordsize then
      ch := nul
    else begin
      getchar(ch);
      if ch = apostrophe then
	ch := '{'
      else if ch = hyphen then
	repeat
	  getchar(ch)
	until ch <> space
    end
  end;
  w[i] := nul
end;


begin
  reset(inp, argv[inpx]@, "RNO");

  getword(w);
  while w[1] <> nul do begin
    c := ord(w[1]) - ord(radixc0);
    cl := 0;
    i := 2;
    repeat
      di := cl * radix + c;
      cl := c;
      c := max(ord(w[i]) - ord(radixc0), 0);
      tri := di * radix + c;
      insertgram(di, digram, disize, dfudge, dlongest, dnumber, dtotal);
      insertgram(tri, trigram, trisize, tfudge, tlongest, tnumber, ttotal);
      i := i + 1
    until c = 0;
    insertgram(cl * radix, digram, disize, dfudge, dlongest, dnumber, dtotal);
    getword(w)
  end
end;



procedure sorttables;

procedure compress(t : freqptr; m, l : integer; var s : integer);

var
  i : integer;

begin
  s := -1;
  for i := 0 to m do begin
    if t@.gram[i].occu > l then begin
      s := s + 1;
      if s <> i then
	t@.gram[s] := t@.gram[i]
    end
  end
end;

procedure sort(t : freqptr; s : integer);

procedure sorting(l, r : integer);

var
  xo : integer;
  tmp : gramrec;
  i, j : integer;

begin
  i := l;
  j := r;
  xo := t@.gram[(i+j) div 2].occu;
  repeat
    while t@.gram[i].occu > xo do
      i := i + 1;
    while t@.gram[j].occu < xo do
      j := j - 1;
    if i < j then begin
      tmp := t@.gram[j];
      t@.gram[j] := t@.gram[i];
      t@.gram[i] := tmp
    end;
    if i <= j then begin
      i := i + 1;
      j := j - 1
    end
  until i > j;
  if l < j then
    sorting(l, j);
  if i < r then
    sorting(i, r)
end;

begin {sort}
  sorting(0, s)
end;

begin {sorttables}
  compress(digram, disize-1, dthreshold, dsize);
  compress(trigram, trisize-1, tthreshold, tsize);
  sort(digram, dsize);
  sort(trigram, tsize)
end;



procedure dumptables;

var
  grm : file of gramrec;

procedure dumping(t : freqptr; s : integer);

var
  i : integer;
  a, b, c : integer;

begin
  for i := 0 to s do begin
    grm@ := t@.gram[i];
    put(grm);
    if debug then begin
      a := t@.gram[i].gram;
      c := a mod radix;
      a := a div radix;
      b := a mod radix;
      a := a div radix;
      writeln(chr(a+ord(radixc0)),
              chr(b+ord(radixc0)),
              chr(c+ord(radixc0)),t@.gram[i].occu:8)
    end
  end
end;

begin
  rewrite(grm, argv[outx]@, "GRM");
  if debug then
    writeln('Trigrams');
  dumping(trigram, tsize);
  grm@.gram := 0;
  grm@.occu := 0;
  put(grm);
  if debug then
    writeln('Digrams');
  dumping(digram, dsize);
  grm@.gram := 0;
  grm@.occu := 0;
  put(grm)
end;


    
begin
  if not initialise then begin
    writeln(version);
    writeln;
    writeln('$-option input output');
    writeln;
    rerun
  end
  else begin
    write('Digram Threshold : ');
    readln(dthreshold);
    write('Trigram Threshold : ');
    readln(tthreshold);
    writeln('Scanning Document');
    scaninput;
    writeln('Sorting Tables');
    sorttables;
    writeln('Dumping Tables');
    dumptables;
    writeln('Statistics');
    writeln(' Digram Longest Branch : ', dlongest);
    writeln(' Digram Number of Keys : ', dnumber);
    writeln(' Digram Average Length : ', float(dtotal)/float(dnumber):8:2);
    writeln(' Trigram Longest Branch : ', tlongest);
    writeln(' Trigram Number of Keys : ', tnumber);
    writeln(' Trigram Average Length : ', float(ttotal)/float(tnumber):8:2);
    writeln;
    writeln(' Digram Entries     : ', dsize+1);
    writeln(' Trigram Entries    : ', tsize+1)
  end
end.
                                                                                                                                                                                                                                                                                                                                                       