/*--- EDIT # 0496 5 Aug 1982 16:19:10 DB1:[21,5]SORT.C;20 */ /*--- PREVIOUS EDIT 17 Jun 1982 11:28:44 DB0:[310,61]SORT.C;4 */ # /* * * * The information in this document is subject to change * without notice and should not be construed as a commitment * by Digital Equipment Corporation or by DECUS. * * Neither Digital Equipment Corporation, DECUS, nor the authors * assume any responsibility for the use or reliability of this * document or the described software. * * Copyright (C) 1980, DECUS * * * General permission to copy or modify, but not for profit, is * hereby granted, provided that the above copyright notice is * included and reference made to the fact that reproduction * privileges were granted by DECUS. * */ /* * Sort utility. * * Updated 13-Mar-80 for the new RSX library. * Updated Jan 1982 by Ray Van Tassle. * internals heavily modified, and many new switches added. * The quicksort & merge vastly improved. * * sort [-bdfinrt?uv] [-[bdfinrt?]km1.n1,m2.n2] [-oOUTFILE] [file ...] * * Sort sorts all of the named files together and writes the result * to the standard output. The standard input is sorted if no file * names are supplied; sort may be used as a filter. * * Default sort is ascending in ASCII sequence using the entire line, * with upper and lower case considered different. * Up to 10 key fields may be specified. * Lines with equal keys are further ordered using the entire * line as an ASCII key. * * The '-o' option causes the sorted output to be written to the * named output file instead of to the standard output. The output * file may be the same as one of the input files. * * The '-b' (blank) option causes leading whitespace in the key field * to be ignored. * * The '-d' (dictionary) option sorts in dictionary * order; only letters, digits, and blanks are considered in the * compare, all else is ignored. * * The '-f' (fold) option folds all letters * into lower-case for comparisons. * * The '-i' (ignore) option causes non-printing (whitespace) * characters to be ignored. * * The '-k' option selects a field to be used as a sort key. * Up to 10 key fields can be specified. The keys are in order of * decreasing significance. Key fields are compared, beginning * with the most significant one, until a not-equal key is found. * * The format of this is: '-km1.n1,m2.n2'. * Any of 'bdfinrt?' may also be specified by a preceeding * switch field. * * m1.n1 = start-of-key position (1st char of the key). * m2.n2 = end-of-key position (the char after the key). * m? = # of fields to skip, from start of line. * ( 0 = start of line ) * In the absence of a "-t" flag, a field is defined as: * optional whitespace, followed by one or * more non-whitespace characters, up to but not * including the next whitespace character. * n? = # of chars to skip, from start-of-field. * m1.n1 if omitted means skip 0 fields & 0 chars from * start of line. * m2.n2 if omitted means end-of line. * Other than m2.n2 being omitted, any of the 4 numbers * omitted means 0 for that number. * Neither of these specifiers will go past the end-of-line. * It is possible for a line to have a null key if * the start-of-key is on or after the end-of-key, * or at end-of-line. * A null key is lower than any non-null key. * * If '-b' was specified, leading whitespace is skipped * after advancing to the m1/m2 field, but before * advancing by n1/n2 characters. * * The '-n' (numeric) option changes the ordering to ascending * arithmetic on a * leading signed integer numeric string. * * The '-r' (reverse) option reverses the sense of the comparison * from ascending to descending. * * The '-t?' (terminator char) the definition of a "field" (see '-k') * is changed. The new definition is: * string of zero or more characters ending with the * "?" character. * * The '-u' (unique) option causes only the 1st line of a set of * lines having equal keys to be output. Only the specified keys are * considered in the definition of 'unique'. * * The '-v' (verbose) gives elapsed time, etc. * * Examples of key field selection: * If a line is: "ABC DEF GHI" * flag key field * -k1 " DEF GHI" * -bk1 "DEF GHI" * -bk1.1 "EF GHI" * -bk1,2 "DEF " * -k1,1 "" (null field) * -k1.0,2.1 " DEF " * -bk1.0,2.1 "DEF G" * -k0.1,1.3 "BC DE" * -k1.0,2.0 " DEF" * -k0.5,1.0 "" (null field) * * If a line is: "ABC,DEFGHIJ,KLM" * flag key field * -t,k1 "DEFGHIJ,KLM" * -t,k1,2 "DEFGHIJ," * -t,k0.1,0.6 "BC,DE" * -t,k1.0,0.10 "DEFGHI" * * Error messages: * * "Cannot create temp. file" if the required temporary file cannot * be created in the current directory. * "Cannot open" if a file cannot be accessed for reading. * "Cannot create" if a file cannot be created for writing. * "Out of space" if the sort runs out of memory. * * The following messages occur on a fatal error. Get help. * "Panic: Unexpected end of file" * "Panic: empty run" * "Panic: temp. file" * * Author: * David Conroy * Very slightly modified by Martin Minow * The sort & work file handling extensivly modified * by Ray Van Tassle * Ray Van Tassle * Motorola * 1301 E. Algonquin Rd. * Room 4135 * Schaumburg, Ill. * (312)-576-6017 * * * * The sort algorithm is NOT recursive. * The following seems to work: * tkb * sort=sort,qksort,c:c/lb * / * LIBR=FCSRES:RO * // * * The "native" C library currently lacks the ftell() function, * Thus, the routine may be used only with the Decus (PDP-11 * compatibility mode) library. * * * Work-file(s) * This program uses a Quicksort for the distribution phase (creating * runs of sorted lines) and puts each run into a work file. In essence, * the merge is one n-way merge onto the output file. ("n" is the number * of runs created in the 1st pass, as the file position is saved for each * run). * * During the distribution (input) phase, the sorted array (line) * is used as a heap while putting the run out to the work file. * We replace the lines with input lines, to attempt to increase the size * of the run. It is muchly and widely claimed that this doubles the * average run size. * My experiments bear this out. This should make the merge phase * go faster (less runs). * Oh hell. It turns out that doing so greatly increases the time * of the distribution phase (50% to 100%) but decreases the time of the * merge phase only a little bit. Perhaps it would help * in a very specialized environment, where you could make several * merge work-files, all on different devices, and have them contiguous, * so that you wouldn't have the overhead of arm seeks. * The absolute best you could ever hope to do would require 2 passes * over the data: one for the distribution phase, and one for * the merge phase. According to one of my books, about the best * thing to use with several work files is a "three file polyphase merge", * with a Fibonacci series for the number of runs on each file. * The quoted figures are 2.7-4.6 equivalent passes over the entire data * (polyphase is rough to figure exactly, because it does not pass the * entire file on each pass). * My figures on several varied files are the entire sort taking anywhere * from 3 to 5 times as long as just passing the file thru a pgm which * converts it to lower case. This seems quite good, as the time spent * in doing comparisons is hefty. Maybe changine the way we do the * merge could improve things a lot, but I don't think so. Anyway, this * is a pretty neat trick, cramming all the runs into one file. * * * I also pre-read several lines from each run whenever I do a seek * to that run (this is H_P_READ). This improves the time of the merge * phase by making it as much as ten (!!!) times faster. Evidently, * the seek time is the dominating factor of this phase, and * reading in several lines at each seek buys us a whole lot. Note that * each actual read operation gets 512 bytes, which is usually several * lines, so we have done all the work to get many lines, so * we might as well take advantage of that fact and save some of them. * It turns out that there is very little to be gained by increasing the * size of the pre-read from 5 to 10 lines. I suspect that most * of the improvement is in going from none to 1, and each incremental line * that is added gains less than the last. * * Doing all these things, and increasing the size of the in-core sort * area (MX_RLINE) will make this a pretty fast sort program. * * Some timings: * which distr merge runs (distr,merge times in seconds 11/70) * Orig 327.2 146.1 214 Joe Sventeks DICT 45000 words * This 171 177 75 same (The file is already sorted) * pass 117 convert it to lower case (pass it) * * orig 96.4 2038 161 32000 random numbers (ascii sort) * this 174 271 54 same * pass 111 * * orig 17.3 100.7 49 words.doc (9628 words) * this 40 61 17 * pass 23 * */ #include #define MX_RLINE 600 /* number of lines in a run */ #define TEMP "sort.tmp" #define MAX_WK_FILES 1 /* # of different work-files to use */ #define MAX_KEYS 11 /* Max number of key fields that can be specified */ #define H_P_READ 8 /* # of lines a heapelt can hold */ typedef char BYTE; /*********** */ /* These change the way internal routines perform */ #define TREEVER /* verify the tree after doing the reheap */ #undef TREEVER #define REPLPUT /* read replacement lines in putline */ #undef REPLPUT struct work_file { char w_filename[35]; FILE *w_fp; }; struct run { struct run *r_rp; long r_seek; int r_size; }; struct heap_s { char *h_lp_a[H_P_READ]; struct run *h_rp; }; struct run *crp = NULL; /* current run ptr */ struct run *frp = NULL; /* 1st run ptr */ struct run *lrp = NULL; /* last run ptr */ struct work_file wkf[MAX_WK_FILES]; char **line = NULL; int first_out = TRUE; /* This is the 1st output line */ FILE *ofp = NULL; FILE *tfp = NULL; FILE *ifp = NULL; char *ofn = NULL; struct heap_s *heap; int nline = 0; int nruns = 0; int max_mline = 0; /* max # of lines we can hold in memory during merge*/ int pre_lines = 0; /* # of lines actually pre-read into memory */ long int in_records = 0; /* # of records read from the input files */ long int in_bytes = 0; /* # of bytes in them */ /*** This doubles as the input buffer and the detailed usage help * info. It had better be at least 200 bytes long. */ char lbuf[] = "-b ignore leading blanks\n-d dictionary order\n\ -f fold letters to lowercase for compare\n\ -i ignore non-printing chars\n\ -km1.n1,m2.n2 select key field (max 9 keys)\n-n numeric comparison\n\ -oOUT specify output file (otherwise it is stdout)\n\ -r reverse order to descending\n\ -t? specify field terminator char\n\ -u output only 1st line with equal keys\n\ -v verbose\n"; int nlbuf = -1; /* length of string in lbuf (<0 means none) */ char **llout = NULL; /* last line put out (for -u) */ int dflag[MAX_KEYS] = 0; /* dictionary order */ int iflag[MAX_KEYS] = 0; /* ignore all whitespace */ int fflag[MAX_KEYS] = 0; /* fold upper-case to lower-case */ int vflag = 0; /* give elapsed times, etc */ int hard_way[MAX_KEYS] = 0; /* if any of the above are set, this * comparison must be done the hard way.*/ int uflag = 0; /* only output 1st of set of lines with * equal keys */ int nflag[MAX_KEYS] = 0; /* numeric comparison (else alphabetic) */ int rflag[MAX_KEYS] = 0; /* reverse sort sense * (descending instead of ascending) */ int bflag[MAX_KEYS] = 0; /* ignore leading whitespace */ char term_char[MAX_KEYS] = 0; /* field terminator character */ int kflag = 0; /* number of field specifiers (0 = none) */ /***** m1/n1 specify the 1st character of the key */ int m1[MAX_KEYS] = 0; /* fields to skip to start-of-key */ int n1[MAX_KEYS] = 0; /* chars to skip from start-of-field */ /***** m2/n2 specify the 1st character after the key. This char isn't * included in the key. */ int m2[MAX_KEYS] = -1; /* Number of fields to skip to end-of-key. * -1 means 'end of line' */ int n2[MAX_KEYS] = 0; /* chars to skip from end-of-key field */ extern char *getline(); extern char *xalloc(); extern int compare(); /* compare routine */ extern qksort(); /* the sorting routine */ struct heap_s heap_tmp; /* temp area for the reheap */ extern long int systime(); /* time in msec since midnight */ long int start_time; main(argc, argv) char *argv[]; { register struct run *rp; register char *cp; char *cptmp; struct heap_s *hp; int c, i, j, nf; start_time = systime(); /* init for elapsed time */ nf = argc - 1; for (i=1; i= argc) usage(c); /* no next arg, error! */ if (*(ofn = argv[i]) == '-') usage(c); /* next arg is itself an arg!! */ argv[i] = NULL; /* eliminate the arg */ } else { /* value is in this arg */ ofn = cp; } cp = argv[i]; /* re-examine current arg (to skip it) */ --nf; break; case 'r': ++rflag[kflag]; break; case 'k': defin_key(cp); /* printf("m1,n1, m2,n2: %d,%d, %d,%d\n", m1[kflag], n1[kflag], m2[kflag], n2[kflag]); printf("%d %d %d %d %d %d %d %d\n", bflag[kflag], rflag[kflag], nflag[kflag], iflag[kflag], fflag[kflag], dflag[kflag], hard_way[kflag], term_char[kflag]); */ kflag++; m1[kflag] = n1[kflag] = n2[kflag] = 0; m2[kflag] = -1; bflag[kflag] = rflag[kflag] = nflag[kflag] = 0; iflag[kflag] = fflag[kflag] = dflag[kflag] = 0; hard_way[kflag] = term_char[kflag] = 0; cp = &argv[i]; break; case 't': if ((c = *cp++) == NULL) usage(c); if (c == '\\') { /* escaped char */ if (*cp == NULL) usage(c); cptmp = cp; c = esc_char(&cptmp); cp = cptmp; } term_char[kflag] = c; break; default: usage(c); } } } } if (nf == 0) ifp = stdin; if ((tfp = fopen(TEMP, "w")) == NULL) { fprintf(stderr, "Cannot create temp file.\n"); exit(1); } iovtoa (tfp, wkf[0].w_filename); line = xalloc(MX_RLINE * sizeof(char *)); crp = xalloc(sizeof(struct run)); crp->r_size = 0; /* There is always an implied last key of: ascii, ascending order, * using the entire line. */ if (kflag == 0) { kflag++; } m1[kflag] = n1[kflag] = n2[kflag] = 0; m2[kflag] = -1; bflag[kflag] = rflag[kflag] = nflag[kflag] = 0; iflag[kflag] = fflag[kflag] = dflag[kflag] = 0; hard_way[kflag] = term_char[kflag] = 0; if (m1[kflag] == m1[kflag-1] && n1[kflag] == n1[kflag-1] && m2[kflag] == m2[kflag-1] && n2[kflag] == n2[kflag-1]) if (bflag[kflag] == bflag[kflag-1] && rflag[kflag] == rflag[kflag-1] && nflag[kflag] == nflag[kflag-1] && iflag[kflag] == iflag[kflag-1]) if (fflag[kflag] == fflag[kflag-1] && dflag[kflag] == dflag[kflag-1] && term_char[kflag] == term_char[kflag-1]) kflag--; /*** Distribution phase. Create sorted runs from the input file(s) * to the temp file(s). */ for (;;) { if (ifp == NULL) { for (i=1; i= argc) break; argv[i] = NULL; if ((ifp = fopen(cp, "r")) == NULL) { fprintf(stderr, "%s: cannot open.\n", cp); quit(); } } get_in_line(); /* get line from input */ if (nlbuf >= 0) { if (nline >= MX_RLINE || (cp = malloc(nlbuf)) == NULL) { do { qksort(line, nline, sizeof(line[0]), &compare); saverun(); putline(tfp); crp = xalloc(sizeof(struct run)); crp->r_size = 0; } while (nline == MX_RLINE); if (nlbuf >= 0) cp = xalloc(nlbuf); } if (nlbuf >= 0) { strcpy(cp, lbuf); nlbuf = -1; line[nline++] = cp; } } } qksort(line, nline, sizeof (line[0]), &compare); if (frp == NULL) { /* We have only 1 run, so put it right out. */ openoutput(); if (uflag) llout = xalloc(sizeof(lbuf)); putline(ofp); pr_eltim(); quit(); } /*** Merge phase. We are all done with the input files. */ if (nline > 0) { saverun(); putline(tfp); } fclose(tfp); pr_eltim(); start_time = systime(); free(line); if (uflag) llout = xalloc(sizeof(lbuf)); openoutput(); if ((tfp = fopen(wkf[0].w_filename, "r")) == NULL) panic("Cannot reopen temp file.\n"); heap = xalloc(nruns * sizeof(struct heap_s)); /* See how many lines can be pre-read form the various runs. All this is * in an attempt to read several lines from a run into memory each time * we read, because the seek time is most of the time expense of the * merge phase. * We are limited by: 1) the size of the h_lps array in a heap element, * and 2) the amount of memory we have available to put these lines into. * To arrive at the memory we have/need, allocate as much as we need. * This will most likely fail (no way is a very large file going to * fit in memory). This failure point defines the max # of lines we will * be able to hold. There is a safety margin here, but bad luck could * cause the alloc to fail during the merge. If this happens, bump up * the safety margin. With any luck, using the average record size will * be good enough. */ i = nruns * H_P_READ; if (i > in_records) i = in_records; c = in_bytes / in_records; if (c < (sizeof(int))) c = sizeof(int); lrp = xalloc(25 * c); /* alloc a safety margin */ lrp->r_rp = NULL; while (i-- && (crp = malloc(c)) != NULL) { crp->r_rp = lrp; lrp = crp; max_mline++; } /* now free the space all up */ while (lrp != NULL) { crp = lrp; lrp = lrp->r_rp; free(crp); } /* Read the same number of lines initially for all runs. */ i = max_mlines / nruns; if (i <= 0) i = 1; if (i > H_P_READ) i = H_P_READ; rp = frp; hp = &heap[0]; while (rp != NULL) { /* init for the merge */ hp->h_rp = rp; run_read(hp, i); if (hp->h_lp_a[0] == NULL) panic("Empty run.\n"); rp = rp->r_rp; hp++; } /* Sort it, to get the heap initially set up. */ qksort (heap, nruns, sizeof (struct heap_s), &compare); while (nruns) { cp = heap[0].h_lp_a[0]; if (llout) sp_fputs(cp, ofp); else fputss(cp, ofp); free(cp); copy(&heap_tmp, &heap[0], sizeof (struct heap_s)); pre_lines--; /* shift up the other pre-read lines */ copy(&heap_tmp.h_lp_a[0], &heap_tmp.h_lp_a[1], sizeof(heap.h_lp_a[0]) * (H_P_READ-1)); heap_tmp.h_lp_a[H_P_READ - 1] = NULL; if (heap_tmp.h_lp_a[0] == NULL) /* we used them all up */ run_read(&heap_tmp, H_P_READ); if (heap_tmp.h_lp_a[0] == NULL) { /* Done with this run. */ --nruns; reheap (heap, nruns, sizeof (struct heap_s), &heap[nruns]); } else reheap (heap, nruns, sizeof (struct heap_s), &heap_tmp); } if (vflag) { i = (systime() - start_time) /100; /* get&print elapsed time*/ fprintf (stderr,"Merge Elapsed: %d.%01d sec\n", i/10, i%10); } quit(); } /*********************************************/ pr_eltim() { int i; if (!vflag) return; i = (systime() - start_time) /100; /* get&print elapsed time*/ fprintf(stderr, "# records: %ld\t#bytes: %ld\n", in_records, in_bytes); fprintf (stderr,"%d runs\n", nruns); fprintf (stderr,"Distr Elapsed: %d.%01d sec\n", i/10, i%10); } /*********************************************/ /* Get the next line from the input file to "lbuf". * ifp == NULL means input file is closed. * nlbuf == length of the line in lbuf (<0 means nothing in lbuf). */ get_in_line() { if (ifp == NULL) return; if (fgetss(lbuf, sizeof lbuf, ifp) == NULL) { if (ifp != stdin) fclose(ifp); ifp = NULL; nlbuf = -1; } else { nlbuf = strlen(lbuf); in_records++; in_bytes += nlbuf; } } /* * Open the output file and stash its file * pointer in 'ofp'. If no output file is * given 'ofp' is a dup. of 'stdout'. */ openoutput() { if (ofn == NULL) ofp = stdout; else if ((ofp = fopen(ofn, "w")) == NULL) { fprintf(stderr, "%s: cannot create.\n", ofn); quit(); } } /******************************************/ /* Special fputs, used for '-u' flag, to put out only * unique lines. * The very first line is deemed 'unique', and is always put out. */ sp_fputs(buf, usr_file) char *buf; FILE *usr_file; { if (com_par(&llout, &buf, kflag? kflag-1 : kflag) || first_out) { fputss(buf, usr_file); /* this line different from last */ first_out = FALSE; strcpy (llout, buf); } } /**************************************/ /* read some lines from a run into memory */ run_read(helt, m_lines) struct heap_s *helt; /* the heap element for this run */ int m_lines; /* max # of lines to read */ { int i; for (i = 0; i < m_lines && pre_lines <= max_mline; i++) { if ((helt->h_lp_a[i] = getline(helt->h_rp)) == NULL) break; pre_lines++; } if (i < H_P_READ) helt->h_lp_a[i] = NULL; } /************************************************/ /* convert escaped char to a char value. Update cp. */ esc_char (cpp) char **cpp; { register char c; /* the converted character */ register char c1; register char *p; /* buffer ptr */ p = *cpp; /* point to the string */ c = tolower(*p++); if (c == 't') c = '\t'; /* \t is tab */ else if (c == 's') c = ' '; /* \s is space */ else if (c >= '0' && c <= '7') { /* \digits is the obvious */ c -= '0'; while ((c1 = *p++) >= '0' && c1 <= '7') c = (c<<3) + (c1 - '0'); p--; /* back up, we went too far */ } else /* anything else is itself */ c = *(p-1); *cpp = p; /* update the buffer pointer */ return (c); } /******************** routines passed into quicksort *******/ /* * Compare routine. */ static int compare(sa, sb) char *sa[], *sb[]; { return (com_par(sa, sb, kflag)); } static int com_par(sa, sb, num_keys) char *sa[], *sb[]; int num_keys; /* max number of keys to examine */ { extern char *get_to_key(); register char *a, *b; register c; char ch; long d, atol(); int field_num; /* field number loop counter */ char *aeok_ptr, *beok_ptr; /* for saving the char after the key */ char aeok_char, beok_char; c = 0; for (field_num = 0; !c && (field_num <= num_keys); field_num++) { /* find the key-field addresses */ aeok_ptr = beok_ptr = &ch; /* in case EOK = EOL */ a = get_to_key(m1[field_num], n1[field_num], m2[field_num], n2[field_num], bflag[field_num], *sa, &aeok_ptr, term_char[field_num]); b = get_to_key(m1[field_num], n1[field_num], m2[field_num], n2[field_num], bflag[field_num], *sb, &beok_ptr, term_char[field_num]); aeok_char = *aeok_ptr; *aeok_ptr = NULL; /* terminate keys */ beok_char = *beok_ptr; *beok_ptr = NULL; /* terminate keys */ /*fprintf(stderr,"a key:'%s'\n", a);*/ if (nflag[field_num]) { if ((d = atol(a)-atol(b)) < 0) --c; else if (d > 0) ++c; } else if (hard_way[field_num]) c = hard_cmp(a, b, iflag[field_num], dflag[field_num], fflag[field_num]); else c = strcmp(a, b); /* restore the char after the keys */ *aeok_ptr = aeok_char; *beok_ptr = beok_char; if (rflag[field_num]) c = -c; } return (c); } /***************/ /* return pointer to the next field of the string. * A field is defined as: optional whitespace followed by * non-whitespace; the next field is the next * whitespace or NULL. * * The returned result is a pointer to this 1st char of the * next field. * It will never advance past the trailing NULL. */ char *nxt_fld(s, tch) register char *s; register char tch; /* field terminator char or NULL */ { register char c; if (tch != NULL) { while (*s != NULL && *s++ != tch) ; } else { while (((c=*s) != NULL) && iswhite(c)) s++; while (((c=*s) != NULL) && !iswhite(c)) s++; } return (s); } /*****************************************/ /* whitespace is anything not between space & 0177 */ iswhite(c) register char c; { /* make it an unsigned byte */ c &= 0377; return ( (c <= ' ') || (0177 <= c)); } /*****************************************/ /********* get the definition for a key field */ defin_key(cp) char *cp; { if (kflag >= MAX_KEYS - 1) panic ("Too many keys specified\n"); n1[kflag] = n2[kflag] = 0; m2[kflag] = -1; m1[kflag] = atodl(&cp); if (*cp == '.') { cp++; /* char adv from start of field */ n1[kflag] = atodl(&cp); } if (!*cp++) return; /* no end-of-key */ m2[kflag] = atodl(&cp); if (!*cp) return; if (*cp++ != '.') panic ("Invalid key field format\n"); n2[kflag] = atodl(&cp); return; } /*****************/ /* unsigned ascii to decimal conversion. * stops on a non-digit. Update the buffer pointer to * point to this terminating non-digit. */ int atodl(cp) char **cp; { register int n; register char *p; register char ch; n = 0; p = *cp; /* point to the string */ while (isdigit((ch = *p++))) n = 10*n + (ch - '0'); *cp = p-1; /* update the buffer pointer */ return (n); } /************************************************/ /* get pointers to the start-of-key, and end-of-key. * returns ptr to 1st char of the key. * "eok_ptr" points to next char after the key, unless eok = "end-of-line". * The pointers will never go past the NULL. * eok-ptr will never be before sok-ptr. */ char *get_to_key (sok_fld, sok_adv, eok_fld, eok_adv, skip_ws, str, eok_ptr, term_ch) int sok_fld, sok_adv; /* start-of-key fields & chars to skip*/ int eok_fld, eok_adv; /* end-of-key fields & chars to skip*/ int skip_ws; /* skip leading whitespace before advancing by chars*/ char *str; /* the string */ char **eok_ptr; /* set to point to the 1st char after the key, * if any eok was specified. * otherwise not modified. */ char term_ch; /* field terminator or NULL */ { register char *kptr; /* start-of-key pointer */ register char *eptr; /* end-of-key pointer */ register int i; char ch; /* skip over fields to get to start-of-key */ kptr = str; /* init to start of line */ i = sok_fld; while (i--) kptr = nxt_fld(kptr, term_ch); /* probably we can keep going forward from the SOK to EOK field */ i = eok_fld - sok_fld; eptr = kptr; /* advance SOK ptr by chars (never past end) */ if (skip_ws) while ((ch=*kptr) && iswhite(ch)) kptr++; while (sok_adv-- && *kptr++); if (eok_fld >= 0) { /* position to eok field*/ if (i < 0) { i = eok_fld; /* eok search from start of line */ eptr = str; } while (i--) eptr = nxt_fld(eptr, term_ch); /* advance eok ptr by chars (never past end) */ if (skip_ws) while ((ch=*eptr) && iswhite(ch)) eptr++; while (eok_adv-- && *eptr++); *eok_ptr = eptr; /* return ptr to char after key */ if (eptr < kptr) *eok_ptr = kptr; /* force null key */ } return (kptr); } /*************************************************************/ /* string comparisons the hard-way, one character at a time, * because one of the special flags was set. */ hard_cmp(a, b, ifl, dfl, ffl) register char *a, *b; /* the two strings to compare */ int ifl; /* ignore all whitespace */ int dfl; /* dictionary order, ignore all but letters, digits, * and blanks */ int ffl; /* fold upper-case to lower case before compare */ { register char achar, bchar; int c; /* result */ for (c = 0;;) { if (dfl) { /* dictionary-style */ while ((achar = *a) && !(isdigit(achar) || isalpha(achar) || (achar == ' '))) a++; while ((bchar = *b) && !(isdigit(bchar) || isalpha(bchar) || (bchar == ' '))) b++; } if (ifl) { /* ignore all whitespace */ while ((achar = *a) && iswhite(achar)) a++; while ((bchar = *b) && iswhite(bchar)) b++; } if (ffl) { /* fold to lower-case */ achar = tolower(*a); bchar = tolower(*b); } c = achar - bchar; if (c) break; if (achar == NULL) break; a++; b++; /* incr string ptrs (we know neither is at EOS) */ } return (c); } /* * Rebuild a heap. * This is essentially TREESORT. * Used to reorder the heap when a new item * is read into it. * The 'heap' is a binary tree, such that for each node 'i', the key at * 'i' is the lowest of it and it's two sons, (2*i + 1) and (2*i + 2). * The new element is to be inserted where it should go. * We can move things up in the heap, as array element [0] is empty. * At each step, h[i] is empty. */ reheap(h, n, elt_size, new_elt) BYTE h[]; /* The array which is the heap */ int n; /* number of elements in array h*/ int elt_size; /* size of array element in bytes */ struct heap_s *new_elt; /* new element to be added */ { register int i, j; BYTE *ip, *jp, *jp1; /* ptrs for i, j, j+1 */ for (i = 0, ip = h; (j = 2*i+1) < n; i = j, ip = jp) { jp = h+(j*elt_size); jp1 = jp + elt_size; if ((j+1 < n) && (compare(jp1, jp) < 0)) { jp = jp1; ++j; } /* j now points to the smaller child of i */ if (compare(new_elt, jp) <= 0) break; copy(ip, jp, elt_size); } copy(ip, new_elt, elt_size); #ifdef TREEVER /**** verify that the tree is ok */ for (i = 0; (j = 2*i+1) < nruns; i++) { if (h != heap) break; if ((j < nruns && compare(&heap[i].h_lp, &heap[j].h_lp) > 0) || (j+1 < nruns && compare(&heap[i].h_lp,&heap[j+1].h_lp) > 0)) { fprintf (stderr, "Tree out of order. n = %d. Index = %d.\n", nruns, i); fprintf (stderr,"addresses: %o %o %o\n", &heap[i], &heap[j], &heap[j+1]); fprintf (stderr,"nruns: %d\n%d->%s%d->%s%d->%s", nruns, i,heap[i].h_lp,j,heap[j].h_lp,j+1, j+1%s", new_elt->h_lp); fprintf (stderr,"At %o\n", new_elt); fprintf (stderr,"%d->%s", 0, heap[0].h_lp); for (i = 0; i %s", i, heap[i].h_lp); } abort(); } } for (i = 0; (j = 2*i+1) < nline; i++) { if (h != line) break; if ((j < nline && compare(&line[i], &line[j]) > 0) || (j+1 < nline && compare(&line[i],&line[j+1]) > 0)) { fprintf (stderr, "Tree out of order. n = %d. Index = %d.\n", nline, i); fprintf (stderr,"addresses: %o %o %o\n", &line[i], &line[j], &line[j+1]); fprintf (stderr,"nline: %d\n%d->%s%d->%s%d->%s", nline, i,line[i],j,line[j],j+1, j+1%s", i, line[i]); } abort(); } } #endif } /* * Save a run. * The run block has been preallocated * because there may not be enough space * to allocate it now. */ saverun() { long ftell(); crp->r_rp = NULL; crp->r_seek = ftell(tfp); if (frp == NULL) frp = crp; else lrp->r_rp = crp; lrp = crp; ++nruns; } /* * Get a line from the specified run * on the temp. file. * Pack the line into allocated storage * and return a pointer to it. * Return NULL if there are no lines left * in the run; real end of file is an * internal botch. */ char *getline(rp) register struct run *rp; { register char *cp; long ftell(); if (rp->r_size == 0) return (NULL); flseek(tfp, rp->r_seek, 0); if (fgetss(lbuf, sizeof lbuf, tfp) == NULL) panic("Unexpected end of file\n"); rp->r_seek = ftell(tfp); --rp->r_size; cp = xalloc(strlen(lbuf) + sizeof(char)); strcpy(cp, lbuf); return (cp); } /*************************/ /* Dump the lines in the array to the file. */ putline(fp) FILE *fp; /* the output fp */ { register int ilater; /* ptr for lines that can't go in this run */ char *ncp, *cp; int i; crp->r_size = nline; #ifndef REPLPUT for (i=0; i 0) { cp = line[0]; if (llout) sp_fputs(cp, fp); else fputss(cp, fp); if (nlbuf >= 0 && crp->r_size < 30000 && (ncp = malloc(nlbuf)) != NULL) { strcpy(ncp, lbuf); get_in_line(); if (compare(&ncp, &cp) >= 0) { /* the line can be in this run */ crp->r_size++; reheap(line, nline, sizeof(line[0]), &ncp); } else { /* it will be part of the next run */ nline--; reheap (line, nline, sizeof(line[0]), &line[nline]); line[--ilater] = ncp; /* tuck it away */ } } else { /* nothing to replace with */ nline--; reheap(line, nline, sizeof(line[0]), &line[nline]); } free(cp); } /* The new replacement lines that couldn't become part of * this run are at the end of the array. Move them up and adjust the * index. */ nline = MX_RLINE - ilater; if (nline != 0 && ilater != 0) copy(&line[0], &line[ilater], nline * sizeof(line[0])); #endif } /* * Allocate space. * If no space, abort with a nasty * little message. */ char *xalloc(n) { register char *p; if ((p = malloc(n)) == NULL) { fprintf(stderr, "Out of space.\n"); exit(1); } return (p); } /* * Quit. * Get rid of the temp. file. * Exit. */ quit() { if (tfp != NULL) fmkdl(tfp); exit(0); } /* * Tell the user just what is expected * of him. */ usage(c) char c; { fprintf(stderr, "Usage: sort [-bdfinrt?uv] [-[bdfinrt?]km1.n1,m2.n2]"); fprintf (stderr, " [-k...]\n\t [-oOUTFILE] [file ...]\n"); if (c == '?') { printf(lbuf); } exit(1); } /* * Errors. * Print a message and die with "error" status on RT/RSX, * "error" on UNIX (I think). */ errxit(a) { fprintf(stderr,"?SORT-F-%r", &a); #ifdef decus /* Exit status per rt/rsx/unix */ #ifdef rt11 exit(4); #endif #ifdef rsx exit(2); #endif #else exit(0); #endif } /* * Fatal errors. * Print a message and die. */ panic(a) { fprintf(stderr, "Panic: %r", &a); #ifdef decus /* Exit status per rt/rsx/unix */ #ifdef rt11 exit(8); #endif #ifdef rsx exit(4); #endif #else exit(0); #endif }