C PROGRAMMING COLUMN by Al Stevens [LISTING ONE] /* ---------------------- csort.h --------------------------- */ #define NOFLDS 5 /* maximum number of fields to sort */ #define MOSTMEM 50000U /* most memory for sort buffer */ #define LEASTMEM 10240 /* least memory for sort buffer */ struct s_prm { /* sort parameters */ int rc_len; /* record length */ struct { int f_pos; /* 1st position of field (rel 1) */ int f_len; /* length of field */ char ad; /* a = ascending; d = descending */ } s_fld [NOFLDS]; /* one per field */ }; struct bp { /* one for each sequence in merge buffer */ char *rc; /* -> record in merge buffer */ int rbuf; /* records left in buffer this sequence */ int rdsk; /* records left on disk this sequence */ }; int init_sort(struct s_prm *prms); /* Initialize the sort */ void sort(char *rcd); /* Pass records to Sort */ char *sort_op(void); /* Retrieve sorted records*/ void sort_stats(void); /* Display sort statistics*/ [LISTING TWO] /* --------------------- filesort.c ------------------------ */ #include #include #include #include "csort.h" /* sort a file: * command line: filesort filename length {f1, l1, az1, ...} * filename is the name of the input file * length is the length of a record * for each field: * f1 is the field position relative to 1 * l1 is the field lengths * az1 = A = ascending, D = descending */ static struct s_prm sp; static void usage(void); void main(int argc, char *argv[]) { int i, fct = 0; FILE *fpin, *fpout; char *buf; char filename[64]; /* ------- get the file name from the command line ------ */ if (argc-- > 1) strcpy(filename, argv++[1]); else usage(); /* ----- get the record length from the command line ---- */ if (argc-- > 1) sp.rc_len = atoi(argv++[1]); else usage(); /* ----- get field definitions from the command line ---- */ do { if (argc < 4) usage(); sp.s_fld[fct].f_pos = atoi(argv++[1]); sp.s_fld[fct].f_len = atoi(argv++[1]); sp.s_fld[fct].ad = *argv++[1]; argc -= 3; fct++; } while (argc > 1); printf("\nFile: %s, length", filename, sp.rc_len); for (i = 0; i < fct; i++) printf("\nField %d: position %d, length %d, %s", i+1, sp.s_fld[i].f_pos, sp.s_fld[i].f_len, sp.s_fld[i].ad == 'd' ? "descending" : "ascending"); if ((fpin = fopen(filename, "rb")) == NULL) { printf("\nInput file not found"); exit(1); } if ((buf = malloc(sp.rc_len)) == NULL || init_sort(&sp) == -1) { printf("\nInsufficient memory to sort"); exit(1); } /* ------ sort the input records ------- */ while (fread(buf, sp.rc_len, 1, fpin) == 1) sort(buf); sort(NULL); fclose(fpin); /* ----- retrieve the sorted output records ------ */ fpout = fopen("SORTED.DAT", "wb"); while ((buf = sort_op()) != NULL) fwrite(buf, sp.rc_len, 1, fpout); fclose(fpout); sort_stats(); free(buf); } static void usage(void) { printf("\nusage: filesort fname len {pos length ad...}"); exit(1); } [LISTING THREE] /* ----------------------- csort.c ------------------------- */ #include #include #include #include "csort.h" static struct s_prm *sp; /* structure of sort parameters */ static unsigned totrcd; /* total records sorted */ static int no_seq; /* counts sequences */ static int no_seq1; static unsigned bspace; /* available buffer space */ static int nrcds; /* # of records in sort buffer */ static int nrcds1; static char *bf, *bf1; /* points to sort buffer */ static int inbf; /* variable records in sort buffer */ static char **sptr; /* -> array of buffer pointers */ static char *init_sptr; /* pointer to appropriated buffer */ static int rcds_seq; /* rcds / sequence in merge buffer */ static FILE *fp1, *fp2; /* sort work file fds */ static char fdname[15]; /* sort work name */ static char f2name[15]; /* sort work name */ static int comp(char **a, char **b); static char *appr_mem(unsigned *h); static FILE *wopen(char *name, int n); static void dumpbuff(void); static void merge(void); static void prep_merge(void); /* -------- initialize sort global variables---------- */ int init_sort(struct s_prm *prms) { sp = prms; if ((bf = appr_mem(&bspace)) != NULL) { nrcds1 = nrcds = bspace / (sp->rc_len + sizeof(char *)); init_sptr = bf; sptr = (char **) bf; bf += nrcds * sizeof(char *); fp1 = fp2 = NULL; totrcd = no_seq = inbf = 0; return 0; } else return -1; } /* --------- Function to accept records to sort ------------ */ void sort(char *s_rcd) { if (inbf == nrcds) { /* if the sort buffer is full */ qsort(init_sptr, inbf, sizeof (char *), comp); if (s_rcd) { /* if there are more records to sort */ dumpbuff(); /* dump the buffer to a sort work file*/ no_seq++; /* count the sorted sequences */ } } if (s_rcd != NULL) { /* --- this is a record to sort --- */ totrcd++; /* --- put the rcd addr in the pointer array --- */ *sptr = bf + inbf * sp->rc_len; inbf++; /* --- move the rcd to the buffer --- */ memmove(*sptr, s_rcd, sp->rc_len); sptr++; /* point to next array entry */ } else { /* null pointer means no more rcds */ if (inbf) { /* any records in the buffer? */ qsort(init_sptr, inbf, sizeof (char *), comp); if (no_seq) /* if this isn't the only sequence*/ dumpbuff(); /* dump the buffer to a work file */ no_seq++; /* count the sequence */ } no_seq1 = no_seq; if (no_seq > 1) /* if there is more than 1 sequence */ prep_merge(); /* prepare for the merge */ } } /* -------------- Prepare for the merge ----------------- */ static void prep_merge() { int i; struct bp *rr; unsigned n_bfsz; memset(init_sptr, '\0', bspace); /* -------- merge buffer size ------ */ n_bfsz = bspace - no_seq * sizeof(struct bp); /* ------ # rcds/seq in merge buffer ------- */ rcds_seq = n_bfsz / no_seq / sp->rc_len; if (rcds_seq < 2) { /* ---- more sequence blocks than will fit in buffer, merge down ---- */ fp2 = wopen(f2name, 2); /* open a sort work file */ while (rcds_seq < 2) { FILE *hd; merge(); /* binary merge */ hd = fp1; /* swap fds */ fp1 = fp2; fp2 = hd; nrcds *= 2; /* ------ adjust number of sequences ------ */ no_seq = (no_seq + 1) / 2; n_bfsz = bspace - no_seq * sizeof(struct bp); rcds_seq = n_bfsz / no_seq / sp->rc_len; } } bf1 = init_sptr; rr = (struct bp *) init_sptr; bf1 += no_seq * sizeof(struct bp); bf = bf1; /* fill the merge buffer with records from all sequences */ for (i = 0; i < no_seq; i++) { fseek(fp1, (long) i * ((long) nrcds * sp->rc_len), SEEK_SET); /* ------ read them all at once ------ */ fread(bf1, rcds_seq * sp->rc_len, 1, fp1); rr->rc = bf1; /* --- the last seq has fewer rcds than the rest --- */ if (i == no_seq-1) { if (totrcd % nrcds > rcds_seq) { rr->rbuf = rcds_seq; rr->rdsk = (totrcd % nrcds) - rcds_seq; } else { rr->rbuf = totrcd % nrcds; rr->rdsk = 0; } } else { rr->rbuf = rcds_seq; rr->rdsk = nrcds - rcds_seq; } rr++; bf1 += rcds_seq * sp->rc_len; } } /* ------- Merge the work file down This is a binary merge of records from sequences in fp1 into fp2. ------ */ static void merge() { int i; int needy, needx; /* true = need a rcd from (x/y) */ int xcnt, ycnt; /* # rcds left each sequence */ int x, y; /* sequence counters */ long adx, ady; /* sequence record disk addresses */ /* --- the two sets of sequences are x and y ----- */ fseek(fp2, 0L, SEEK_SET); for (i = 0; i < no_seq; i += 2) { x = y = i; y++; ycnt = y == no_seq ? 0 : y == no_seq - 1 ? totrcd % nrcds : nrcds; xcnt = y == no_seq ? totrcd % nrcds : nrcds; adx = (long) x * (long) nrcds * sp->rc_len; ady = adx + (long) nrcds * sp ->rc_len; needy = needx = 1; while (xcnt || ycnt) { if (needx && xcnt) { /* need a rcd from x? */ fseek(fp1, adx, SEEK_SET); adx += (long) sp->rc_len; fread(init_sptr, sp->rc_len, 1, fp1); needx = 0; } if (needy && ycnt) { /* need a rcd from y? */ fseek(fp1, ady, SEEK_SET); ady += sp->rc_len; fread(init_sptr+sp->rc_len, sp->rc_len, 1, fp1); needy = 0; } if (xcnt || ycnt) { /* if anything is left */ /* ---- compare the two sequences --- */ if (!ycnt || (xcnt && (comp(&init_sptr, &init_sptr + sp->rc_len)) < 0)) { /* ----- record from x is lower ---- */ fwrite(init_sptr, sp->rc_len, 1, fp2); --xcnt; needx = 1; } else if (ycnt) { /* record from y is lower */ fwrite(init_sptr+sp->rc_len, sp->rc_len, 1, fp2); --ycnt; needy = 1; } } } } } /* -------- Dump the sort buffer to the work file ---------- */ static void dumpbuff() { int i; if (fp1 == NULL) fp1 = wopen(fdname, 1); sptr = (char **) init_sptr; for (i = 0; i < inbf; i++) { fwrite(*(sptr + i), sp->rc_len, 1, fp1); *(sptr + i) = 0; } inbf = 0; } /* --------------- Open a sort work file ------------------- */ static FILE *wopen(char *name, int n) { FILE *fp; strcpy(name, "sortwork.000"); name[strlen(name) - 1] += n; if ((fp = fopen(name, "wb+")) == NULL) { printf("\nFile error"); exit(1); } return fp; } /* --------- Function to get sorted records ---------------- This is called to get sorted records after the sort is done. It returns pointers to each sorted record. Each call to it returns one record. When there are no more records, it returns NULL. ------ */ char *sort_op() { int j = 0; int nrd, i, k, l; struct bp *rr; static int r1 = 0; char *rtn; long ad, tr; sptr = (char **) init_sptr; if (no_seq < 2) { /* -- with only 1 sequence, no merge has been done -- */ if (r1 == totrcd) { free(init_sptr); fp1 = fp2 = NULL; r1 = 0; return NULL; } return *(sptr + r1++); } rr = (struct bp *) init_sptr; for (i = 0; i < no_seq; i++) j |= (rr + i)->rbuf | (rr + i)->rdsk; /* -- j will be true if any sequence still has records - */ if (!j) { fclose(fp1); /* none left */ remove(fdname); if (fp2) { fclose(fp2); remove(f2name); } free(init_sptr); fp1 = fp2 = NULL; r1 = 0; return NULL; } k = 0; /* --- find the sequence in the merge buffer with the lowest record --- */ for (i = 0; i < no_seq; i++) k = ((comp( &(rr + k)->rc, &(rr + i)->rc) < 0) ? k : i); /* --- k is an integer sequence number that offsets to the sequence with the lowest record ---- */ (rr + k)->rbuf--; /* decrement the rcd counter */ rtn = (rr + k)->rc; /* set the return pointer */ (rr + k)->rc += sp->rc_len; if ((rr + k)->rbuf == 0) { /* ---- the sequence got empty ---- */ /* --- so get some more if there are any --- */ rtn = bf + k * rcds_seq * sp->rc_len; memmove(rtn, (rr + k)->rc - sp->rc_len, sp->rc_len); (rr + k)->rc = rtn + sp->rc_len; if ((rr + k)->rdsk != 0) { l = ((rcds_seq-1) < (rr+k)->rdsk) ? rcds_seq-1 : (rr+k)->rdsk; nrd = k == no_seq - 1 ? totrcd % nrcds : nrcds; tr = (long) ((k * nrcds + (nrd - (rr + k)->rdsk))); ad = tr * sp->rc_len; fseek(fp1, ad, SEEK_SET); fread(rtn + sp->rc_len, l * sp->rc_len, 1, fp1); (rr + k)->rbuf = l; (rr + k)->rdsk -= l; } else memset((rr + k)->rc, 127, sp->rc_len); } return rtn; } /* ------- Function to display sort stats -------- */ void sort_stats() { printf("\n\n\nRecord Length = %d",sp->rc_len); printf("\n%d records sorted",totrcd); printf("\n%d sequence",no_seq1); if (no_seq1 != 1) putchar('s'); printf("\n%u characters of sort buffer", bspace); printf("\n%d records per buffer\n\n",nrcds1); } /* ----- appropriate available memory ----- */ static char *appr_mem(unsigned *h) { char *buff = NULL; *h = (unsigned) MOSTMEM + 1024; while (buff == NULL && *h > LEASTMEM) { *h -= 1024; buff = malloc(*h); } return buff; } /* ------- compare function for sorting, merging -------- */ static int comp(char **a, char **b) { int i, k; if (**a == 127 || **b == 127) return (int) **a - (int) **b; for (i = 0; i < NOFLDS; i++) { if (sp->s_fld[i].f_pos == 0) break; if ((k = strnicmp((*a)+sp->s_fld[i].f_pos - 1, (*b)+sp->s_fld[i].f_pos - 1, sp->rc_len)) != 0) return (sp->s_fld[i].ad == 'd')?-k:k; } return 0; }