Spelling utilities in C (Part 2 of 3)

Graham Toal gtoal at tharr.UUCP
Thu Jun 28 17:07:54 AEST 1990


#!/bin/sh-----cut here-----cut here-----cut here-----cut here-----
# shar:	Shell Archiver
#	Run the following text with /bin/sh to create:
#	check.c #	correct.c #	dyntrie.c #	init.c #	locate.c #	print.c #	similcmp.c #	soundex.c #	utils.c 
cat - << \SHAR_EOF > check.c
/*

    File:    check.c
    Author:  Graham Toal
    Purpose: Check a word using DAWG or TRIE.
    Functions exported:  dawg_check, pack_check
    Internal function:   dawg_ck, pack_ck

Description:

    call as:     dawg_check(dawg, "word")
                 pack_check(trie, "word")

   Simple spelling-check. Unfortunately the two interfaces for
   DAWGs and TRIEs are different, and even DAWG stuff differs from
   program to program.  The problem is primarily in how to spot
   the end of a search; at the moment it is done when a particular
   pointer points to the root. Unfortunately we enter the data
   structure either at root+1 or at some arbitrary point, so this
   scheme means passing in two pointers.  The next idea is to check
   that the *data* pointed to by the terminating pointer is 0; this
   would be OK but I was hoping to leave the initial word in the
   dawg free for expansion... (dawg contains [0, Node1..NodeN])

   *best* idea would be to terminate on *INDEX* being 0.  This is
   probably also simplest & I'll code it up properly when I'm awake...

*/

static int
#ifdef PROTOTYPES
dawg_ck(NODE PCCRAP *dict, NODE PCCRAP *edge, char *word)
#else
dawg_ck(dict, edge, word)
NODE PCCRAP *dict;
NODE PCCRAP *edge;
char *word;
#endif
{
  if (edge == dict) return(0!=0);
  for (;;) {
    if (*word == (((*edge >> V_LETTER) & M_LETTER))) {
      if (*++word == '\0') {
        return((*edge & M_END_OF_WORD) != 0);
      } else {
        if ((edge = dict+(*edge & M_NODE_POINTER)) == dict) break;
        continue;
      }
    }
    if (((*edge++) & M_END_OF_NODE) != 0) break;
  }
  return(0!=0);
}

int
#ifdef PROTOTYPES
dawg_check(NODE PCCRAP *dict, char *word)
#else
dawg_check(dict, word)
NODE PCCRAP *dict;
char *word;
#endif
{
  return(dawg_ck(dict, dict+ROOT_NODE, word));
}

static int
#ifdef PROTOTYPES
pack_ck(NODE PCCRAP *ptrie, INDEX p, char *s)
#else
pack_ck(ptrie, p, s)
NODE PCCRAP *ptrie;
INDEX p;
char *s;
#endif
{
/* Note: node is passed in as a parameter so that it is in a register -
   otherwise it would take an extra instruction every time it was accessed.
   We trust the compiler to allocate register variables most efficiently --
   when people tinker, it might improve one machine but ruin another...
 */

#define next_letter(s) \
  p = ptrie[p + *s]; \
  if (((p >> V_LETTER) & M_LETTER) != *s++) return(0!=0); \
  if (*s == '\0') return((p >> V_END_OF_NODE) != 0); \
  if ((p &= M_NODE_POINTER) == 0) return(0!=0)

  /* Depending on whether your machine has a cache or not, and whether
     it optimises pipeline fetches, you should decrease or increase the
     number of macro instantiations in the loop below appropriately */

  for (;;) {
    next_letter(s);    next_letter(s);    next_letter(s);    next_letter(s);
  }
}

int
#ifdef PROTOTYPES
pack_check(NODE PCCRAP *ptrie, char *s)
#else
pack_check(ptrie, s)
NODE PCCRAP *ptrie;
char *s;
#endif
{
  return(pack_ck(ptrie, 1L, s));
}
SHAR_EOF
cat - << \SHAR_EOF > correct.c
/*

    File:    correct.c
    Author:  Graham Toal
    Purpose: Correct a word using Soudex and similcmp on DAWG.
    Functions exported:  dawg_correct
    Internal functions:  reccmp, walk_rest_dawg, walk_dawg

Description:
   Reduce word to rough phonetic representation.
   Reduce all words in dawg to same form.  Compare.  This
   results in a large list of plausible (HA!) candidates.

   Then apply 'similcmp' which returns a distance metric representing
   closeness of two words.  Sort by this metric, and return N best
   matches.  If one stands out above others, return it alone.

   I'm working on a *much better* algorithm, but this will do
   for prototyping the interface.

   Final version of this will have callbacks to handle corrected
   words as they are produced.

*/

#include "soundex.c"
#include "similcmp.c"

typedef struct {
  char *possible;
  int score;
} rec;


/* Numbers of parameters exploded during process of removing global state :( */

static void
#ifdef PROTOTYPES
walk_rest_dawg(NODE PCCRAP *dawg, INDEX node, int len,
  char *word, char *original, char *target,
  rec *ans, int *nextfreep)
#else
walk_rest_dawg(dawg, node, len, word, original, target, ans, nextfreep)
  NODE PCCRAP *dawg;
  INDEX node;
  int len;
  char *word;
  char *original;
  char *target;
  rec *ans;
  int *nextfreep;
#endif
#define nextfree (*nextfreep)
{
  NODE PCCRAP *edge;
  long c;

    for (edge = (NODE PCCRAP *)&dawg[node]; ;edge++) {
    c = *edge;            /* Avoid MS C compiler bug :-( */
    c = c >> V_LETTER;
    c = c & M_LETTER;
    word[len] = (char)c;
    if ((*edge & M_END_OF_WORD) != 0) {
      word[len+1] = 0;
      if (strcmp(soundex(word), target) == 0) {
        ans[nextfree].possible = (char *)Malloc(len+1, 1);
        if (ans[nextfree].possible == NULL) {
          fprintf(stderr,"I\'ve no idea - it could be anything...\n");
          exit(0);
        }
        strcpy(ans[nextfree].possible, word);
        ans[nextfree].score = simil(word, original);
        if (nextfree < 511) nextfree++;
      }
    }
    c = *edge & M_NODE_POINTER;
    if (c != 0) {
      walk_rest_dawg(dawg, c, len+1, word, original, target, ans, nextfreep);
    }
    if ((*edge & M_END_OF_NODE) != 0) break; /* End of node */
  }
#undef nextfree
}


static void
#ifdef PROTOTYPES
walk_dawg(
  NODE PCCRAP *dawg, INDEX node, int len,
  char *original, char *target,
  char targets, char *word,
  rec *ans, int *nextfreep)
#else
walk_dawg(dawg, node, len, original, target, targets, word, ans, nextfreep)
  NODE PCCRAP *dawg;
  INDEX node;
  int len;
  char *original;
  char *target;
  char targets;
  char *word;
  rec *ans;
  int *nextfreep;
#endif
#define nextfree (*nextfreep)
{
  NODE PCCRAP *edge;
  long c;

  edge = (NODE PCCRAP *)&dawg[node];
  /* Only search trie starting with initial letter */
  for (;;) {
    c = *edge;            /* Avoid MS C compiler bug :-( */
    c = c >> V_LETTER;
    c = c & M_LETTER;
    word[len] = (char)c;
    if (c == targets) {
      c = *edge & M_NODE_POINTER;
      walk_rest_dawg(dawg, c, len+1, word, original, target, ans, nextfreep);
      return;
    }
    edge++;
  }
#undef nextfree
}

static int
#ifdef PROTOTYPES
reccmp(const void *a, const void *b)
#else
reccmp(a, b)
  void *a;
  void *b;
#endif
{
  return(((rec *)b)->score - ((rec *)a)->score);
}


int
#ifdef PROTOYPES
dawg_correct(NODE PCCRAP *dawg, char *original)
#else
dawg_correct(dawg, original)
NODE PCCRAP *dawg;
char *original;
#endif
{
char word[81];
char target[256];
static rec ans[256];
  /* static because brain-dead MSC can't hack a big stack :-( */
int targets;
int nextfree = 0;
int i, prev=0;

  targets = original[0];
  strcpy(target, soundex(original));
  walk_dawg(
    dawg, (INDEX)ROOT_NODE, 0, original, target, targets, word, ans, &nextfree);
  if (nextfree == 0) return(FALSE);
  qsort(ans, (size_t)nextfree, sizeof(rec), /*&*/reccmp);
  for (i = 0; i < nextfree; i++) {
    if (((prev*100) / (ans[i].score)) > 120) break;
    fprintf(stdout, "%s (%d)\n", ans[i].possible, ans[i].score);
    if (ans[i].score >= 100) break;
    if (i == 5) break;
    prev = ans[i].score;
  }
  return(TRUE);
}
SHAR_EOF
cat - << \SHAR_EOF > dyntrie.c
/* The #define is to repair mangling by BITNET mailers */
#define NOT ~
              /* This should be Tilde (twiddle) -- C unary Not   */
/*

    File:    dyntrie.c
    Author:  Graham Toal
    Purpose: Check a word using DAWG or TRIE.
    Functions exported:  trie_new, trie_add, trie_dump
    Internal functions:  insert_simple recurse_add

Description:

   Builds DAWG-compatible trie in store, incrementally; words do not
   have to be presented in order.  (Note it is NOT a 'packed-trie';
   in our scheme it is closer to a dawg - but without the tail-compression
   precomputed dawgs have).

   Any tries built using this should be used by check_dawg, print_dawg etc.

   Has rather nice property of only claiming enough memory for the
   job; dynamically moves the data structure when it fills!

*/

  /* To avoid global state, I'm putting these useful bits of info
     in the two words before the dawg itself; I *HOPE* that all systems
     allow negative indexing off arrays; I'm not at all sure they do
     though.  However since I moved the base of the dict up by adding
     2 to it, I *should* be able to get the old address by by subtracting
     2 from it, so I'm using the first pair of macros below, not the
     more intuitive second pair.
       I should really do this for ordinary dawgs & tries too.  Indeed
     I should *really* implement the dictlib.h interface, but thats a
     month or two off.  Also I'ld like some feedback first.
   */

#define EXTRAS 2
#define MAX_ENTRIES(dawg) (*(dawg-2))          /* Safe way of aliasing */
#define NEXT_FREE(dawg) (*(dawg-1))


  /* #define MAX_ENTRIES(dawg) dawg[-2] */     /* 'Obvious' alias      */
  /* #define NEXT_FREE(dawg) dawg[-1] */       /* may be buggy         */

#define EMPTY 0
#define INIT_MAX_ENTRIES 512

/* Slop is so that we don't need to be continually checking nextfree
   as being close to max_entries */
#define SLOP 12

/* A debugging macro which rangechecks arrays -- happens when
   utils.c is included with RCHECK defined. Otherwise no overhead. */
#define DICT(idx) dict[RANGECHECK(idx, MAX_ENTRIES(dict))]


  /*
      This quick hack job has special-case code in each function
      for the root node-set.  Its rather tacky; I could clean it
      up and make the root node like all other nodes, but why
      bother; this is only test code anyway...
   */



static INDEX
#ifdef PROTOTYPES
insert_simple(NODE PCCRAP **dictp, char *word)
#else
insert_simple(dictp, word)
NODE PCCRAP **dictp;
char *word;
#endif
#define dict (*dictp)
{
NODE bugfix;
INDEX i; NODE n; int c;

  if (NEXT_FREE(dict) >= MAX_ENTRIES(dict)-SLOP) {
    NODE PCCRAP *moved;
    moved = MALLOC((SIZE_T)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE));
    if (moved != NULL) {
      moved += EXTRAS;
      MAX_ENTRIES(moved) = MAX_ENTRIES(dict);
      NEXT_FREE(moved) = NEXT_FREE(dict);
      /* Should use realloc but appears to be buggy :-( */
      for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) {
        moved[i] = EMPTY;
      }
      for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i);
      dict -= EXTRAS;
      FREE(dict);
      dict = moved; MAX_ENTRIES(dict) *= 2;
    } else {
      fprintf(stderr, "Can\'t add any more words again\n");
    }
  }

  c = (*word++)&255;
  if (c == '\0') return(TERMINAL_NODE);
  i = NEXT_FREE(dict)++;
  bugfix = insert_simple(&dict, word);
  n = ((NODE)c << (NODE)V_LETTER) | M_END_OF_NODE | bugfix;
  DICT(i) = n; if (*word == '\0') DICT(i) |= M_END_OF_WORD;
  return(i);
#undef dict
}

static void
#ifdef PROTOTYPES
recurse_add(NODE PCCRAP **dictp, INDEX base, char *word)
#else
recurse_add(dictp, base, word)
NODE PCCRAP **dictp;
INDEX base;
char *word;
#endif
#define dict (*dictp)
{
NODE bugfix;
INDEX i = base, newbase;
NODE unpicked[MAX_CHARS], n;
int c, ch;
int gap, nextslot = 0, j;
  /* First see if first char is already in trie */
  ch = (*word++)&255;
  for (;;) {
    c = (int)(((dict[i] >> V_LETTER) & M_LETTER)&255);
    if (ch == c) {
      newbase = dict[i]&M_NODE_POINTER;
      if (newbase == 0) {
        /* have abc (this is c), adding (abc)defg */
        dict[i] &= (NOT M_NODE_POINTER);
        bugfix = insert_simple(&dict, word);
        dict[i] |= bugfix;
      } else {
        if (*word == '\0') {
          dict[i] |= M_END_OF_WORD;
        } else {
          recurse_add(dictp, newbase, word);
        }
      }
      return;
    }
    if ((dict[i]&M_END_OF_NODE) != 0) break;
    i++;
  }

  /* unpick base */
  for (nextslot = 0; nextslot < MAX_CHARS; nextslot++) {
    unpicked[nextslot] = EMPTY;
  }
  nextslot = 0;

  i = base; j = 0;
  for (;;) {
    if ((j == 0) && (((dict[i] >> V_LETTER) & M_LETTER) > ch)) {
      j = 1; newbase = nextslot++;
    }
    n = dict[i]; dict[i] = EMPTY; i++;
    unpicked[nextslot++] = n & (NOT M_END_OF_NODE);
    if ((n & M_END_OF_NODE) != 0) break;
  }
  if (j == 0) newbase = nextslot++; /* Was it last alphabetically? */
  /* add this char to the node */
  if (*word == '\0') {
    unpicked[newbase] =
      ((NODE)ch << (NODE)V_LETTER) | TERMINAL_NODE | M_END_OF_WORD;
  } else {
    /* and insert the rest of the
       string with insert_simple */
    bugfix = insert_simple(&dict, word);
    unpicked[newbase] =
      ((NODE)ch << (NODE)V_LETTER) | bugfix;
      /* The insert_simple call has to come after above loop, because
         of freeing slots... */
  }
  unpicked[nextslot-1] |= M_END_OF_NODE;

  /* scan for suitably-sized gap */
  /* MAX_CHARS+1 should be first 'real' node base (0, 1..MAX_CHARS reserved) */

  /*
     The particular application this is wanted for doesn't have a large
     number of words to be added dynamically, so the BFI code below which
     finds free holes in the trie space works fine.  However if some bright
     spark cares to keep a freelist *in the holes* then this program
     effectively implements a linear-time sorting algorithm.

     (I know it's not *really*, but think about it before jumping
     down my throat; the N^2 case is very statistically unlikely)
   */

  for (i = (INDEX)MAX_CHARS+1; i < MAX_ENTRIES(dict)-nextslot-SLOP; i++) {
    /*
        Even without the freelist, the sneaky hack in pack.c for
        noting earliest bunch of free holes might well make a
        significant difference... (It was a lowest_search_start
        variable and used a bit of hysteresis)
     */
    gap = TRUE;
    for (j = 0; j < nextslot; j++) {
      if (dict[i+j] != EMPTY) {
        gap = FALSE;
        break;
      }
    }
    if (gap) break;
  }
  if (!gap) {
    NODE PCCRAP *moved;
    moved = MALLOC((SIZE_T)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE));
    if (moved != NULL) {
      moved += EXTRAS;
      MAX_ENTRIES(moved) = MAX_ENTRIES(dict);
      NEXT_FREE(moved) = NEXT_FREE(dict);
      /* Should use realloc but appears to be buggy :-( */
      for (i = MAX_ENTRIES(dict);
           i < MAX_ENTRIES(dict)*2; i++) moved[i] = EMPTY;
      for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i);
      dict -= EXTRAS;
      FREE(dict);
      dict = moved; /* If your compiler has aliasing bugs they may hit here. */
      MAX_ENTRIES(dict) *= 2;
      /* scan for suitably-sized gap */
      for (i = ((MAX_ENTRIES(dict)/2)-(MAX_CHARS+1));
           i < MAX_ENTRIES(dict)-nextslot; i++) {
        gap = TRUE;
        for (j = 0; j < nextslot; j++) {
          if (dict[i+j] != EMPTY) {
            gap = FALSE;
            break;
          }
        }
        if (gap) break;
      }
    }
    if (!gap) {
      fprintf(stderr, "Can\'t add any more words\n");
      return;
    }
  }
  newbase = i;
  /* reinsert */
  for (j = 0; j < nextslot; j++) {
    dict[newbase+j] = unpicked[j];
  }
  if (newbase+nextslot-1 >= NEXT_FREE(dict)) NEXT_FREE(dict) = newbase+nextslot;
  /* update pointer to the base of this node */
  for (i = 0; i < MAX_ENTRIES(dict); i++) {
    if ((dict[i] & M_NODE_POINTER) == base) {
      dict[i] &= (NOT M_NODE_POINTER);
      dict[i] |= newbase;
      break;            /* (should only be one since trie, not dawg) */
    }
  }
#undef dict
}

void
#ifdef PROTOTYPES
trie_add(NODE PCCRAP **dictp, char *word)
#else
trie_add(dictp, word)
NODE PCCRAP **dictp;
char *word;
#endif
#define dict (*dictp)
{
INDEX next;
INDEX bugfix;
int c;
  /* Root node is pre-reserved MAX_CHARS entries */
  c = (*word++)&255; /* bugfix - remember chars can be signed :-( */
  if (DICT((INDEX)ROOT_NODE+c) == EMPTY) {
    /* I'm allowing the root node to be sparse for speed; it *should*
       be dense like any other node.  May change this later */
    if (*word == '\0') {
      DICT((INDEX)ROOT_NODE+c) =
        ((NODE)c << (NODE)V_LETTER) | M_END_OF_WORD;
    } else {
      bugfix = insert_simple(&dict, word);
      DICT((INDEX)ROOT_NODE+c) =
        ((NODE)c << (NODE)V_LETTER) | bugfix;
    }
  } else {
    if (*word == '\0') {
      DICT((INDEX)ROOT_NODE+c) |= M_END_OF_WORD;
    } else {
      next = DICT((INDEX)ROOT_NODE+c) & M_NODE_POINTER;
      if (next == 0) {
        /* have 'a', adding 'abcd' */
        /* (Should really check the letter for safety...) */
        bugfix = insert_simple(&dict, word);
        DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER)
          | bugfix | M_END_OF_WORD;
      } else {
        recurse_add(dictp, next, word);
      }
    }
  }
#undef dict
}

int
#ifdef PROTOTYPES
trie_new(NODE PCCRAP **dawgp)
#else
trie_new(dawgp)
NODE PCCRAP **dawgp;
#endif
#define dawg (*dawgp)
{
INDEX i;

  dawg = MALLOC(INIT_MAX_ENTRIES+EXTRAS, sizeof(NODE));
  dawg += EXTRAS;

  MAX_ENTRIES(dawg) = INIT_MAX_ENTRIES; NEXT_FREE(dawg) = MAX_CHARS+1;

  dawg[0] = 0;
  for (i = 1; i <= MAX_CHARS; i++) dawg[i] = 0;
  dawg[MAX_CHARS] |= M_END_OF_NODE;
  /* 0 base, MAX_CHARS chars */

  return(TRUE);
#undef dawg
}

int
#ifdef PROTOTYPES
trie_dump(NODE PCCRAP *dawg, char *filename)
#else
trie_dump(dawg, filename)
NODE PCCRAP *dawg;
char *filename;
#endif
{
INDEX cnt, max;
FILE *dumper;
  max = NEXT_FREE(dawg)*sizeof(NODE);
    /* I *knew* the next_free variable would come in handy :-) */
  dumper = fopen(filename, WRITE_BIN);
  if (dumper == NULL) {
    fprintf(stderr, "Failed to dump trie on file \'%s\'\n", filename);
    return(FALSE);
  }
  putword(max, dumper);
  if ((cnt = putwords(dawg, max, dumper)) != max) {
    fprintf(stderr, "Failed to dump: %ld bytes written instead of %ld\n",
      cnt, max);
    return(FALSE);
  }
  fclose(dumper);
  return(TRUE);
}

#undef MAX_ENTRIES
#undef NEXT_FREE
#undef DICT
#undef SLOP
SHAR_EOF
cat - << \SHAR_EOF > init.c
/*

    File:    init.c
    Authors: Richard Hooker, Graham Toal
    Purpose: Loading of dictionary for spelling checker.
    Functions exported:  dawg_init, pack_init
    Internal functions:  spell_init

Description:

The address of a dictionary (either a PACKed dict or a DAWG) is set up by
this procedure.  This gives us the option of connecting the dict in read-only
(or copy-on-write) virtual memory.  On non-VM machines, we simply allocate a
large buffer into which the relevant part of the dictionary is loaded.

The magic number is used to check the dict type, *and* the machine byte-sex.
If the sex is reversed, even a VM system has to load the data into writable
memory (so that it can reverse it).

*/

/*######################### INTERNAL FUNCTIONS #########################*/


static int
#ifdef PROTOTYPES
spell_init(char *filename, NODE PCCRAP **dictp,
  char *DEFAULT_DICT, long magic_number, INDEX *nedges)
#else
spell_init(filename, dictp, DEFAULT_DICT, magic_number, nedges)
char *filename;
NODE PCCRAP **dictp;
char *DEFAULT_DICT;
long magic_number;
INDEX *nedges;
#endif
#define dict (*dictp)
{
FILE *fp; INDEX count;

  /*  init_dict("") gets default */
  if (*filename == '\0') filename = DEFAULT_DICT;

  /* Open the file and find out the size of the dict -- which
     is stored in the first word.  Later I'll change the dict format
     to have a header, and the header will have to be skipped by
     this module. */

  if ((fp = fopen(filename, "rb")) == NULL) {
    fprintf (stderr, "Can\'t open file \"%s\"\n", filename);
    return(FALSE);
  }
  *nedges = getword(fp);
#ifdef DEBUG
fprintf(stderr, "dawg contains %8lx edges\n", *nedges);
#endif
  /* Allocate precisely enough memory for all edges + 0 at root node. */
  if ((dict = MALLOC((SIZE_T)((*nedges)+1), sizeof(NODE PCCRAP *))) == 0) {
    fprintf (stderr, "Can\'t allocate space for dictionary\n");
    return(FALSE);
  }

  dict[0] = 0; /* Root node set to 0; terminal nodes point to 0. */

  /* Load in the dictionary.  Routine 'getwords' should be efficient */
  count = getwords(&dict[1], (long)(4*(*nedges)), fp);
  if (count != 4*(*nedges)) {
    fprintf(stderr,
      "Failed to read dictionary %s - wanted %ld bytes, got %ld\n",
      filename, 4*(*nedges), count);
    return(FALSE);
  }
  fclose(fp);

  return(TRUE);
#undef dict
}

/*####################### EXPORTED FUNCTIONS #########################*/

int
#ifdef PROTOTYPES
dawg_init(char *filename, NODE PCCRAP **dawgp, INDEX *nedges)
#else
dawg_init(filename, dawgp, nedges)
char *filename;
NODE PCCRAP **dawgp;
INDEX *nedges;
#endif
{
  return(spell_init(filename, dawgp, DEFAULT_DAWG, DAWG_MAGIC, nedges));
}

int
#ifdef PROTOTYPES
pack_init(char *filename, NODE PCCRAP **packp, INDEX *nedges)
#else
pack_init(filename, packp, nedges)
char *filename;
NODE PCCRAP **packp;
INDEX *nedges;
#endif
{
  return(spell_init(filename, packp, DEFAULT_PACK, PACK_MAGIC, nedges));
}

SHAR_EOF
cat - << \SHAR_EOF > locate.c
/*

    File:    locate.c
    Author:  Graham Toal
    Purpose: Find all words beginning with particular prefix.
    Functions exported:  locate_prefix

Description:
   Searches as in spell-check for prefix in dict, but doesn't
   fail if word doesn't terminate at that point.  It returns
   an index into the dict which can be used with print_dawg_prefix
   to display all the words found.  However it is more useful
   than that; text-analysis programs can check that a word matches
   "root*", for instance, when doing stylistic analysis etc.

*/

INDEX
#ifdef PROTOTYPES
dawg_locate_prefix(NODE PCCRAP *dawg, char *word, INDEX edge)
#else
dawg_locate_prefix(dawg, word, edge)
NODE PCCRAP *dawg;
char *word;
INDEX edge;
#endif
{
  for (;;) {
    if (*word == (((dawg[edge] >> V_LETTER) & M_LETTER))) {
      if (*++word == '\0') {
        return(dawg[edge]&M_NODE_POINTER);
      } else {
        if ((edge = (dawg[edge] & M_NODE_POINTER)) == ROOT_NODE) break;
        continue;
      }
    }
    if (((dawg[edge++]) & M_END_OF_NODE) != 0) break;
  }
  /* What to do when none found? -- fail-safe, or some error code...? */
  return(ROOT_NODE);
}
SHAR_EOF
cat - << \SHAR_EOF > print.c
/*

    File:    print.c
    Author:  Graham Toal
    Purpose: Print a packed trie to stderr.
    Functions exported:  dawg_print, pack_print, dawg_print_prefix
    Internal functions:  pack_pr dawg_pr dawg_pr_prefix

Description:
  Pre-order traverse of packed TRIE or DAWG.  Will be modified
  soon to take output file as parameter.  Then sometime after that
  to callback with each string at it is generated.  Meanwhile,
  people requiring custom versions of dawg-walking stuff might
  as well just copy this code and edit it to do what they want.

  The special version print_dawg_prefix takes a NODE from somewhere
  in a dict as a parameter, and prints out only those words which
  contain that node.  You have to locate the node seperately with
  'locate_prefix', and pass the prefix string in so it can be printed.

*/

static void
#ifdef PROTOTYPES
dawg_pr(NODE PCCRAP *dawg, INDEX node, int len)
#else
dawg_pr(dawg, node, len)
NODE PCCRAP *dawg;
INDEX node;
int len;
#endif
{
  static char word[MAX_WORD_LEN];
  NODE PCCRAP *edge;

  for (edge = (NODE PCCRAP *)&dawg[node]; ; edge++) {
  long c;
    c = *edge;           /* Don't rewrite this - its avoiding a MSC bug */
    c = c >> V_LETTER;
    c = c & M_LETTER;
    word[len] = (char)c;
    if ((*edge & M_END_OF_WORD) != 0) {
      word[len+1] = '\0';
      fprintf(stdout, "%s\n", word);
    }
    c = *edge & M_NODE_POINTER;
    if ((*edge & M_NODE_POINTER) != 0)
      dawg_pr (dawg, c, len + 1);
    if ((*edge & M_END_OF_NODE) != 0) break; /* End of node */
  }
}

void
#ifdef PROTOTYPES
dawg_print(NODE PCCRAP *dawg, INDEX node)
#else
dawg_print(dawg, node)
NODE PCCRAP *dawg;
INDEX node;
#endif
{
  dawg_pr(dawg, node, 0);
}

static void
#ifdef PROTOTYPES
pack_pr(NODE PCCRAP *ptrie, INDEX i, int pos)
#else
pack_pr(ptrie, i, pos)
NODE PCCRAP *ptrie;
INDEX i;
int pos;
#endif
{
static char s[MAX_WORD_LEN+1];
int b;
INDEX p;
  for (b = BASE_CHAR; b < BASE_CHAR+MAX_CHARS; b++) {
    if (b != 0) {
      p = ptrie[i+b-BASE_CHAR];
      if (((p>>V_LETTER)&M_LETTER) == b) {
      	s[pos] = b; s[pos+1] = '\0';
        if ((p & M_END_OF_WORD) != 0) fprintf(stderr, "%s\n", s);
        if ((p &= M_NODE_POINTER) != 0) {
          pack_pr(ptrie, p, pos+1);
        }
      }
    }
  }
}


void
#ifdef PROTOTYPES
pack_print(NODE PCCRAP *ptrie, INDEX node)
#else
pack_print(ptrie, node)
NODE PCCRAP *ptrie;
INDEX node;
#endif
{
  pack_pr(ptrie, node, 0);
}


static void
#ifdef PROTOTYPES
dawg_pr_prefix(NODE PCCRAP *dawg, char *prefix, INDEX node, int len)
#else
dawg_pr_prefix(dawg, prefix, node, len)
NODE PCCRAP *dawg;
char *prefix;
INDEX node;
int len;
#endif
{
  NODE PCCRAP *edge;
  static char word[MAX_WORD_LEN];
  long c;

  for (edge = (NODE PCCRAP *)&dawg[node]; ; edge++) {
    /* Don't 'fix' - compiler bugaround for microsoft :-( */
    c = *edge; c = c >> V_LETTER; c = c & M_LETTER;
    word[len] = (char)c;
    if ((*edge & M_END_OF_WORD) != 0) {
      word[len+1] = 0;
      fprintf(stdout, "%s%s\n", prefix, word);
    }
    c = *edge & M_NODE_POINTER;
    if (c != 0) dawg_pr_prefix(dawg, prefix, c, len + 1);
    /* End of node - check repair later - I accidentally nobbled it */
    if ((*edge & M_END_OF_NODE) != 0) break;
  }
}

void
#ifdef PROTOTYPES
dawg_print_prefix(NODE PCCRAP *dawg, char *prefix, INDEX node)
#else
dawg_print_prefix(dawg, prefix, node)
NODE PCCRAP *dawg;
char *prefix;
INDEX node;
#endif
{
  dawg_pr_prefix(dawg, prefix, node, 0);
}
SHAR_EOF
cat - << \SHAR_EOF > similcmp.c
#ifndef similcmp_c
#define similcmp_c 1

#ifdef LIB_STRINGS
#include <strings.h>
#else
#include <string.h>
#endif

/*

    File:    similcmp.c
    Authors: John Ratcliffe, and an anonymous coder.
    Purpose: Better approximate string equality test.
    Functions exported:  simil
    Internal functions:  GCsubstr

Description:
  See below.  I'll replace this one eventually with my own
  algorithm which does sophisticated compound-grapheme analysis
  and returns a degree of phonetic similarity and a probability
  that the transformations made were valid.


  'ghoti==fish' (Match = 90%, Prob = 1%) ;-)
  lauGH = f       match 100% prob 30%
  wOmen = i       match  90% prob 10%
  staTIon = sh    match 100% prob 5%

*/

/* Ratcliff/Obershelp pattern matching
 *
 *      Approximate word matching: takes two words and returns a
 *      similarity score based on co-occurrence of subpatterns.
 *
 *      This code appeared in a letter to the editor in DDJ, 11/88.
 *      The original article on the pattern matching, "Pattern Matching
 *      by Gestalt" by John Ratcliff, appeared in the July 1988
 *      issue (#181) but the algorithm was presented in assembly.  
 *
 *      The 11/88 issue also contained another verison in C which was
 *      a more faithful translation of the original; it has the
 *      advantage of not being recursive.
 *
 *      The algorithm seems to work nicely for a variety of test cases.
 *      The main drawback of the algorithm is the cost of the pairwise
 *      comparisons.  It is significantly more expensive than stemming,
 *      soundex, and the like.  Might consider using this as a second
 *      phase...
 */

int
#ifdef PROTOTYPES
GCsubstr(char *st1, char *end1, char *st2, char *end2)
#else
GCsubstr(st1, end1, st2, end2)
  char *st1;
  char *end1;
  char *st2;
  char *end2;
#endif
{
register char *a1, *a2;
char *b1, *s1, *b2, *s2;
short max, i;

  if (end1 <= st1 || end2 <= st2) return(0);
  if (end1 == st1 + 1 && end2 == st2 + 1) return(0);
                
  max = 0; b1 = end1; b2 = end2;
        
  for (a1 = st1; a1 < b1; a1++) {
    for (a2 = st2; a2 < b2; a2++) {
      if (*a1 == *a2) {
        /* determine length of common substring */
        for (i = 1; a1[i] && (a1[i] == a2[i]); i++) /* do nothing */;
        if (i > max) {
          max = i; s1 = a1; s2 = a2;
          b1 = end1 - max; b2 = end2 - max;
        }
      }
    }
  }
  if (!max) return(0);
  max += GCsubstr(s1 + max, end1, s2 + max, end2);        /* rhs */
  max += GCsubstr(st1, s1, st2, s2);                      /* lhs */
  return(max);
}

int
#ifdef PROTOTYPES
simil(char *s1, char *s2)
#else
simil(s1, s2)
 char *s1;
 char *s2;
#endif
{
int l1 = strlen(s1), l2 = strlen(s2);
 if (strcmp(s1, s2) == 0) return(100); /* exact match end-case */
 return(200 * GCsubstr(s1, s1 + l1, s2, s2 + l2) / (l1 + l2));
}
#endif /* similcmp_c */
SHAR_EOF
cat - << \SHAR_EOF > soundex.c
#ifndef soundex_c
#define soundex_c 1

#ifdef LIB_STRINGS
#include <strings.h>
#else
#include <string.h>
#endif


/* isalpha & toupper; define your own if you don't have ctype.h */
#include <ctype.h>

/*

    File:    soundex.c
    Authors: Jonathan Leffler
    Purpose: Approximate string equality test.
    Functions exported:  soundex
    Internal functions:  nsoundex

Description:

 There are umpteen soundexes around. At least this one is commented...
 (Actually I'd prefer one which understood ph/f and Mac/Mc as special
  cases; short of a proper phonetic alg such as I'm currently developing)
 See below for description:

*/

/*
**      SOUNDEX CODING
**
**      Rules:
**      1.      Retain the first letter; ignore non-alphabetic characters.
**      2.      Replace second and subsequent characters by a group code.
**              Group   Letters
**              1               BFPV
**              2               CGJKSXZ
**              3               DT
**              4               L
**              5               MN
**              6               R
**      3.      Do not repeat digits
**      4.      Truncate or ser-pad to 4-character result.
**
**      Originally formatted with tabstops set at 4 spaces -- you were warned!
**
**      Code by: Jonathan Leffler (john at sphinx.co.uk)
**      This code is shareware -- I wrote it; you can have it for free
**      if you supply it to anyone else who wants it for free.
**
**      BUGS: Assumes 7-ASCII; fails on ISO Latin1
*/

static char lookup[] = {
        '0',    /* A */        '1',    /* B */        '2',    /* C */
        '3',    /* D */        '0',    /* E */        '1',    /* F */
        '2',    /* G */        '0',    /* H */        '0',    /* I */
        '2',    /* J */        '2',    /* K */        '4',    /* L */
        '5',    /* M */        '5',    /* N */        '0',    /* O */
        '1',    /* P */        '0',    /* Q */        '6',    /* R */
        '2',    /* S */        '3',    /* T */        '0',    /* U */
        '1',    /* V */        '0',    /* W */        '2',    /* X */
        '0',    /* Y */        '2'     /* Z */
};

/* Soundex for arbitrary number of characters of information */
#define MAX_SOUNDEX_LEN 10

static char *
#ifdef PROTOTYPES
nsoundex(char *str, int n)
#else
nsoundex(str, n)
char *str;                   /* In: String to be converted */
int n;                       /* In: Number of characters in result string */
#endif
{
  static char buff[MAX_SOUNDEX_LEN];
  register char *s;
  register char *t;
  char c;
  char l;

  if (n <= 0) n = 4;  /* Default */
  if (n > sizeof(buff) - 1)  n = sizeof(buff) - 1;
  t = &buff[0];

  for (s = str; ((c = *s) != '\0') && t < &buff[n]; s++) {
    if (!isalpha(c)) continue;
    c = toupper(c);
    if (t == &buff[0]) {
      l = *t++ = c;
      continue;
    }
    c = lookup[c-'A'];  /* Fails on any alpha not in 'a'..'z' */
    if (c != '0' && c != l) l = *t++ = c;
  }
  while (t < &buff[n]) *t++ = '0'; *t = '\0';
  return(&buff[0]);
}

/* Normal external interface */
char *
#ifdef PROTOTYPES
soundex(char *str)
#else
soundex(str)
char *str;
#endif
{
  return(nsoundex(str, 4));  /* vary this? - try 5 or 6? */
}
#endif /* soundex_c */

SHAR_EOF
cat - << \SHAR_EOF > utils.c
/*

    File:    utils.c
    Author:  Graham Toal
    Purpose: Portability library

Description:

   Most of what differs between operating systems is handled here.
   This module is *incredibly* hacky -- I've just been interested
   in finding the problems so far, not in making the solutions neat.

   The final release of this suite will concentrate on cleaning up
   this file!
*/



/* PENDING: Generic load dict; meanwhile should also put efficient
   msdos file loading into getwords().  See spelldawg for best coding. */

#define SIZE_T size_t
/* MSDos redefines this for huge allocation */

#ifdef SYS_RISCOS
#define DEFAULT_DAWG "run:dict-dwg"
#define DEFAULT_PACK "run:dict-pck"
#else
#ifdef SYS_EMAS
#define DEFAULT_DAWG "dict#dwg"
#define DEFAULT_PACK "dict#pck"
#else
/* Unix, MessDross, etc... */
#define DEFAULT_DAWG "dict.dwg"
#define DEFAULT_PACK "dict.pck"
#endif
#endif


/* Undo any #define's here if they are inappropriate for some system */

#define EXT_CHAR '.'

#define READ_TEXT "r"
#define WRITE_BIN "wb"
#define READ_BIN "rb"


/* system configuration */

#ifdef KNOWS_VOID
#define VOID void
#else
/* As in... void fred(VOID) */
#define void int
#define VOID
#endif

#ifdef SYS_MSDOS
#ifdef COMPILER_ZORTECH
int _stack = 0x3000;
#define PCCRAP far
#else
#ifdef COMPILER_TURBOC
#define PCCRAP far
#else
#define PCCRAP huge
#endif
#endif
#else
#define PCCRAP
#endif

/* Hmph... I don't really want to do realloc too.  Just as well that
   one implmentation is buggy; saves me having to work out what to do :-) */


#ifdef SYS_MSDOS
/* Normally int... */
#undef SIZE_T
#define SIZE_T long
/* (Still to test AZTEC & LATTICE) */
/* Mallocs of > 64K (maybe even > 32K) have to come off the far/huge heap */
#ifdef COMPILER_ZORTECH
#include <dos.h>
#define MALLOC(x,s) (NODE PCCRAP *)farmalloc((x)*(s)) /* Zortech */
#define Malloc(x,s) malloc((x)*(s))
#define FREE(x) farfree(x)
#define Free(x) free(x)
#else /* else not microsoft */
#ifdef COMPILER_TURBOC
#include <alloc.h>
#define MALLOC(x,s) (NODE PCCRAP *)farmalloc((x)*(s))  /* Turbo */
#define Malloc(x,s) malloc((x)*(s))
#define FREE(x) farfree(x)
#define Free(x) free(x)
#else /* else not turbo */
#include <malloc.h>
#ifdef NEVER
#define MALLOC(x,s) (NODE PCCRAP *)my_halloc((long)(x),(size_t)(s))
 /* Microsoft, Aztec */
#define Malloc(x,s) my_malloc((x)*(s))
#define FREE(x) my_hfree((void huge *)(x))  /* For some reason MICROSOFT    */
#define Free(x) my_free((void *)x)          /* complains of a type mismatch */
                                         /* without these casts */
#endif
#define MALLOC(x,s) (NODE PCCRAP *)halloc((long)(x),(size_t)(s))
 /* Microsoft, Aztec */
#define Malloc(x,s) malloc((x)*(s))
#define FREE(x) hfree((void huge *)(x))  /* For some reason MICROSOFT    */
#define Free(x) free((void *)x)          /* complains of a type mismatch */
                                         /* without these casts */
#endif
#endif
#else /* else not SYS_MSDOS */
#ifdef STDLIBS
#include <stdlib.h>  /* for malloc, free & exit */
#define MALLOC(x,s) (NODE PCCRAP *)malloc(((x)*(s)))
#define Malloc(x,s) malloc((x)*(s))
#define FREE(x) free(x)
#define Free(x) free(x)
#else
#define MALLOC(x,s) (NODE PCCRAP *)malloc(((x)*(s)))
#define Malloc(x,s) malloc((x)*(s))
#define FREE(x) free(x)
#define Free(x) free(x)
#ifndef size_t       /* Doesn't work if size_t was typedef'd */
#define size_t int   /* And if an int isn't big enough we're in trouble! */
#endif
#ifdef PROTOTYPES
extern void *malloc(size_t bytes);
extern void exit(int rc);
#else
extern void *malloc();
extern void exit();
#endif
#endif /* not stdlibs */
#endif /* Not msdos */

#ifdef SYS_RISCOS
#undef EXT_CHAR
#define EXT_CHAR '-'
#endif

#ifdef SYS_EMAS
#undef EXT_CHAR
#define EXT_CHAR '#'
#endif

#ifdef SYS_MAC
#ifdef COMPILER_THINK
#undef WRITE_BIN
#undef READ_BIN
#define WRITE_BIN "w"
#define READ_BIN "r"
#endif
#endif

#ifdef MEMMODELS
#define SMALL_MEMORY 1
#endif

/*
                     Error handling

     At the moment we'll just go for OK / NOT OK.  Later we should
   fix all exit's to use a specific code symbol (eg EXIT_MALLOCFAIL)
   but this has to be done system by system.  Whenever a new one is
   added, a default should be set up (as 0?)
 */

/*#include <errno.h>*/        /* Only later when we want to be more precise! */
#define EXIT_OK       (0)     /* Generic success              */
#define EXIT_ERROR    (1)     /* Generaic failure             */

/* For each system, replace generic errors with system-dependant ones. */
#ifdef vms
/*
 * VMS-specific error status codes.  Approximate Ultrix equivalents.
 */
#include <ssdef.h>
#include <stsdef.h>
#undef  EXIT_OK
#define EXIT_OK     SS$_NORMAL     /* Generic success                  */
#undef  EXIT_ERROR
#define EXIT_ERROR  SS$_NORMAL     /* Don't want random error message! */
#endif

#define DELETE(filename)

#ifdef __STDC__
#undef DELETE
#define DELETE(filename) remove(filename)
#else
#ifdef unix
#undef DELETE
#define DELETE(filename) unlink(filename)
#endif
#endif

#ifdef NEVER

/* these macros caught the fact that on MICROSOFT, the parameters
   being passed in were ints -- and I hadn't been given a warning
   because I had explicitly cast the to size-t.  Hence why I've
   declared SIZE_T as long.  This is all a bit messy. Curse you IBM PCs
 */

void PCCRAP *my_halloc(long n,size_t s) {
char PCCRAP *p;
  p = halloc(n, s);
  fprintf(stderr, "halloc(%8lx*%d) -> %8lx\n", n, s, (long)p);
  return(p);
}

void *my_malloc(size_t s) {
char *p;
  p = malloc(s);
  fprintf(stderr, "malloc(%4x) -> %4x\n", s, (int)p);
  return(p);
}

void my_hfree(void PCCRAP *p) {
  fprintf(stderr, "hfree(%8lx)\n", (long)p);
  hfree(p);
}

void my_free(void *p) {
  fprintf(stderr, "free(%4x)\n", (int)p);
  free(p);
}
#endif


#ifdef RCHECK
#ifndef PROTOTYPES
long toosmall(idx, max, line)
long idx;
long max;
int line;
#else
long toosmall(long idx, long max, int line)
#endif
{
  if (line == 0) {
    fprintf(stderr,
      "RANGE CHECK: %ld should not be less than 0 (max %ld)\n", idx, max);
  } else {
    fprintf(stderr,
      "RANGE CHECK AT LINE %d: %ld should not be less than 0 (max %ld)\n",
      line, idx, max);
  }
  return(0L);
}
#ifndef PROTOTYPES
long toobig(idx, max, line)
long idx;
long max;
int line;
#else
long toobig(long idx, long max, int line)
#endif
{
  if (line == 0) {
    fprintf(stderr,
      "RANGE CHECK: %ld should not be larger than %ld\n", idx, max);
  } else {
    fprintf(stderr,
      "RANGE CHECK AT LINE %d: %ld should not be larger than %ld\n",
      line, idx, max);
  }
  return(max);
}

#ifdef KNOWS_LINE
#define TOOSMALL(idx, max) toosmall((long)idx, (long)max, __LINE__)
#define TOOBIG(idx, max) toobig((long)idx, (long)max, __LINE__)
#else
#define TOOSMALL(idx, max) toosmall((long)idx, (long)max, 0)
#define TOOBIG(idx, max) toobig((long)idx, (long)max, 0)
#endif

#define RANGECHECK(idx,max) \
  (idx < 0 ? (TOOSMALL(idx,max)) : (idx >= max ? (TOOBIG(idx,max)) : idx))
#else
#define RANGECHECK(idx,max) (idx)
#endif

#ifdef PROTOTYPES
long getwords(long PCCRAP *data, long count, FILE *fp)
#else
long getwords(data, count, fp)
long PCCRAP *data;
long count;
FILE *fp;
#endif
{
#ifdef SYS_MSDOS
char PCCRAP *p; int c; long byte_count;
  p = (char PCCRAP *)(&data[0]);
  /* Somewhere I have a version which fread()s into a 16K near
     buffer, then copies it out; use that technique here - *MUCH*
     faster */
  for (byte_count = 0; byte_count < count; byte_count++) {
    c = fgetc(fp);
    if (c == -1 || ferror(fp)) {
      printf ("Dictionary read error - %ld wanted - %ld got\n",
        count, byte_count)/*, exit(0)*/;
      break;
    }
    *p++ = (c & 255);
  }
  return(count);
#else
  return((long)fread(&data[0], (size_t)1, (size_t)count, fp));
#endif
}

#ifdef PROTOTYPES
long putwords(long PCCRAP *data, long count, FILE *fp)
#else
long putwords(data, count, fp)
long PCCRAP *data;
long count;
FILE *fp;
#endif
{
#ifdef SYS_MSDOS
extern int _NEAR _CDECL errno;
long i; char PCCRAP *p;
  p = (char PCCRAP *)&data[0];
  if (fp == NULL) {
    fprintf(stderr, "putwords: no file?\n");
    exit(0);
  }
  for (i = 0L; i < count; i++) {
  /* Somewhere I have a version which copies to a 16K near
     buffer, then frwrite()s it out; use that technique here - *MUCH*
     faster */
    int rc = fputc(*p++, fp);
    if (ferror(fp)) {
      fprintf(stderr, "rc = %d\n", rc);
      perror("dawg");
      fprintf (stderr, "Error writing to output file\n");
      exit(0);
    }
  }
  return(count);
#else
  return(fwrite(&data[0], (size_t)1, (size_t)count, fp));
#endif
}

static long PCCRAP *WTMP = NULL;
/* A bit hacky having this single 4 bytes in heap space, but it makes
   things a little more consistent.  it'll all go away eventually
   anyway... */

#ifdef PROTOTYPES
void putword(long w, FILE *fp)
#else
void putword(w, fp)
long w;
FILE *fp;
#endif
{
  if (WTMP == NULL) {
    WTMP = (long PCCRAP *)MALLOC(1,sizeof(long));
  }
  *WTMP = w;
  if (putwords(WTMP, (long)sizeof(long), fp) != sizeof(long)) {
    /* (using putwords avoids confusion over bytesex) */
    fprintf(stderr, "Data write error - putword\n");
  }
}

#ifdef PROTYPES
long getword(FILE *fp)
#else
long getword(fp)
FILE *fp;
#endif
{
  if (WTMP == NULL) {
    WTMP = (long PCCRAP *)MALLOC(1,sizeof(long));
  }
  if (getwords(WTMP, (long)sizeof(long), fp) != sizeof(long)) {
    fprintf(stderr, "Data read error - getword\n");
  }
  return(*WTMP);
}
SHAR_EOF



More information about the Alt.sources mailing list