Sets in C (like Pascal) **source included**

Graham Toal gtoal at tharr.UUCP
Fri Jun 29 15:55:16 AEST 1990


Archive-name: setdemo.c
/*
    File:    setdemo.c
    Author:  Graham Toal <gtoal%uk.ac.ed at nsfnet-relay.ac.uk>
    Created: 27/06/90

    Bryon Lape <wozniak at utkux1.utk.edu> recently asked for C code to implement
    pascal-like sets.  By coincidence I had written this just a few days
    earlier, as a proof-of-concept test program.  Slight lack of comments
    but should be fathomable.
    
    The code in fact implements something stronger than pascal sets; it allows
    arbitrarily sized sets, and it lets you construct expressions out of
    those sets *without* using any scratch workspace to evaluate intermediates.

    (The real use of the code is in free text indexing, where the sets are
     lists of addresses of words in a large file, and we are performing
     queries on the file)
     
    (c) Graham Toal 1990.
    I retain copyright on this only to stop anyone else copyrighting it and
    restricting my use of my own code! Otherwise it is entirely in the
    public domain.  Use it as you wish, commercially or otherwise.


    Note: the sets are stored as lists of elements; if the input data
    is always unique and sorted, the output data will be too; a simple
    proof by induction can show this.  So when constructing sets as
    input to this code, remember to validate them.
    
    For example, list1 = [ 1 2 38 3456 100006 ] is ok, but
                 list2 = [ 1 38 2 3456 100006 ] is bad (order), and
                 list3 = [ 1 1 1 2 38 3456 100006 ] is also bad (not unique).

 */

/*#define DEBUG*/
#include <stdio.h>

#define TRUE (0==0)
#define FALSE (0!=0)

#define DATA int

typedef struct PIPEstruct PIPE;

typedef int (*UNARYFN)(PIPE *, DATA *);
typedef int (*BINFN)(PIPE *, PIPE *, DATA *);
/* easily extended to ternary functions for things such as range-checking */

#define MONO 1
#define BINARY 2

#define PENDING (EOF+1)
#define UNDEF (PENDING+1)

struct PIPEstruct {
  int fntype;
  UNARYFN monofun;
  BINFN   bifun;
  struct PIPEstruct *op1, *op2;
  int state; /* EOF, PENDING, UNDEF */
  DATA lookahead;
  int *array; /* These two represent data stream for now */
  int index;
  /* DATA offset1, offset2; */ /* These two would be used in real life
                                  to point into a file of word indexes */
};

void unread(PIPE *p, DATA d) {
  p->state = PENDING; p->lookahead = d;
}

int and(PIPE *left, PIPE *right, DATA *value) {
DATA d1, d2;
  /* Perform an AND operation on two streams */
  for (;;) {
    if (!eval(left, &d1) || !eval(right, &d2)) return(FALSE);
    while ((d1 < d2) && eval(left, &d1));
    while ((d2 < d1) && eval(right, &d2));
    if (d1 == d2) {*value = d1; return(TRUE);}
  }
}

int or(PIPE *left, PIPE *right, DATA *value) {
DATA d1, d2;
  /* Perform an OR operation on two streams */
  if (!eval(left, &d1)) return(eval(right, value)); *value = d1;
  if (eval(right, &d2)) {
    if (d1 < d2) unread(right, d2);
    else if (d2 < d1) {unread(left, d1); *value = d2;}
  }
  return(TRUE);
}

int eval(PIPE *expr, DATA *value) {
  if (expr->state == PENDING) {
    expr->state = UNDEF; *value = expr->lookahead;
    return(TRUE);
  }
  switch (expr->fntype) {
  case MONO:   return(expr->monofun(expr->op1, value));
  case BINARY: return(expr->bifun(expr->op1, expr->op2, value));
  }
}

int data(PIPE *d, DATA *value) {
  if (d->state == PENDING) {
    d->state = UNDEF; *value = d->lookahead;
    return(TRUE);
  }
  if (d->array[d->index] == -1) d->state = EOF; /* Hack for demo --
                               real code would compare offset1 to offset2 */
  if (d->state == EOF) return(FALSE);
  *value = d->array[d->index++]; d->state = UNDEF;
  return(TRUE);
}

PIPE *make_data_stream(int O_List[]) {
  PIPE *tmp;
  tmp = (PIPE *)malloc(sizeof(PIPE));
  tmp->fntype = MONO;
  tmp->monofun = data;
  tmp->array = O_List;
  tmp->index = 0;
  tmp->op1 = tmp; /* Actually op1 should be a structure containing
                     array and index from above; this is just laziness */
  tmp->state = UNDEF;
  return(tmp);
}

PIPE *make_pipe_binop(int (*fn)(PIPE *left, PIPE *right, DATA *value),
                      PIPE *arg1, PIPE *arg2) {
  PIPE *tmp;
  tmp = (PIPE *)malloc(sizeof(PIPE));
  tmp->bifun = fn;
  tmp->fntype = BINARY;
  tmp->op1 = arg1;
  tmp->op2 = arg2;
  tmp->state = UNDEF;
  return(tmp);
}

int main(int argc, char **argv) {
  /* Simulated Search Engine -- effectively performing set operations
     on arbitrarily large sets. */
  static int O_List1[16] = {
    1, 3, 5, 6,
    7, 8, 10, 12,
    14, 16, 17, 18,
    20, 21, 22, -1};
  static int O_List2[7] = {1, 2, 4, 5, 6, 7, -1};
  static int O_List3[9] = {14, 15, 16, 17, 18, 19, 20, 21, -1};
  static int O_List4[4] = {12, 13, 14, -1};

  /*

    The problem is a free-text database lookup.  We have an index
    which when presented with a keyword, returns a list of identifiers
    of records which contain that keyword.  We wish to use and and or
    operations on several keywords, eg 'Give me all the records which
    contain "programmer" and "poor", or "manager" and "rich"'

    Here is a worked example of the problem.   There is a very
    easy solution possible *iff* all the sets are kept in memory;
    however they can grow to be excessively large, so the chosen
    algorithm has to work on small sections of these sets at any
    one time.
       Conceptually the simplest way to do this is to define a mechanism
    where the operator is akin to a unix filter or process, receiving its
    input data from a stream, and writing its results to a stream.  Then
    the expression evaluation becomes a dataflow problem of connecting these
    streams together, and reading the results coming out of the topmost filter.
      One minor implementation note: accessing the elements of each of
    these lists off CD Rom turn-about would cause lots of disk-head movement;
    we can optimise this to any degree necessary by having the bottom-level
    functions buffer their data in blocks in Ram.

                          List1 <- 1 3 5 6 7 8 10 12 14 16 17 18 20 21 22
                         /
                       OR
                      /  \
                     /    List2 <- 1 2 4 5 6 7
        Result <- AND
                     \    List3 <- 12 13 14 15 16 17 18 19
                      \  /
                       OR
                         \
                          List4 <- 12 13 14

                       <- 1 3 4 5 6 7 8 10 12 14 16 17 18 20 21 22
                      /
                     /
        Result <- AND
                     \
                      \
                       <- 12 13 14 15 16 17 18 19 20 21

        Result <- 12 14 16 17 18 20 21

   */

  PIPE *data1, *data2, *data3, *data4, *or1, *or2, *and1;
  DATA value;

  /* The Object-lists are simulated by a C array; in real life they
     would be found in a large index file. (They are actually found
     by following a 'trie' from a master index) */
     
  data1 = make_data_stream(O_List1);
  data2 = make_data_stream(O_List2);
  data3 = make_data_stream(O_List3);
  data4 = make_data_stream(O_List4);

  /* Compile our expression */
  or1 = make_pipe_binop(or, data1, data2);
  or2 = make_pipe_binop(or, data3, data4);
  and1 = make_pipe_binop(and, or1, or2);

  /* Perform the evaluation of the expression; if you want, you can
     store the results in a new list; often it is easier to do things
     with them one at a time as the members of the list are presented */

  fprintf(stderr, "The result of list1|list2 & list3|list4 is:\n");
  while (eval(and1, &value)) {
    fprintf(stderr, " %d", value);
  }
  fprintf(stderr, "\n");
}



More information about the Comp.lang.c mailing list