cmpall -- find identical (duplicated) files

Liam R. E. Quin lee at sqarc.sq.com
Fri Aug 3 07:24:11 AEST 1990


I wrote cmpall some time ago, when I found that I had lots of copies and
duplicated directory hierarchies.

You say (for example)
	find $HOME | cmpall
and it says
	same /home/lee/src/lqtext/doc/README /home/lee/doc/README
and so forth.

This version only compares files that have the same name and size.
You could change this to compare files that have the same size if you
wanted -- I wasn't too bothered by that.  It won't (of course) detect
duplicates if one is compressed.

It works unchanged on SysV and SunOS, should be OK on other Unix systems.

No manpage.  Just do
	make cmpall
(you don't even need a makefile).

Let me know if it helps, or if you change it...

Lee

: To unbundle, sh this file
echo x - cmpall.c 1>&2
sed 's/^X//' >cmpall.c <<'@@@End of cmpall.c'
X/* $Header: /home/lee/src/cmpall/cmpall.c,v 1.2 90/08/02 17:22:12 lee Exp Locker: lee $
X *
X */
X
X#include <stdio.h>
X#include <malloc.h>
X#include <errno.h>
X#include <fcntl.h>
X#include <sys/types.h>
X#include <sys/stat.h>
X
Xextern int errno;
X
Xtypedef struct s_listel {
X    char *Name;
X    struct stat *statbuf;
X    int n_seen;
X    struct s_listel *Next;
X} t_listel;
X
X#define STREQ(henry,utzoo) ((*(henry)== *(utzoo))&&!strcmp(henry,utzoo))
X#define new(type) ((type *) malloc(sizeof(type)))
X
Xint UseBytes = 0;
Xchar *progname;
X
X/* read filenames a line at a time.
X * whenever we get a file we've seen before, compare it
X * with the other of the sames name.
X * If they are the same, print "choose -s first second"
X * otherwise, print "choose -d first second"
X */
Xmain(argc, argv)
X    int argc;
X    char *argv[];
X{
X    t_listel *NewEntry();
X    void AddEntry();
X    char *GetLine();
X    char *Path;
X
X    progname = argv[0];
X
X    if (argc != 1) {
X	if (argc == 2 && STREQ(argv[1], "-b")) {
X	    UseBytes = 1;
X	} else {
X	    (void) fprintf(stderr, "usage: find <dir> ... -print | %s [-b]\n",
X								progname);
X	    exit(1);
X	}
X    }
X
X    while ((Path = GetLine(stdin, "/dev/stdin")) != (char *) 0) {
X	AddEntry(stdin, "/dev/stdin", Path);
X    }
X
X    exit(0);
X}
X
Xt_listel *SeenSoFar = (t_listel *) 0;
X
Xvoid
XAddEntry(fd, Name, Path)
X    FILE *fd;
X    char*Name;
X    char *Path;
X{
X    extern char *strrchr();
X
X    t_listel *e;
X    t_listel **lpp;
X
X    if ((e = new(t_listel)) == (t_listel *) 0) {
X	(void) fprintf(stderr, "no RAM reading \"%s\" ", Name);
X	/* possibly we dumped core just then... */
X	(void) fprintf(stderr, "at \"%s\"\n", Path);
X	exit(1);
X    }
X
X    e->Name = Path;
X    e->n_seen = 1;
X    e->statbuf = (struct stat *) 0;
X
X    for (lpp = &SeenSoFar; *lpp; lpp = &(*lpp)->Next) {
X	register char *Old = (*lpp)->Name;
X	int i;
X	char *PathLastSlash;
X	char *OldLastSlash;
X
X	if ((PathLastSlash = strrchr(Path, '/')) == (char *) 0) {
X	    PathLastSlash = Path;
X	} else {
X	    ++PathLastSlash;
X	}
X
X	if ((OldLastSlash = strrchr(Old, '/')) == (char *) 0) {
X	    OldLastSlash = Old;
X	} else {
X	    ++OldLastSlash;
X	}
X
X	if (*PathLastSlash == *OldLastSlash) {
X	    i = strcmp(PathLastSlash, OldLastSlash);
X	} else {
X	    i = *PathLastSlash - *OldLastSlash;
X	}
X
X	if (i < 0) continue; /* not there yet */
X	if (i == 0) {
X	    /* found it... */
X
X	    /* If the files are the same, we should add them to a single
X	     * linked list and traverse it at the end, I suppose!
X	     */
X	    if (Compare(fd, Name, *lpp, e) == 0) {
X		/* They were the same, so no point storing this one... */
X		(void) printf("same '%s' '%s'\n", (*lpp)->Name, e->Name);
X		if (e->statbuf) {
X		    (void) free((char *) e->statbuf);
X		}
X		(void) free((char *) e);
X		(void) free(Path);
X		return;
X	    }
X	    /* the files were different, so add the new one to the list */
X	    /* FALLTHROUGH */
X	}
X	break;
X    }
X
X    /* lpp points either at the last "Next", or at the Next
X     * of the element before which we must insert a new entry...
X     */
X
X    e->Next = (*lpp);
X    *lpp = e;
X    return;
X}
X
X/*ARGSUSED*/
Xint
XCompare(fd, Name, Old, New)
X    FILE *fd;
X    char *Name;
X    t_listel *Old;
X    t_listel *New;
X{
X    /* We return
X     * 0 if the files are identical
X     * -1 if we can't stat Path
X     * 1 if the files are (or might be) different
X     * 2 if they are definitely different
X     *
X     * If we can stat Path but not e, we return 3, and delete e
X     * from the list.
X     */
X    if (Old->statbuf == (struct stat *) 0) {
X	if ((Old->statbuf = new(struct stat)) == (struct stat *) 0) {
X	    (void) fprintf(stderr, "reading \"%s\", no RAM to stat \"%s\"\n",
X							Name, Old->Name);
X	    exit(1);
X	}
X	if (stat(Old->Name, Old->statbuf) < 0) {
X	    Delete(Old);
X	    return 3;
X	}
X    }
X
X    if (New->statbuf == (struct stat *) 0) {
X	if ((New->statbuf = new(struct stat)) == (struct stat *) 0) {
X	    (void) fprintf(stderr, "reading \"%s\", no RAM to stat \"%s\"\n",
X							Name, New->Name);
X	    exit(1);
X	}
X	if (stat(New->Name, New->statbuf) < 0) {
X	    return -1;
X	}
X    }
X
X    /* if they are not normal files, they are the same if the major/minor
X     * numbers are the same:
X     */
X    /* NOTDONE */
X    
X    /** Now compare the files: */
X
X    /*  first, are they the same size? */
X    if (Old->statbuf->st_size != New->statbuf->st_size) {
X	return 2; /* definitely different */
X    }
X
X    if (Old->statbuf->st_ino == New->statbuf->st_ino &&
X	Old->statbuf->st_dev == New->statbuf->st_dev &&
X	Old->statbuf->st_nlink == New->statbuf->st_nlink) {
X
X	/* they are linked to each other, so they are the same! */
X	return 0; /* this is only really a heuristic */
X    }
X
X    switch (CmpFile(Old->Name, New->Name)) {
X    case 0: return 0;
X    case 1: return 2; /* different! */
X    case -1: /* no a */
X    case -2: /* no b */
X	return -1;
X    default:
X	return -1;
X    }
X}
X
Xint
XCmpFile(a, b)
X    char *a, *b;
X{
X    /* return 0 if same, -ve on error, 1 otherwise */
X    int afd = open(a, O_RDONLY, 0);
X    int bfd = open(b, O_RDONLY, 0);
X    char ablk[1024], bblk[1024];
X    int ar, br;
X
X    if (afd < 0) return -1;
X    if (bfd < 0) return -2;
X
X    while ((ar = read(afd, ablk, 1024)) == (br = read(bfd, bblk, 1024))) {
X	if ((ar == 0) || memcmp(ablk, bblk, ar) != 0) {
X	    (void) close(afd);
X	    (void) close(bfd);
X	    /* ar == br == 0: got to EOF, they are the same */
X	    return (ar == 0) ? 0 : 1;
X	} else if (ar < 0) {
X	    return -3; /* read error! */
X	}
X    }
X    /* Since we use stat to check the size before calling this function,
X     * this should never happen... except in the case of a read error.
X     */
X    return 1; /* different lengths */
X}
X
Xchar *
XGetLine(fd, Name)
X    FILE *fd;
X    char *Name;
X{
X    char *Result;
X    register char *p;
X    unsigned int len = 20;
X    int ch;
X
X    if ((Result = malloc(len)) == (char *) 0)  {
X	(void) fprintf(stderr, "out of mem reading \"%s\"\n", Name);
X	exit(1);
X    }
X
X    p = Result;
X
X    while ((ch = getc(fd)) != EOF) {
X	if (ch == '\n') break;
X	if (p - Result >= len - 1) {
X	    int WhereWeWere = p - Result;
X
X	    if ((Result = realloc(Result, len += 20)) == (char *) 0) {
X		(void) fprintf(stderr, "no more mem in \"%s\"\n", Name);
X		exit(1);
X	    }
X
X	    /* Realloc() might have returned a different address,
X	     * so we must ensure that p points to the new array...
X	     */
X	    p = &Result[WhereWeWere];
X	}
X	*p++ = ch;
X    }
X    *p = '\0';
X
X    if (ch == EOF) {
X	if (Result[0] != '\0') {
X	    return Result;
X	}
X	return (char *) 0;
X    }
X    if (p - Result > 1) {
X	Result = realloc(Result, (unsigned) (p - Result + 1));
X	if (Result == (char *) 0) {
X	    (void) fprintf(stderr, "can't save line in \"%s\"\n", Name);
X	    exit(1);
X	}
X    }
X    return Result;
X}
X
XDelete(e)
X    t_listel *e;
X{
X    register t_listel **lpp;
X
X    for (lpp = &SeenSoFar; *lpp; lpp = &(*lpp)->Next) {
X	if ((*lpp)->Name == e->Name) { /* same address, NOT strcmp! */
X	    t_listel *next = e->Next;
X
X	    (void) free(e->Name);
X	    if (e->statbuf) {
X		(void) free((char *) e->statbuf);
X	    }
X	    (void) free((char *) e);
X	    *lpp = next;
X
X	    break;
X	}
X    }
X}
X
X/*
X * $Log:	cmpall.c,v $
X * Revision 1.2  90/08/02  17:22:12  lee
X * delinted it a little...
X * 
X * Revision 1.1  90/08/02  17:04:26  lee
X * Initial revision
X * 
X *
X */
@@@End of cmpall.c
ls -l cmpall.c
exit 0

Lee
-- 
Liam R. E. Quin,  lee at sq.com, {utai,utzoo}!sq!lee,  SoftQuad Inc., Toronto
``He left her a copy of his calculations [...]  Since she was a cystologist,
  she might have analysed the equations, but at the moment she was occupied
  with knitting a bootee.''  [John Boyd, Pollinators of Eden, 217]



More information about the Alt.sources mailing list