Lex Description

Henry Spencer henry at utzoo.uucp
Fri Nov 18 06:52:24 AEST 1988


In article <1988Nov11.201700.25003 at utzoo.uucp> I wrote:
>... I have a lex program which tokenizes
>C, with a couple of minor reservations.  If enough people squeak, and if
>nobody comes up with a better one, I'll post it.

Well, I haven't seen any others, and I have heard a certain amount of
squeaking, so here it is.  It's a bit crude in spots, and error recovery
is minimal -- I built it partly as an exercise, and partly for some
statistics gathering on existing legal programs, so it accepts *exactly*
legal draft-ANSI C and nothing else.  Although this was based on a slightly
old draft, I think the lexical structure of the language is stable enough
that this is still current, with one exception:  hexadecimal string escapes
are no longer limited to three digits.

(If you want to make it more robust, the first thing you're going to have
to do is make the string part less fussy.)

(Oh yes, it uses a couple of local functions you may not have:  error()
prints a message and exits, efopen() does fopen() and calls error() if
it failed.)

-----------------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 1 Oct 1986) except:  no trigraphs; copes with
 * backslash-newline stripping only inside strings; imperfect understanding
 * of the context-dependent rule that makes <bletch.h> a single token
 * inside a #include.
 *
 * Except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but our old lex has problems with that.
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
 *
 * $Log$
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/*
 * STREQ - true iff strings a and b are equal.  The first characters are
 * compared inline, so strcmp() is only called when they already agree.
 * Note that both arguments are evaluated more than once.
 */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

/* RCS identification string; left out when the source is run through lint. */
#ifndef lint
static char RCSid[] = "$Header$";
#endif

/* Incremented once per -d option in main(); not read anywhere in this file. */
int debug = 0;
/* Program name for messages; set in main() from argv[0] via mkprogname(). */
char *progname;

/*
 * error() is a local library function, not part of this file; according to
 * the accompanying note it prints a message and exits.
 */
extern void error(), exit();
/*
 * mkprogname() is a real (external) function only when UTZOOERR is defined;
 * otherwise it is a macro that yields its argument unchanged.
 */
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Print the text of the token just matched, on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

/* C only: set by -C in main(); when set, the "::" and "//" rules REJECT. */
int cflag = 0;			/* C only. */

/*
 * stuff for stupid context-dependent #include <name>
 *
 * `state' records how far into a "#include" line the scanner has got:
 *	SAWNL	last state-changing token was a newline (also the initial value)
 *	SAWNUM	a "#" was seen while in SAWNL
 *	SAWINC	the identifier "include" was seen while in SAWNUM
 *	OTHER	any other identifier, or a <name> token, was seen
 * Only in SAWINC is "<...>" accepted as a single token.  Only the newline,
 * "#", identifier and <name> rules change the state; other tokens leave it
 * alone.
 */
#define	SAWNL	0
#define	SAWNUM	1
#define	SAWINC	2
#define	OTHER	3
int state = SAWNL;
/*
 * PS is a debugging hook placed in the rules that touch `state'.  It
 * normally expands to nothing; the commented-out definition prints the
 * current state instead.
 */
/* #define	PS	printf("state %d\n", state) */
#define	PS	/* */

/*
 * Named patterns defined just below and used by the number rule:
 *	EXP	exponent part of a floating constant: e or E, optional sign, digits
 *	FS	floating suffix: f, l, F or L
 *	IS	integer suffix: u optionally followed by l, or l optionally
 *		followed by u, in either case
 */
%}

EXP	([eE][+-]?[0-9]+)
FS	[flFL]
IS	([uU][lL]?|[lL][uU]?)

%%

[_a-zA-Z][_a-zA-Z0-9]*		{
					/*
					 * Identifier or keyword: print it, then update
					 * the #include tracking.  The word include counts
					 * as the directive name only when it comes right
					 * after a line-leading #; any other identifier
					 * drops the state back to OTHER.
					 */
					PRINTIT;
					state = (state == SAWNUM &&
						strcmp(yytext, "include") == 0)
							? SAWINC : OTHER;
					PS;
				}

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?		|
[0-9]+{EXP}{FS}?		|
[1-9][0-9]*{IS}?		|
0[0-7]*{IS}?			|
0[xX][0-9a-fA-F]+{IS}?		{ PRINTIT;	/* number */
					/*
					 * The alternatives above are, in order: a float
					 * with a decimal point, a float that starts with
					 * the point, a float with only an exponent, a
					 * decimal integer, an octal integer (a lone 0
					 * included), and a hexadecimal integer.  Only the
					 * token text is printed; the #include tracking
					 * state is left untouched.
					 */
				}

\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))+\'	{
		/*
		 * Character constant: one or more plain characters or
		 * escapes between single quotes, so multi-character
		 * constants are accepted.  The one-to-three digit limits on
		 * numeric escapes do not narrow what is matched, because any
		 * further digits match as plain characters.
		 */
		PRINTIT;
	}

\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))*\"	{
		/*
		 * String literal.  The token is echoed with every
		 * backslash-newline pair removed, then terminated with a
		 * newline like any other token.
		 */
		register char *s = yytext;

		while (*s != '\0') {
			if (s[0] == '\\' && s[1] == '\n') {
				s += 2;		/* drop the line splice */
			} else {
				putchar(*s);
				s++;
			}
		}
		putchar('\n');
	}

"#"	{
		/*
		 * A # is always printed; it opens a possible directive only
		 * when the state still says we are just past a newline.
		 */
		PRINTIT;
		if (state == SAWNL) {
			state = SAWNUM;
		}
		PS;
	}
"<"[^>\n]*">"	{
		/*
		 * Header name.  It is one token only straight after
		 * # include; otherwise the match is rejected so that the
		 * ordinary operator rules get to see the < character.
		 * REJECT is expected to transfer control away (see the lex
		 * documentation), so the state reset below applies to the
		 * accepted case.
		 */
		PS;
		if (state == SAWINC) {
			PRINTIT;
		} else {
			REJECT;
		}
		state = OTHER;
	}
[-()&*+~!/%<>^|,.=;:{}?]	|
"["				|
"]"				|
"->"				|
"++"				|
"--"				|
"<<"				|
">>"				|
"<="				|
">="				|
"=="				|
"!="				|
"&&"				|
"||"				|
"##"				|
"..."				|
[-*/%+&^|]"="			|
"<<="				|
">>="				{
					/*
					 * Operators and punctuators: every pattern
					 * above shares this action, which prints the
					 * token and does nothing else.
					 */
					PRINTIT;
				}
"::"				{
					/*
					 * Scope operator.  It is a single token unless
					 * the C-only flag is set, in which case the
					 * match is rejected and the input is rescanned
					 * by the remaining rules.
					 */
					if (!cflag) {
						PRINTIT;
					} else {
						REJECT;
					}
				}

\n				{
					/*
					 * A newline is printed as a visible
					 * backslash-n token and restarts the #include
					 * tracking.
					 */
					state = SAWNL;
					PS;
					printf("\\n\n");
				}
[ \t\v\f]			printf("\\t\n");	/* each other white-space character is shown as backslash-t */

"/*"	{
		/*
		 * Block comment.  The body is consumed by hand and
		 * summarised on one output line: only its newlines are
		 * echoed (the first ten, followed by an ellipsis), between
		 * an opening and a closing comment marker.
		 *
		 * End of input is recognised both as 0 (classic lex) and as
		 * EOF (some other lex implementations); testing for 0 alone
		 * would loop forever on an unterminated comment with the
		 * latter.  An unterminated comment is an error, so the exit
		 * status is non-zero.
		 */
		register int ch;
		register int nnl = 0;

		printf("/* ");
		for (;;) {
			ch = input();
			if (ch == '*') {
				ch = input();
				if (ch == '/')
					break;
				if (ch != '\0' && ch != EOF) {
					/*
					 * Not the terminator: push it back so
					 * that a star or a newline is examined
					 * again on the next pass.
					 */
					unput(ch);
					continue;
				}
				/* end of input right after a star: report below */
			}
			if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			} else if (ch == '\0' || ch == EOF) {
				fprintf(stderr, "unterminated comment!\n");
				exit(1);
			}
		}
		printf(" */\n");
	}

"//"	{
		/*
		 * C++ line comment.  In C-only mode the match is rejected
		 * and the slashes are rescanned as operators.  Otherwise the
		 * marker alone is printed, the rest of the line is
		 * discarded, and the newline is pushed back so that the
		 * newline rule still sees it.  End of input before the
		 * newline is reported as an error; it is recognised both as
		 * 0 and as EOF, for the same reason as in the block comment
		 * rule.
		 */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			for (;;) {
				ch = input();
				if (ch == '\n')
					break;
				if (ch == '\0' || ch == EOF) {
					fprintf(stderr, "unterminated comment!\n");
					exit(1);
				}
			}
			unput(ch);
		}
	}

.				printf("%c ???\n", yytext[0]);	/* any character no other rule accepts is flagged */

%%

/*
 - main - parse arguments and handle options
 *
 * Options: -C restricts the scanner to C (cflag), -d bumps the debug
 * level.  Each remaining argument names a file to tokenize, with "-"
 * meaning standard input; with no file arguments standard input is read.
 * Exits with status 2 on a usage error and 0 once every input has been
 * processed.  Failures to open or stat a file, and directories given as
 * input, are reported through error(), which is expected not to return.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern int getopt();	/* declared here; no header for it is included */
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	/* POSIX specifies -1, not EOF, as getopt()'s end-of-options value. */
	while ((c = getopt(argc, argv, "dC")) != -1) {
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	}
	if (errflg) {
		fprintf(stderr, "usage: %s [-C] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc) {
		process(stdin, "stdin");
	} else {
		for (; optind < argc; optind++) {
			if (STREQ(argv[optind], "-")) {
				process(stdin, "-");
			} else {
				in = efopen(argv[optind], "r");
				/* refuse directories rather than tokenizing their raw contents */
				if (fstat(fileno(in), &statbuf) < 0)
					error("can't fstat `%s'", argv[optind]);
				if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
					error("`%s' is directory!", argv[optind]);
				process(in, argv[optind]);
				(void) fclose(in);
			}
		}
	}
	exit(0);
}

/*
 * process - process input file
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	yyin = in;
	(void) yylex();
}
-----------------
-- 
Sendmail is a bug,             |     Henry Spencer at U of Toronto Zoology
not a feature.                 | uunet!attcan!utzoo!henry henry at zoo.toronto.edu



More information about the Comp.lang.c mailing list