Expressions will consist of the four basic operations and single-letter variables. As a result, I will return a parse tree. Anyway, let's start with the basics and see how it goes.
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <stdarg.h> typedef struct node { int type; char *data; struct node *left, *right; } node_t; enum { NODE_EOF, NODE_PLUS, NODE_MINUS, NODE_MUL, NODE_DIV, NODE_VALUE, }; node_t *node_new(int type, char *data, int len) { node_t *N; char *str; N= calloc(1,sizeof(*N)); N->type= type; str= malloc(len+1); memcpy(str, data, len); str[len]= 0; N->data= str; return N; } void node_print(node_t *N) { static int indent; int i; for(i=0;i<indent;i++) printf(" "); if (!N) { printf("(null)\n"); return ; } switch(N->type) { case NODE_PLUS: printf("+\n"); goto binary; case NODE_MINUS: printf("-\n"); goto binary; case NODE_DIV: printf("/\n"); goto binary; case NODE_MUL: printf("*\n"); goto binary; case NODE_VALUE: printf("%s\n", N->data); return ; default: printf("(unknown-%d)\n", N->type); return ; } binary: indent+= 3; node_print(N->left); node_print(N->right); indent-= 3; }We will store these nodes temporarily in stacks. Here are the definitions for that.
typedef struct { int tos, size; node_t **data; } node_stack_t; node_stack_t *stack_new(int size) { node_stack_t *stack; stack= malloc(sizeof(*stack)); stack->size= size; stack->tos= -1; stack->data= malloc(sizeof(stack->data[0])*size); return stack; } void stack_push(node_stack_t *stack, node_t *N) { if (stack->tos==stack->size-1) { stack->size += stack->size / 2; stack->data= realloc(stack->data, sizeof(stack->data[0])*stack->size); } stack->data[++stack->tos]= N; } node_t* stack_pop(node_stack_t *stack) { if (stack->tos==-1) { fprintf(stderr,"stack underflow\n"); exit(1); } return stack->data[stack->tos--]; } node_t *stack_top(node_stack_t *stack) { if (stack->tos==-1) { fprintf(stderr,"stack underflow\n"); exit(1); } return stack->data[stack->tos]; } void stack_print(node_stack_t *stack, char *name) { int i; printf("=== STACK : %s ===\n", name); for(i=stack->tos;i>-1;i--) { node_print(stack->data[i]); if (i!=0) printf("---\n"); } printf("=== END STACK : %s ===\n", name); }As is customary, tos holds the last valid position in the stack. Therefore, it's initialized to -1.
We also need a way to signal errors. Since this is a simple program, exit()ing shouldn't be too much of a trouble.
void syntax_error(char *str, char *p, char *fmt, ...) { va_list ap; int i; fprintf(stderr,"%s\n",str); for(i=0;i<p-str;i++) fprintf(stderr," "); fprintf(stderr,"^\n"); va_start(ap,fmt); vfprintf(stderr,fmt,ap); va_end(ap); exit(1); }Lexical analysis is typically much more complicated then what we have here.
node_t *lex(char *str,char **P) { if ((**P>='a' && **P<='z') || (**P>='A' && **P<='Z')) return node_new(NODE_VALUE, (*P)++, 1); switch(**P) { case '+': return node_new(NODE_PLUS, (*P)++, 1); case '-': return node_new(NODE_MINUS, (*P)++, 1); case '*': return node_new(NODE_MUL, (*P)++, 1); case '/': return node_new(NODE_DIV, (*P)++, 1); case 0: return node_new(NODE_EOF, *P, 0); } syntax_error(str, *P, "unrecognized character.\n"); return NULL; // never reached }
typedef struct { int is_op, // is this an operator? prec, // the precedence for the operator is_bin, // set to 1 if it's a binary operator r_assoc; // set to 1 if it's right-associative } tokdef_t; tokdef_t toktab[]= { [NODE_VALUE] { 0, 0, 0, 0 }, [NODE_PLUS] { 1, 100, 1, 0 }, [NODE_MINUS] { 1, 100, 1, 0 }, [NODE_MUL] { 1, 120, 1, 0 }, [NODE_DIV] { 1, 120, 1, 0 }, };There are two stacks: one for arguments, one for operators. Let's assume we only have binary operators for now. The algorithm proceeds from left to right, processing each token only once. In a string composed of only arguments and binary operators, the algorithm should encounter an initial term, and then a series of operator-term pairs like this:
// term op term op .... op termWhen an operator is encountered, the algorithm looks at the operator at the top of the operator-stack. If that operator has higher precedence than the incoming operator, then we need to execute the one on the stack first. Otherwise, we don't touch the stack. After this, the new operator gets pushed on the stack. That is, we delay the execution of an operator on the operator stack until we encounter an operator that has lower precedence.
Let's do a manual example. Let's have the string 'a+b*c+d'.
token | action | ostack | astack |
a | push to astack | [] | [a] |
+ | push to ostack | [+] | [a] |
b | push to astack | [+] | [a b] |
* | TOS(+) doesn't have greater precedence, push to ostack | [+ *] | [a b] |
c | push to astack | [+ *] | [a b c] |
+ | TOS(*) has greater precedence reduce K=b*c. | [+] | [a K] |
TOS(+) has same precedence, but left association. reduce L=a+K. | [] | [L] | |
push to ostack | [+] | [L] | |
d | push to astack | [+] | [L d] |
EOF | reduce everything. M=L+d | [] | [M] |
void reduce(node_stack_t *argstack, node_stack_t* opstack) { node_t *op; op= stack_pop(opstack); if (toktab[op->type].is_bin) op->right= stack_pop(argstack); op->left= stack_pop(argstack); stack_push(argstack, op); } node_t *parse(char *str) { char *p; node_t *n, *op; node_stack_t *argstack, *opstack; p= str; argstack= stack_new(1024); opstack= stack_new(1024); term: n= lex(str, &p); if (n->type==NODE_EOF || toktab[n->type].is_op) syntax_error(str,p, "term expected.\n"); stack_push(argstack, n); op: n= lex(str, &p); if (n->type==NODE_EOF) goto done; if (!toktab[n->type].is_op) syntax_error(str,p, "operator expected.\n"); while(opstack->tos!=-1) { op= stack_top(opstack); if (toktab[op->type].prec < toktab[n->type].prec) break; if (op->type == n->type && toktab[op->type].r_assoc) break; reduce(argstack, opstack); } stack_push(opstack, n); goto term; done: while(opstack->tos!=-1) reduce(argstack, opstack); return stack_top(argstack); } int main(int argc,char **argv) { node_t *n; n= parse(argv[1]); node_print(n); return 0; }When we run the above with the expression 'a+b*c-d*e', we get:
- + a * b c * d eHere is the complete program, expr1.
node_t *parse(char *str) { char *p; node_t *n, *op; node_stack_t *argstack, *opstack; p= str; argstack= stack_new(1024); opstack= stack_new(1024); term: n= lex(str, &p); if (n->type==NODE_EOF || n->type==NODE_RPAREN || toktab[n->type].is_op) syntax_error(str,p, "term expected.\n"); if (n->type==NODE_LPAREN) { stack_push(opstack, n); goto term; } stack_push(argstack, n); op: n= lex(str, &p); if (n->type==NODE_EOF) goto done; if (n->type==NODE_RPAREN) { while(opstack->tos!=-1 && stack_top(opstack)->type!=NODE_LPAREN) reduce(argstack, opstack); if (opstack->tos==-1) syntax_error(str, p-1, "unmatched ).\n"); stack_pop(opstack); goto op; } if (!toktab[n->type].is_op) syntax_error(str,p, "operator expected.\n"); while(opstack->tos!=-1) { op= stack_top(opstack); if (toktab[op->type].prec < toktab[n->type].prec) break; if (op->type == n->type && toktab[op->type].r_assoc) break; reduce(argstack, opstack); } stack_push(opstack, n); goto term; done: while(opstack->tos!=-1 && stack_top(opstack)->type!=NODE_LPAREN) reduce(argstack, opstack); if (opstack->tos!=-1) syntax_error(str, p, "unmatched (\n"); return stack_top(argstack); }Here is the complete program, expr2.
Let's make a negation operator. We will reuse the minus sign. First of all, we need a new field for the toktab, which tells us whether a token has a prefix operator meaning in addition to its normal meaning.
typedef struct { int is_op, prec, is_bin, r_assoc, prefix_meaning; } tokdef_t; tokdef_t toktab[]= { [NODE_VALUE] { 0, 0, 0, 0 , -1 ,}, [NODE_PLUS] { 1, 100, 1, 0 , -1 }, [NODE_MINUS] { 1, 100, 1, 0 , NODE_NEG }, [NODE_MUL] { 1, 120, 1, 0 , -1 }, [NODE_DIV] { 1, 120, 1, 0 , -1 }, [NODE_LPAREN] { 0, 0, 0, 0 , -1 }, [NODE_RPAREN] { 0, 0, 0, 0 , -1 }, [NODE_NEG] { 1, 500, 0, 0, -1 }, };Parsing is quite easy to do, with this table entry.
node_t *parse(char *str)
{
char *p;
node_t *n, *op;
node_stack_t *argstack, *opstack;
p= str;
argstack= stack_new(1024);
opstack= stack_new(1024);
term:
n= lex(str, &p);
if (n->type==NODE_EOF || n->type==NODE_RPAREN)
syntax_error(str,p, "term expected.\n");
if (toktab[n->type].is_op)
{
if (toktab[n->type].prefix_meaning==-1)
syntax_error(str,p, "term expected.\n");
n->type= toktab[n->type].prefix_meaning;
stack_push(opstack, n);
goto term;
}
if (n->type==NODE_LPAREN)
{
stack_push(opstack, n);
goto term;
}
stack_push(argstack, n);
op:
n= lex(str, &p);
if (n->type==NODE_EOF) goto done;
if (n->type==NODE_RPAREN)
{
while(opstack->tos!=-1 &&
stack_top(opstack)->type!=NODE_LPAREN)
reduce(argstack, opstack);
if (opstack->tos==-1)
syntax_error(str, p-1, "unmatched ).\n");
stack_pop(opstack);
goto op;
}
if (!toktab[n->type].is_op)
syntax_error(str,p, "operator expected.\n");
while(opstack->tos!=-1)
{
op= stack_top(opstack);
if (toktab[op->type].prec < toktab[n->type].prec) break;
if (op->type == n->type &&
toktab[op->type].r_assoc) break;
reduce(argstack, opstack);
}
stack_push(opstack, n);
goto term;
done:
while(opstack->tos!=-1 &&
stack_top(opstack)->type!=NODE_LPAREN)
reduce(argstack, opstack);
if (opstack->tos!=-1)
syntax_error(str, p, "unmatched (\n");
return stack_top(argstack);
}
Here is the complete program, expr3.
op: ... if (toktab[n->type].is_postfix) { stack_push(opstack, n); reduce(argstack, opstack); goto op; } ...We do the reduction immediately because in general, postfix operators have the highest precedence. Therefore, there is no need to buffer them in the stack.
It's also possible to have postfix operators with precedence lower than that of say, prefix operators. If that was the case, we would reduce the stack conditionally before pushing the new operator, effectively doing the same thing as a binary operator, but with one argument only (and thus the "goto op").
Anyway, it would be confusing to have varying degrees of precedence for each postfix/prefix operator. So, it's best to have the same precedence for all postfix operators which is different than the precedence for all prefix operators.
The above code can also handle right associative operators. However, there aren't many of those in common use. The only one I can think of is the exponentiation operator.
The algorithm can be used to parse even more complicated syntaxes. For instance, it would be possible to parse C style function calls by adding a postfix meaning to the left parenthesis as a binary operator. The comma operator would then be defined as a right-associative low precedence operator. The function call operator would then combine the function expression with the argument list expression when the matching right parenthesis is encountered.