/* IParser -- An interpretting parser Copyright (C) 2001 Frans Faase This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. GNU General Public License: http://home.planet.nl/~faase009/GNU.txt Description: This program implements a parser with the following unique features: - It is an interpretting parser, meaning that on the input it expects at least two arguments, namely: - a filename of a file with a grammer, and - a filename of a file to be parsed according to the grammar. - Implements a "greedy" back-tracking parser, thus supporting a wide range of grammers. Can deal with many ambigious grammers. - Uses a smart caching algorithm for high performance. - Grammar supports keywords like OPT, LIST, SEQ and CHAIN. - Grammer supports common C-like non-terminals. - Scanner is controlled by parser, instead of being implemented as a first pass. This allows context-sensitve scanning. - Has output option to implement a parser with "hard-coded" grammar. So, you don't need to supply the first argument anymore. - Builds a abstact syntax tree from the parsed input based on a simple name giving strategy. - extremely small code base. (This file contains it all.) Please read the comments at the end of this file for instructions on how to adapt IParse for your own grammar. This parser was developed as part of the Meta-Meta environment. See http://home.planet.nl/~faase009/MM.html */ #define VERSION "1.2.1 of November 7, 2002." /* First some standard definitions. */ #include #include #include #include #include #ifndef NULL #define NULL 0 #endif typedef int bool; #define TRUE 1 #define FALSE 0 typedef signed long longint; typedef unsigned long longword; typedef unsigned short word; typedef unsigned char byte; #define MALLOC(t) (t*)malloc(sizeof(t)) #define MALLOC_N(N,T) (T *)malloc((N)*sizeof(T)) #define STR_MALLOC(N) (char *)malloc(N+1) #define REALLOC_N(A,N,T) (T *)realloc(A, (N)*sizeof(T)) #define STRCPY(D,S) D = (char *)malloc(strlen(S)+1); strcpy(D,S) #define _DEBUG(S) #define _DEBUG_P1(S,A) #define _DEBUG_P2(S,A,B) #define _DEBUG_P3(S,A,B,C) #define _DEBUG_P4(S,A,B,C,D) #define DEBUG(S) if (debug_scan) printf(S) /*#define DEBUG_P1(S,A) if (debug_scan) printf(S,A)*/ #define DEBUG_P2(S,A,B) if (debug_scan) printf(S,A,B) #define DEBUG_P3(S,A,B,C) if (debug_scan) printf(S,A,B,C) #define DEBUG_P4(S,A,B,C,D) if (debug_scan) printf(S,A,B,C,D) #define DEBUG_P6(S,A,B,C,D,E,F) if (debug_scan) printf(S,A,B,C,D,E,F) #define DEBUG_F(S) if (debug_scan > 1) printf(S) #define DEBUG_F_P1(S,A) if (debug_scan > 1) printf(S,A) #define DEBUG_F_P2(S,A,B) if (debug_scan > 1) printf(S,A,B) #define DEBUG_F_P3(S,A,B,C) if (debug_scan > 1) printf(S,A,B,C) #define DEBUG_F_P4(S,A,B,C,D) if (debug_scan > 1) printf(S,A,B,C,D) #define DEBUG_F_P6(S,A,B,C,D,E,F) if (debug_scan > 1) printf(S,A,B,C,D,E,F) /* A hexadecimal hash tree ~~~~~~~~~~~~~~~~~~~~~~~ The following structure implements a mapping of strings to an integer value in the range [0..254]. It is a tree of hashs in combination with a very fast incremental hash function. In this way, it tries to combine the benefits of trees and hashs. The incremental hash function will first return the lower 4 bits of the characters in the string, and following this the higher 4 bits of the characters. */ typedef struct hexa_hash_tree_t hexa_hash_tree_t, *hexa_hash_tree_p; struct hexa_hash_tree_t { byte state; union { char *string; hexa_hash_tree_p *children; } data; }; byte *keyword_state = NULL; char *string(char *s) /* Returns a unique address representing the string. the global keyword_state will point to the integer value in the range [0..254]. If the string does not occure in the store, it is added and the state is initialized with 0. */ { static hexa_hash_tree_p hash_tree = NULL; hexa_hash_tree_p *r_node = &hash_tree; char *vs = s; int depth; int mode = 0; for (depth = 0; ; depth++) { hexa_hash_tree_p node = *r_node; if (node == NULL) { node = MALLOC(hexa_hash_tree_t); node->state = 0; STRCPY(node->data.string, s); *r_node = node; keyword_state = &node->state; return node->data.string; } if (node->state != 255) { char *cs = node->data.string; hexa_hash_tree_p *children; word i, v = 0; if (*cs == *s && strcmp(cs+1, s+1) == 0) { keyword_state = &node->state; return node->data.string; } children = MALLOC_N(16, hexa_hash_tree_t*); for (i = 0; i < 16; i++) children[i] = NULL; i = strlen(cs); if (depth <= i) v = ((byte)cs[depth]) & 15; else if (depth <= i*2) v = ((byte)cs[depth-i-1]) >> 4; children[v] = node; node = MALLOC(hexa_hash_tree_t); node->state = 255; node->data.children = children; *r_node = node; } { word v; if (*vs == '\0') { v = 0; if (mode == 0) { mode = 1; vs = s; } } else if (mode == 0) v = ((word)*vs++) & 15; else v = ((word)*vs++) >> 4; r_node = &node->data.children[v]; } } } /* Abstract Parse Trees ~~~~~~~~~~~~~~~~~~~~ The following section of the code implements a representation for Abstract Parse Trees. */ typedef struct context_t context_t, *context_p; typedef struct tree_t tree_t, *tree_p; typedef struct list_t list_t, *list_p; struct tree_t { char *type; union { list_p parts; char *str_value; int int_value; double double_value; char char_value; context_p context; } c; word line, column; longword refcount; }; struct list_t { list_p next; tree_p first; }; char *tt_str_value, *tt_int_value, *tt_list; /* Special strings, one for each terminal */ char *tt_ident = "identifier"; char *tt_str_value = "string value"; char *tt_int_value = "integer value"; char *tt_double_value = "double value"; char *tt_char_value = "char value"; char *tt_list = "list"; /* Special strings for identifiers and context */ char *tt_identdef = ""; char *tt_identdefadd = ""; char *tt_identuse = ""; char *tt_identfield = ""; char *tt_opencontext = ""; char *tt_closecontext = ""; tree_p old_trees = NULL; list_p old_lists = NULL; long alloced_trees = 0; tree_p malloc_tree( void ) { tree_p new; if (old_trees) { new = old_trees; old_trees = (tree_p)old_trees->type; } else new = MALLOC(tree_t); new->line = 0; new->column = 0; new->refcount = 1; alloced_trees++; return new; } void free_list( list_p list ); bool type_can_have_parts( char *type ) { return type != tt_ident && type != tt_str_value && type != tt_int_value && type != tt_double_value && type != tt_char_value && type != tt_opencontext && type != tt_closecontext; } void tree_release( tree_p *r_t ) { tree_p t = *r_t; if (t == NULL) return; alloced_trees--; t->refcount--; if (t->refcount == 0) { if (type_can_have_parts(t->type)) { list_p list = t->c.parts; while (list != NULL) { list_p next = list->next; free_list(list); list = next; } } t->type = (char*)old_trees; old_trees = t; } *r_t = NULL; } void free_list( list_p list ) { tree_release(&list->first); list->next = old_lists; old_lists = list; } void tree_assign(tree_p *d, tree_p s) { tree_p old_d = *d; *d = s; if (s != NULL) { s->refcount++; alloced_trees++; } if (old_d != NULL) tree_release(&old_d); } void tree_move(tree_p *d, tree_p *s) { tree_p old_d = *d; *d = *s; *s = NULL; if (old_d != NULL) tree_release(&old_d); } static list_p malloc_list( void ) { list_p new; if (old_lists) { new = old_lists; old_lists = old_lists->next; } else new = MALLOC(list_t); new->next = NULL; new->first = NULL; return new; } tree_p make_ident( char *str ) { tree_p tree = malloc_tree(); tree->type = tt_ident; tree->c.str_value = str; return tree; } tree_p make_str_atom( char *str ) { tree_p tree = malloc_tree(); tree->type = tt_str_value; tree->c.str_value = str; return tree; } tree_p make_double_atom( double value ) { tree_p tree = malloc_tree(); tree->type = tt_double_value; tree->c.double_value = value; return tree; } tree_p make_char_atom( char value ) { tree_p tree = malloc_tree(); tree->type = tt_char_value; tree->c.char_value = value; return tree; } tree_p make_int_atom( int value ) { tree_p tree = malloc_tree(); tree->type = tt_int_value; tree->c.int_value = value; return tree; } tree_p make_list( void ) { tree_p tree = malloc_tree(); tree->type = tt_list; tree->c.parts = NULL; return tree; } tree_p make_tree( char *name ) { tree_p tree = malloc_tree(); tree->type = name; tree->c.parts = NULL; return tree; } void insert_element( tree_p tree, tree_p element ) { list_p r_list; r_list = malloc_list(); tree_assign(&r_list->first, element); r_list->next = tree->c.parts; tree->c.parts = r_list; } void add_element( tree_p tree, tree_p element ) { list_p *r_list = &tree->c.parts; while (*r_list != NULL) r_list = &(*r_list)->next; *r_list = malloc_list(); tree_assign(&(*r_list)->first, element); } void drop_last_element( tree_p tree ) { list_p *r_list = &tree->c.parts; if (*r_list == NULL) return; while ((*r_list)->next != NULL) r_list = &(*r_list)->next; free_list(*r_list); *r_list = NULL; } inline list_p elements( tree_p tree ) { return tree->c.parts; } inline bool is_a_ident( tree_p tree ) { return tree && tree->type == tt_ident; } inline bool is_ident( tree_p tree, char *ident ) { return tree && tree->type == tt_ident && tree->c.str_value == ident; } inline bool equal_ident( tree_p tree, char *ident ) { return tree->c.str_value == ident; } inline bool is_a_str( tree_p tree ) { return tree && tree->type == tt_str_value; } inline bool is_str( tree_p tree, char *str ) { return tree && tree->type == tt_str_value && tree->c.str_value == str; } inline bool is_a_int( tree_p tree ) { return tree && tree->type == tt_int_value; } inline bool is_a_double( tree_p tree ) { return tree && tree->type == tt_double_value; } inline bool is_a_char( tree_p tree ) { return tree && tree->type == tt_char_value; } inline bool is_a_list( tree_p tree ) { return tree && tree->type == tt_list; } inline bool is_tree( tree_p tree, char *name ) { return tree && tree->type == name; } inline bool equal_tree( tree_p tree, char *name ) { return tree->type == name; } int nr_parts( tree_p tree ) { list_p parts = tree != NULL ? tree->c.parts : NULL; int nr = 0; for (; parts; parts = parts->next) nr++; return nr; } tree_p part( tree_p tree, int i ) { list_p parts = tree->c.parts; for (i--; parts && i > 0; parts = parts->next, i--); return parts ? parts->first : NULL; } /* Functions for printing the parse tree in a human readable form with smart indentation. */ int print_tree_depth = 0; void print_list(FILE *f, list_p list, bool compact); void print_context(FILE *f, context_p context, bool compact); void print_tree( FILE *f, tree_p tree, bool compact ) { if (tree == NULL) fprintf(f, "[NULL]"); else { if (tree->line != 0) fprintf(f, "#%d:%d#", tree->line, tree->column); if (is_a_ident(tree)) fprintf(f, "%s", tree->c.str_value); else if (is_a_str(tree)) fprintf(f, "\"%s\"", tree->c.str_value); else if (is_a_int(tree)) fprintf(f, "%d", tree->c.int_value); else if (is_a_double(tree)) fprintf(f, "%f", tree->c.double_value); else if (is_a_char(tree)) fprintf(f, "'%c'", tree->c.char_value); else if (tree->type == tt_opencontext) { fprintf(f, "{"); print_context(f, tree->c.context, compact); } else if (tree->type == tt_closecontext) fprintf(f, "}"); else { fprintf(f, "%s(", tree->type); print_tree_depth += strlen(tree->type) + 1; print_list(f, tree->c.parts, compact); print_tree_depth -= strlen(tree->type) + 1; fprintf(f, ")"); } } if (print_tree_depth == 0 && !compact) fprintf(f, "\n"); }; void print_list(FILE *f, list_p list, bool compact) { list_p l; bool first = TRUE; for (l = list; l; l = l->next) { if (!first) { if (compact) fprintf(f, ","); else fprintf(f, ",\n%*s", print_tree_depth, ""); } first = FALSE; /* fprintf(f, "[%lx]", (longword)l); */ print_tree(f, l->first, compact); } } /* Function for printing the Abstract Program Tree in the form of a function, which after being called, returns the reconstruction of the tree. This function can be used to hard-code a parser for a given grammer. This function was used to produce the function init_IParse_grammar below, which creates the Abstract Program Tree representing the input grammar of IParse. */ static void print_tree_rec( FILE *f, tree_p tree ); void print_tree_to_c( FILE *f, tree_p tree, char *name ) { fprintf(f, "void init_%s( tree_p *root )\n", name); fprintf(f, "{ /* Generated by IParse version %s */\n", VERSION); fprintf(f, " tree_p tt[100];\n"); fprintf(f, " int v = 0;\n"); fprintf(f, "\n"); fprintf(f, "#define ID(s) tt[v]=make_ident(string(s));\n"); fprintf(f, "#define LITERAL(s) tt[v]=make_str_atom(string(s));" " *keyword_state |= 1;\n"); fprintf(f, "#define INT(i) tt[v]=make_int_atom(i);\n"); fprintf(f, "#define DOUBLE(i) tt[v]=make_double_atom(i);\n"); fprintf(f, "#define CHAR(i) tt[v]=make_char_atom(i);\n"); fprintf(f, "#define TREE(n) tt[v]=make_tree(string(n)); v++;\n"); fprintf(f, "#define LIST tt[v]=make_list(); v++;\n"); fprintf(f, "#define NONE tt[v]=NULL;\n"); fprintf(f, "#define SEP add_element(tt[v-1],tt[v]); tree_release(&tt[v]);\n"); fprintf(f, "#define CLOSE add_element(tt[v-1],tt[v]); tree_release(&tt[v]); v--;\n"); fprintf(f, "#define EMPTY_CLOSE v--;\n"); fprintf(f, "\n "); print_tree_rec( f, tree ); fprintf(f, "\n *root = tt[0];\n}\n\n"); } static void print_tree_rec( FILE *f, tree_p tree ) { if (tree == NULL) fprintf(f, "NONE "); else if (is_a_ident(tree)) { fprintf(f, "ID(\"%s\") ", tree->c.str_value); } else if (is_a_str(tree)) { fprintf(f, "LITERAL(\"%s\") ", tree->c.str_value); } else if (is_a_int(tree)) { fprintf(f, "INT(%d) ", tree->c.int_value); } else if (is_a_double(tree)) { fprintf(f, "DOUBLE(%f) ", tree->c.double_value); } else if (is_a_char(tree)) { fprintf(f, "CHAR('%c') ", tree->c.char_value); } else { list_p l; if (is_a_list(tree)) fprintf(f, "LIST "); else fprintf(f, "TREE(\"%s\") ", tree->type); if (tree->c.parts) { print_tree_depth++; for (l = tree->c.parts; l; l = l->next) { print_tree_rec(f, l->first); if (l->next) fprintf(f, "SEP\n%*.*s ", print_tree_depth, print_tree_depth, ""); } print_tree_depth--; fprintf(f, "CLOSE "); } else fprintf(f, "EMPTY_CLOSE "); } }; /* Storing the input file ~~~~~~~~~~~~~~~~~~~~~~ The input file is read (completely) into a buffer. (Who cares about memory usage nowadays.) Functionality for determining the line and column is included in the reading functions. */ #define TAB_POS 4 char *info = NULL; longword file_pos; longword file_len; #define _eof (file_pos >= file_len) word cur_line = 1; word cur_column = 1; char *buffer = NULL; void _assign(FILE *the_file) { int fh = fileno(the_file); file_len = lseek(fh, 0L, SEEK_END); lseek(fh, 0L, SEEK_SET); buffer = (char*)malloc(file_len+2); file_len = read(fh, buffer, file_len); buffer[file_len] = '\0'; file_pos = 0; cur_line = 1; cur_column = 1; info = buffer; } void _unassign() { if (buffer) free(buffer); buffer = NULL; } inline void _set_file_pos(longword new_file_pos) { file_pos = new_file_pos; info = buffer + file_pos; } inline void _next() { if (file_pos < file_len) { switch(*info) { case '\t': cur_column = ((cur_column + TAB_POS) % TAB_POS) * TAB_POS; break; case '\n': cur_line++; cur_column = 1; break; default: cur_column++; break; } file_pos++; info++; } } inline void _advance(word steps) { file_pos += steps; if (file_pos > file_len) file_pos = file_len; info = buffer + file_pos; cur_column += steps; } void _print_state() { printf("[0,%ld,%ld] = `%c'\n", file_pos, file_len, *info); } /* Scanner ~~~~~~~ For reasons of speed the scanning procedures are hard-coded. The rationale behind this is that it is often really simple to write procedures for scanning symbols from a buffer. This section starts with a generic part followed by specific scanning procedures one for each terminal symbol of the language. The specific scanning procedure scan_space is used for scanning white space between the terminals. For a language which does not have white space, this procedure should simply return. */ int debug_scan = 0; char *current_nt; typedef struct { longword file_pos; word cur_line; word cur_column; } scan_pos_t; void skip_space(void); static void expected_string(char *s, char *is_keyword); static char *start_info() { static char buff[13]; int i; char *s; for (i = 0, s = info; i < 10 && *s; s++) if (*s == '\n') { buff[i++] = '\\'; buff[i++] = 'n'; } else if (*s == '\t') { buff[i++] = '\\'; buff[i++] = 't'; } else buff[i++] = *s; buff[i] = '\0'; return buff; } longword f_file_pos = 0; word f_line = 1; word f_column = 1; int nr_exp_syms = 0; longword last_space_pos = (longword)-1; scan_pos_t last_space_end_pos; longword last_string_pos = (longword)-1; char *last_string; scan_pos_t last_string_end_pos; longword last_ident_pos = (longword)-1; char *last_ident; bool last_ident_is_keyword; scan_pos_t last_ident_end_pos; void init_scan(FILE *the_file) { _assign(the_file); f_file_pos = 0; f_line = 1; f_column = 1; nr_exp_syms = 0; last_space_pos = (longword)-1; last_string_pos = (longword)-1; last_ident_pos = (longword)-1; skip_space(); } void save_pos(scan_pos_t *sp) { sp->file_pos = file_pos; sp->cur_line = cur_line; sp->cur_column = cur_column; } void restore_pos(scan_pos_t *sp) { if (sp->file_pos == file_pos) return; _set_file_pos(sp->file_pos); cur_line = sp->cur_line; cur_column = sp->cur_column; /* _print_state(); */ } bool accept_eof() { if (!_eof) { DEBUG_F_P3("%d.%d: accept_eof() failed for: `%s'\n", cur_line, cur_column, start_info()); expected_string("", NULL); return FALSE; } DEBUG_P3("%d.%d: accept_eof() for: `%s'\n", cur_line, cur_column, start_info()); return TRUE; } /* Terminal scanning procedures. Below the specific scanning procedures for all the terminal symbols follow. For each type of terminal a procedure starting with accept_ needs to be implemented. These procedures will be called from the parser only when there is a grammer rule requesting for the terminal symbol. Linking terminal symbols to scanning procedures is done by means of registration functions which are defined below. A scanning procedure should return true when the particular symbol was scanned, and false otherwise. In case false is returned, the function expected_string should be called to inform the parser about the name of the terminal that could not be parsed. The scanning procedures read their data from global variable 'info', which is a pointer to the string representing the remaining part of the file. When 'info' is at the end of the buffer _eof becomes true. (The buffer is terminated with '\0'.) The procedures _next and _advance should be called to increment this pointer. (The pointer does not need to be restored to its initial position after scanning.) To return to a previous location, a combination of calls to save_pos and restore_pos should be used (these also remember line and column positions). Each scanning procedure should call skip_space in case the terminal may be followed by white space. Some of the scanning procedures (such as accept_string) use memorization to remember the last scanned string. Memorization can be used for complex terminal symbols, but it is not required. */ bool cpp_comments = TRUE; bool nested_comments = TRUE; void skip_space() { /* Fixed part. Do not modify!! */ if (file_pos == last_space_pos) { restore_pos(&last_space_end_pos); return; } last_space_pos = file_pos; /* Flexible part. The code below increments scans the white space characters. */ for(;;) { _DEBUG_P1("now: `%c'\n", *info); if (*info == ' ' || *info == '\t' || info[0] == '\n') { _next(); } else if (cpp_comments && info[0] == '/' && info[1] == '/') { while (!_eof && info[0] != '\n') _next(); } else if (info[0] == '/' && info[1] == '*') { scan_pos_t sp; int nesting_depth = 1; save_pos(&sp); _next(); _next(); while (!_eof && !(info[0] == '*' && info[1] == '/') && nesting_depth > 0) { if (nested_comments) { if (info[0] == '/' && info[1] == '*') { nesting_depth++; _next(); } else if (info[0] == '*' && info[1] == '/') { nesting_depth--; _next(); } } _next(); } if (_eof) { printf("Warning: %d.%d comment not terminated\n", sp.cur_line, sp.cur_column); } _next(); _next(); } else break; } /* Fixed part. Do not modify!! */ save_pos(&last_space_end_pos); /* _print_state(); */ } bool accept_i_string(char **string, bool c_string) /* Procedure for scanning a C-like string. If a string was scanned it is returned in 'string'. If 'c_string' is true, multiple strings only separated by white space will be returned as a single string as in C. This procedure is an auxilary procedure called from accept_string and accept_cstring. */ { scan_pos_t sp; int i; if (_eof || info[0] != '"') { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_string() failed for: `%s'\n", cur_line, cur_column, start_info()); return FALSE; } /* First determine the amouth of memory that needs to be allocated for the string. */ save_pos(&sp); i = 0; _next(); for (;;) { if (_eof || *info == '\0' || *info == '\n') { printf("Error: %d.%d incorrectly terminated string\n", sp.cur_line, sp.cur_column); break; } else if (*info == '\\') { _next(); if (0 <= *info && *info <= 7) { char d1 = *info; _next(); if (0 <= *info && *info <= 7) { _next(); if ((d1 == '0' || d1 == '1') && 0 <= *info && *info <= 7) _next(); } } else _next(); i++; } else if (*info == '"') { if (c_string) { _next(); skip_space(); if (_eof || *info != '"') break; _next(); } else break; } else { i++; _next(); } } /* Now allocate memory for the strings */ *string = STR_MALLOC(i); /* Fill the string after having restored 'info' to the starting position */ restore_pos(&sp); i = 0; _next(); for (;;) { if (_eof || *info == '\0' || *info == '\n') { break; } else if (*info == '\\') { _next(); if (!_eof && 0 <= *info && *info <= 7) { char d1 = *info, v = *info - '0'; _next(); if (!_eof && 0 <= *info && *info <= 7) { v = v*8 + *info - '0'; _next(); if ( !_eof && (d1 == '0' || d1 == '1') && 0 <= *info && *info <= 7) { v = v*8 + *info - '0'; _next(); } } (*string)[i++] = v; } else if (!_eof) { if (*info == 'n') (*string)[i++] = '\n'; else if (*info == 't') (*string)[i++] = '\t'; else (*string)[i++] = *info; _next(); } } else if (*info == '"') { _next(); if (c_string) { skip_space(); if (_eof || *info != '"') break; _next(); } else break; } else { (*string)[i++] = *info; _next(); } } (*string)[i] = '\0'; skip_space(); DEBUG_P4("%d.%d: accept_string(\"%s\") %d\n", sp.cur_line, sp.cur_column, *string, cur_column); return TRUE; } bool accept_string(char **string) { bool result; longword start_pos; /* Memorization intro */ if (file_pos == last_string_pos) { *string = last_string; restore_pos(&last_string_end_pos); return TRUE; } start_pos = file_pos; /* Start of actual scanning */ result = accept_i_string(string, FALSE); /* Memorization prologue */ if (result) { last_string_pos = start_pos; last_string = *string; save_pos(&last_string_end_pos); return TRUE; } return FALSE; } bool accept_cstring(char **string) { return accept_i_string(string, TRUE); } #define IDENT_START_CHAR(X) (isalpha(X)||X=='_') #define IDENT_FOLLOW_CHAR(X) (isalpha(X)||isdigit(X)||X=='_') bool accept_ident(char **ident, char keyword_flag) { longword start_pos, start_column; int i; /* Memorization intro */ if (file_pos == last_ident_pos) { if (last_ident_is_keyword) { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_ident() failed for `%s'\n", cur_line, cur_column, start_info()); return FALSE; } start_column = cur_column; *ident = last_ident; restore_pos(&last_ident_end_pos); DEBUG_P3("%d.%d: accept_ident(%s)\n", cur_line, cur_column, *ident); return TRUE; } start_pos = file_pos; /* Does it look an identifier? */ if (_eof || !IDENT_START_CHAR(*info)) { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_ident() failed for `%s'\n", cur_line, cur_column, start_info()); return FALSE; } start_column = cur_column; /* Determine the lengte of the identifier */ i = 1; while (IDENT_FOLLOW_CHAR(info[i])) i++; /* Make a unique string out of it. (Little hack for not having to allocate memory.) */ { char s = info[i]; info[i] = '\0'; *ident = string(info); info[i] = s; } /* Reject if it is a keyword. */ if (*keyword_state & keyword_flag) { expected_string("", *ident); DEBUG_F_P3("%d.%d: accept_ident() failed, because `%s' is a keyword\n", cur_line, start_column, *ident); return FALSE; } DEBUG_P3("%d.%d: accept_ident(%s)\n", cur_line, start_column, *ident); _advance(i); skip_space(); /* Memorization prologue */ last_ident_pos = start_pos; last_ident = *ident; last_ident_is_keyword = FALSE; save_pos(&last_ident_end_pos); return TRUE; } bool accept_char(char *v) { long start_column; if (_eof || *info != '\'') { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_char() failed for `%s'\n", cur_line, cur_column, start_info()); return FALSE; } start_column = cur_column; _next(); if (*info == '\\') { _next(); if (0 <= *info && *info <= 7) { char d1 = *info; *v = *info - '0'; _next(); if (0 <= *info && *info <= 7) { *v = (*v)*8 + *info - '0'; _next(); if ((d1 == '0' || d1 == '1') && 0 <= *info && *info <= 7) { *v = (*v)*8 + *info - '0'; _next(); } } } else if (*info == 'n') *v = '\n'; else if (*info == 't') *v = '\t'; else *v = *info; } else *v = *info; _next(); if (*info != '\'') { printf("Error: %d.%d ill terminated character\n", cur_line, start_column); } else { _next(); } skip_space(); DEBUG_P3("%d.%d: accept_char(%c)\n", cur_line, start_column, *v); return TRUE; } bool accept_int(longint *v) { long start_column; int i = 0; if (_eof || ( !isdigit(*info) && ( (*info != '+' && *info != '-') || !isdigit(info[1])))) { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_int() failed for `%s'\n", cur_line, cur_column, start_info()); return FALSE; } start_column = cur_column; if (info[i] == '-' || info[i] == '+') i++; if (info[i] == '0' && info[i+1] == 'x') { i += 2; while ( isdigit(info[i]) || ('a' <= info[i] && info[i] <= 'f') || ('A' <= info[i] && info[i] <= 'F')) i++; } else { while (isdigit(info[i])) i++; if (info[i] == '.') { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_int() found double `%s'\n", cur_line, start_column, start_info()); return FALSE; } if (info[i] == 'L') i++; } sscanf(info, "%ld", v); _advance(i); skip_space(); DEBUG_P3("%d.%d: accept_int(%ld)\n", cur_line, start_column, *v); return TRUE; } bool accept_double(double *v) { long start_column; int i = 0; if (_eof || ( !isdigit(*info) && *info != '.' && ( (*info != '+' && *info != '-') || (!isdigit(info[1]) && *info != '.')))) { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_double() failed for `%s'\n", cur_line, cur_column, start_info()); return FALSE; } start_column = cur_column; if (*info == '-' || *info == '+') i++; while (isdigit(*info)) i++; if (*info != '.') { expected_string("", NULL); DEBUG_P3("%d.%d: accept_double() found int `%s'\n", cur_line, start_column, start_info()); return FALSE; } i++; while (isdigit(*info)) i++; sscanf(info, "%lf", v); _advance(i); skip_space(); DEBUG_P3("%d.%d: accept_double(%f)\n", cur_line, start_column, *v); return TRUE; } bool accept_literal(char *sym) { long start_pos; char *v, *s; bool keyword = IDENT_START_CHAR(*sym); /* Memorization intro */ if (keyword) { if (file_pos == last_ident_pos) { if (sym == last_ident) { DEBUG_F_P3("%d.%d: accept_literal(%s)", cur_line, cur_column, sym); restore_pos(&last_ident_end_pos); return TRUE; } else if (!last_ident_is_keyword) { expected_string(sym, NULL); DEBUG_F_P4("%d.%d: accept_literal(%s) failed for `%s'\n", cur_line, cur_column, sym, start_info()); return FALSE; } } start_pos = file_pos; } for (v = info, s = sym; *v == *s; v++, s++); if (*s != '\0' || (keyword && IDENT_START_CHAR(*v))) { expected_string(sym, NULL); DEBUG_F_P4("%d.%d: accept_literal(%s) failed `%s'\n", cur_line, cur_column, sym, start_info()); return FALSE; } DEBUG_P3("%d.%d: accept_literal(%s)\n", cur_line, cur_column, sym); _advance(v - info); skip_space(); /* Memorization prologue */ if (keyword) { last_ident_pos = start_pos; last_ident = sym; last_ident_is_keyword = TRUE; save_pos(&last_ident_end_pos); } return TRUE; } /*** Transact-SQL scanning procedures ***/ #define ADD_CHAR(C) { if (string) string[i] = C; i++; } bool try_accept_sql_string(char *string, int *r_i) { int i; char quote = info[0]; scan_pos_t sp; save_pos(&sp); i = 0; _next(); for (;;) { if (_eof || *info == '\0' || *info == '\n') { printf("Error: %d.%d incorrectly terminated string\n", sp.cur_line, sp.cur_column); return FALSE; } else if (*info == quote) { _next(); if (!_eof && *info == quote) { ADD_CHAR(quote) _next(); } else break; } else { ADD_CHAR(*info) _next(); } } ADD_CHAR('\0') *r_i = i; return TRUE; } bool accept_both_sql_string(char **string, char quote) { scan_pos_t sp; int i; char *dummy = NULL; if (_eof || (info[0] != quote)) { expected_string("", NULL); DEBUG_F_P3("%d.%d: accept_sql_string() failed for: `%s'\n", cur_line, cur_column, start_info()); return FALSE; } save_pos(&sp); if (!try_accept_sql_string(dummy, &i)) return FALSE; *string = STR_MALLOC(i); restore_pos(&sp); try_accept_sql_string(*string, &i); skip_space(); DEBUG_P4("%d.%d: accept_sql_string(\"%s\") %d\n", sp.cur_line, sp.cur_column, *string, cur_column); return TRUE; } bool accept_single_sql_string(char **string) { return accept_both_sql_string(string, '\''); } bool accept_double_sql_string(char **string) { return accept_both_sql_string(string, '"'); } int last_ident_ats; int last_ident_is_label; bool try_accept_sql_ident(char *what, int nr_ats, bool is_label, char **ident, char keyword_flag) { int i, j; int cnt_ats; long start_line; long start_column; if (file_pos == last_ident_pos) { if (last_ident_is_keyword || nr_ats != last_ident_ats || is_label != last_ident_is_label) { expected_string(what, NULL); DEBUG_F_P3("%d.%d: accept_sql_ident() failed for `%s'\n", cur_line, cur_column, start_info()); return FALSE; } start_column = cur_column; *ident = last_ident; restore_pos(&last_ident_end_pos); } else { longword start_pos = file_pos; char buffer[1000]; start_line = cur_line; start_column = cur_column; if (_eof || (!isalpha(*info) && *info != '_' && *info != '#' && *info != '$' && *info != '@')) { expected_string(what, NULL); DEBUG_F_P3("%d.%d: accept_sql_ident() failed for `%s'\n", cur_line, start_column, start_info()); return FALSE; } i = 0; cnt_ats = 0; while (info[i] == '@') { i++; cnt_ats++; } if ((!isalpha(info[i]) && !isdigit(info[i]) && info[i] != '_' && *info != '#' && *info != '$' && *info != '.') || cnt_ats != nr_ats) { expected_string(what, NULL); DEBUG_F_P3("%d.%d: accept_sql_ident() failed for `%s'\n", cur_line, start_column, start_info()); return FALSE; } j = 0; while (isalpha(info[i]) || isdigit(info[i]) || info[i] == '_') { if (j < 999) buffer[j++] = tolower(info[i]); i++; } buffer[j] = '\0'; *ident = string(buffer); if (*keyword_state & keyword_flag) { expected_string(what, *ident); DEBUG_F_P3("%d.%d: accept_sql_ident() failed, because `%s' is a keyword\n", cur_line, start_column, *ident); return FALSE; } if (info[i] == ':' && !is_label) { expected_string(what, *ident); DEBUG_F_P3("%d.%d: accept_sql_ident() failed, because `%s' is not a label\n", cur_line, start_column, *ident); return FALSE; } if (info[i] != ':' && is_label) { expected_string(what, *ident); DEBUG_F_P3("%d.%d: accept_sql_ident() failed, because `%s' is a label\n", cur_line, start_column, *ident); return FALSE; } if (info[i] == ':') { i++; } _advance(i); last_ident_pos = start_pos; last_ident = *ident; last_ident_ats = nr_ats; last_ident_is_label = is_label; last_ident_is_keyword = FALSE; skip_space(); save_pos(&last_ident_end_pos); } DEBUG_P3("%d.%d: accept_sql_ident(%s)\n", start_line, start_column, *ident); return TRUE; } bool accept_sql_ident(char **ident, char keyword_flag) { return try_accept_sql_ident("", 0, FALSE, ident, keyword_flag); } bool accept_sql_var_ident(char **ident, char keyword_flag) { return try_accept_sql_ident("", 1, FALSE, ident, keyword_flag); } bool accept_sql_sysvar_ident(char **ident, char keyword_flag) { return try_accept_sql_ident("", 2, FALSE, ident, keyword_flag); } bool accept_sql_label_ident(char **ident, char keyword_flag) { return try_accept_sql_ident("", 0, TRUE, ident, keyword_flag); } /* Registration of terminal scanners ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The following section of code deals with the registration of scanner procedures with the terminal symbols used in the grammar. The actual registration is done by calling the add__literal procedures in the init procedure. */ #define TT_LITERAL 1 #define TT_STRING 2 #define TT_INT 3 #define TT_DOUBLE 4 #define TT_CHAR 5 #define TT_IDENT 6 typedef struct terminal_t terminal_t, *terminal_p; struct terminal_t { terminal_p next; char *name; int type; union { bool (*accept_string)(char **v); bool (*accept_int)(longint *v); bool (*accept_double)(double *v); bool (*accept_char)(char *v); bool (*accept_ident)(char **v, char keyword_flag); } scan_fn; }; terminal_p all_term = NULL; #define DEF_ADD_TERMINAL_FN(F,T,V,S) \ void F(char *name, bool (*fn)S) \ { terminal_p new_term = MALLOC(terminal_t); \ new_term->next = all_term; \ all_term = new_term; \ new_term->name = name; \ new_term->type = T; \ new_term->scan_fn.V = fn; } DEF_ADD_TERMINAL_FN(add_literal_terminal, TT_LITERAL, accept_string, (char **)) DEF_ADD_TERMINAL_FN(add_string_terminal, TT_STRING, accept_string, (char **)) DEF_ADD_TERMINAL_FN(add_int_terminal, TT_INT, accept_int, (longint *)) DEF_ADD_TERMINAL_FN(add_double_terminal, TT_DOUBLE, accept_double, (double *)) DEF_ADD_TERMINAL_FN(add_char_terminal, TT_CHAR, accept_char, (char *)) DEF_ADD_TERMINAL_FN(add_ident_terminal, TT_IDENT, accept_ident, (char **, char)) terminal_p find_terminal(char *name) { terminal_p term; for (term = all_term; term != NULL; term = term->next) if (term->name == name) return term; return NULL; } /* Internal representation parsing rules ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The following section of the code deals with internal representation of the parsing rules as they are used by the parsing routines. */ typedef struct val_t val_t, *val_p; typedef struct or_rule_t or_rule_t, *or_rule_p; typedef struct non_terminal_t non_terminal_t, *non_terminal_p; typedef struct ident_t ident_t, *ident_p; typedef struct rule_t rule_t, *rule_p; struct val_t { val_p prev; tree_p last; }; struct or_rule_t { or_rule_p next; rule_p rule; char *tree_name; }; struct non_terminal_t { non_terminal_p next; char *name; or_rule_p first; or_rule_p recursive; }; struct ident_t { int kind; char *ident_class; terminal_p terminal; }; struct rule_t { rule_p next; bool optional; bool sequential; char *chain_symbol; int kind; union { non_terminal_p non_terminal; terminal_p terminal; or_rule_p or_rule; char *str_value; ident_p ident; } info; long last_fail_pos; }; #define RK_NT 0 #define RK_LIT 1 #define RK_OR_RULE 2 #define RK_T_EOF 3 #define RK_TERM 4 #define RK_IDENT 5 #define RK_T_OPENCONTEXT 6 #define RK_T_CLOSECONTEXT 7 #define IK_IDENTDEF 1 #define IK_IDENTDEFADD 2 #define IK_IDENTUSE 3 #define IK_IDENTFIELD 4 non_terminal_p all_nt = NULL; non_terminal_p find_nt(char *name); rule_p make_rule(list_p rules); or_rule_p make_or_rule(list_p or); void make_all_nt(tree_p root); char *str_root, *str_nt_def, *str_ident, *str_identalone, *str_identdef, *str_identdefadd, *str_identuse, *str_identfield, *str_opencontext, *str_closecontext, *str_rule, *str_opt, *str_seq, *str_list, *str_chain, *str_prim_elem, *str_string, *str_cstring, *str_int, *str_double, *str_char, *str_eof; non_terminal_p find_nt(char *name) { non_terminal_p *p_nt = &all_nt; while (*p_nt != NULL && (*p_nt)->name != name) p_nt = &((*p_nt)->next); if (*p_nt == NULL) { *p_nt = MALLOC(non_terminal_t); (*p_nt)->name = name; (*p_nt)->first = NULL; (*p_nt)->recursive = NULL; (*p_nt)->next = NULL; } return *p_nt; } /* Filling the internal parser rules ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The following section of code gives the procedures, which transform a grammer, represented as an Abstract Program Tree (which was produced when parsing the input grammar with the input grammar of IParse), into the internal representation of the parsing rules. */ rule_p make_rule(list_p rule) { rule_p result; tree_p elem; /* only used as pointer */ if (rule == NULL) return NULL; result = MALLOC(rule_t); result->optional = FALSE; result->sequential = FALSE; result->chain_symbol = NULL; result->last_fail_pos = (long)-1; elem = rule->first; /* Is the first element optional? */ if (equal_tree(elem, str_opt)) { result->optional = TRUE; elem = elements(elem)->first; } /* Is it a sequential, chain or list? */ if (equal_tree(elem, str_seq)) { result->sequential = TRUE; elem = elements(elem)->first; } else if (equal_tree(elem, str_chain)) { result->sequential = TRUE; result->chain_symbol = elements(elem)->next->first->c.str_value; elem = elements(elem)->first; } else if (equal_tree(elem, str_list)) { result->sequential = TRUE; result->chain_symbol = ","; elem = elements(elem)->first; } if (is_a_ident(elem)) { if (equal_ident(elem, str_eof)) result->kind = RK_T_EOF; else { terminal_p terminal = find_terminal(elem->c.str_value); if (terminal != NULL) { result->kind = RK_TERM; result->info.terminal = terminal; } else { result->kind = RK_NT; result->info.non_terminal = find_nt(elem->c.str_value); if (result->info.non_terminal == NULL) { printf("Undefined non-terminal: '%s'\n", elem->c.str_value); exit(1); } } } } else if (is_tree(elem, str_identalone)) { result->kind = RK_TERM; result->info.terminal = find_terminal(str_ident); } else if (is_tree(elem, str_identdef)) { result->kind = RK_IDENT; result->info.ident = MALLOC(ident_t); result->info.ident->kind = IK_IDENTDEF; result->info.ident->ident_class = part(elem, 1)->c.str_value; result->info.ident->terminal = find_terminal(str_ident); } else if (is_tree(elem, str_identdefadd)) { result->kind = RK_IDENT; result->info.ident = MALLOC(ident_t); result->info.ident->kind = IK_IDENTDEFADD; result->info.ident->ident_class = part(elem, 1)->c.str_value; result->info.ident->terminal = find_terminal(str_ident); } else if (is_tree(elem, str_identuse)) { result->kind = RK_IDENT; result->info.ident = MALLOC(ident_t); result->info.ident->kind = IK_IDENTUSE; result->info.ident->ident_class = part(elem, 1)->c.str_value; result->info.ident->terminal = find_terminal(str_ident); } else if (is_tree(elem, str_identfield)) { result->kind = RK_IDENT; result->info.ident = MALLOC(ident_t); result->info.ident->kind = IK_IDENTFIELD; result->info.ident->ident_class = part(elem, 1)->c.str_value; result->info.ident->terminal = find_terminal(str_ident); } else if (is_tree(elem, str_opencontext)) { result->kind = RK_T_OPENCONTEXT; } else if (is_tree(elem, str_closecontext)) { result->kind = RK_T_CLOSECONTEXT; } else if (is_a_str(elem)) { result->kind = RK_LIT; result->info.str_value = elem->c.str_value; } else if (is_a_list(elem)) { result->kind = RK_OR_RULE; result->info.or_rule = make_or_rule(elements(elem)); } result->next = make_rule(rule->next); return result; } or_rule_p make_or_rule(list_p or) { if (or == NULL) return NULL; { tree_p rule = or->first; /* only used as pointer */ if (is_tree(rule, str_rule)) { list_p elem = elements(rule); or_rule_p result = MALLOC(or_rule_t); if (elem->first == NULL) { result->rule = NULL; result->tree_name = NULL; } else { result->rule = make_rule(elements(elem->first)); result->tree_name = is_a_ident(elem->next->first) ? elem->next->first->c.str_value : NULL; } result->next = make_or_rule(or->next); return result; } } return make_or_rule(or->next); } void make_all_nt(tree_p root) { list_p rules; all_nt = NULL; for (rules = root->c.parts; rules; rules = rules->next) if (is_tree(rules->first, str_nt_def)) { char *nt_name = part(rules->first, 1)->c.str_value; non_terminal_p nt = find_nt(nt_name); or_rule_p *p_first = &nt->first; or_rule_p *p_recursive = &nt->recursive; list_p or = part(rules->first, 2)->c.parts; while (*p_first != NULL) p_first = &(*p_first)->next; while (*p_recursive != NULL) p_recursive = &(*p_recursive)->next; for (; or != NULL; or = or->next) { tree_p rule = or->first; /* only used as pointer */ if (is_tree(rule, str_rule)) { list_p elem = elements(rule); or_rule_p new = MALLOC(or_rule_t); if (elem->first == NULL) { new->rule = NULL; new->tree_name = NULL; new->next = NULL; *p_first = new; p_first = &new->next; } else { list_p parts = elements(elem->first); new->tree_name = is_a_ident(elem->next->first) ? elem->next->first->c.str_value : NULL; new->next = NULL; if (parts == NULL || !is_ident(parts->first, nt_name)) { new->rule = make_rule(parts); *p_first = new; p_first = &new->next; } else { new->rule = make_rule(parts->next); *p_recursive = new; p_recursive = &new->next; } } } } } { non_terminal_p nt; for (nt = all_nt; nt != NULL; nt = nt->next) if (nt->first == NULL && nt->recursive == NULL) printf("Non-terminal '%s' has no rule.\n", nt->name); } } /* Caching intermediate parse states ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Traditional recursive-descent parser are slow, because of the large amouth of back-tracking occurs, where depending on the properties of the grammar, the same fragment of the input is parsed over-and-over again. To prevent this, the following section of code, implements a cache for all intermediate parsing states (including the Abstract Program Trees that they produced). */ enum success_t { s_unknown, s_fail, s_success } ; typedef struct solution_t solution_t, *solution_p; struct solution_t { solution_p next; char *nt; enum success_t success; tree_p result; scan_pos_t sp; }; solution_p *solutions = NULL; void init_solutions() { longword i; solutions = MALLOC_N(file_len+1, solution_t*); for (i = 0; i < file_len+1; i++) solutions[i] = NULL; } void free_solutions() { longword i; for (i = 0; i < file_len+1; i++) { solution_p sol = solutions[i]; while (sol != NULL) { solution_p next_sol = sol->next; tree_release(&sol->result); free(sol); sol = next_sol; } } free(solutions); solutions = NULL; } solution_p find_solution(longword filepos, char *nt) { solution_p sol; if (filepos > file_len) filepos = file_len; for (sol = solutions[filepos]; sol != NULL; sol = sol->next) if ( sol->nt == nt ) return sol; sol = MALLOC(solution_t); sol->next = solutions[filepos]; sol->nt = nt; sol->success = s_unknown; sol->result = NULL; solutions[filepos] = sol; return sol; } /* Parsing procedures ~~~~~~~~~~~~~~~~~~ The following section of the code contains the procedures which implement the back-tracking recursive descended parser. */ int grammar_level = 2; char keyword_flag = 1, new_keyword_flag = 2; int depth = 0; bool debug_parse = FALSE; bool debug_nt = FALSE; char *current_nt = NULL; rule_p current_rule = NULL; #define DEBUG_ENTER(X) if (debug_parse) { DEBUG_TAB; printf("Enter: %s", X); depth += 2; } #define DEBUG_ENTER_P1(X,A) if (debug_parse) { DEBUG_TAB; printf("Enter: "); printf(X,A); depth += 2; } #define DEBUG_EXIT(X) if (debug_parse) { depth -=2; DEBUG_TAB; printf("Leave: %s", X); } #define DEBUG_EXIT_P1(X,A) if (debug_parse) { depth -=2; DEBUG_TAB; printf("Leave: "); printf(X,A); } #define DEBUG_TAB if (debug_parse) printf("%*.*s", depth, depth, "") #define DEBUG_NL if (debug_parse) printf("\n") #define DEBUG_PT(X) if (debug_parse) print_tree(stdout, X, TRUE) #define DEBUG_PO(X) if (debug_parse) print_or(stdout, X) #define DEBUG_PR(X) if (debug_parse) print_rule(stdout, X) #define DEBUG_(X) if (debug_parse) printf(X) #define DEBUG_P1(X,A) if (debug_parse) printf(X,A) bool parse_nt(non_terminal_p non_term, tree_p *rtree); bool parse_or(or_rule_p or, tree_p *rtree); bool parse_rule(rule_p rule, val_p prev_parts, char *tree_name, tree_p *rtree); bool parse_seq(rule_p rule, char *chain_sym, tree_p seq, val_p prev_parts, char *tree_name, tree_p *rtree) ; void print_or(FILE *f, or_rule_p or); void print_rule(FILE *f, rule_p rule); bool parse_term( terminal_p term, tree_p *rtree ) { bool try; switch( term->type ) { case TT_LITERAL: { char *v; try = (term->scan_fn.accept_string)(&v); if (try) { if (grammar_level > 1) { *rtree = make_str_atom(string(v)); free(v); *keyword_state |= new_keyword_flag; } else *rtree = make_str_atom(v); } break; } case TT_STRING: { char *v; try = (term->scan_fn.accept_string)(&v); if (try) *rtree = make_str_atom(v); break; } case TT_INT: { longint v; try = (term->scan_fn.accept_int)(&v); if (try) *rtree = make_int_atom(v); break; } case TT_DOUBLE: { double v; try = (term->scan_fn.accept_double)(&v); if (try) *rtree = make_double_atom(v); break; } case TT_CHAR: { char v; try = (term->scan_fn.accept_char)(&v); if (try) *rtree = make_char_atom(v); break; } case TT_IDENT: { char *v; try = (term->scan_fn.accept_ident)(&v, keyword_flag); if (try) *rtree = make_ident(v); break; } default: { try = FALSE; break; } } return try; } bool parse_ident(ident_p ident, tree_p *rtree) { bool try; char *v; try = (ident->terminal->scan_fn.accept_ident)(&v, keyword_flag); if (try) { tree_p tp = NULL; switch(ident->kind) { case IK_IDENTDEF: *rtree = make_tree(tt_identdef); break; case IK_IDENTDEFADD: *rtree = make_tree(tt_identdefadd); break; case IK_IDENTUSE: *rtree = make_tree(tt_identuse); break; case IK_IDENTFIELD: *rtree = make_tree(tt_identfield); break; } tp = make_ident(v); add_element(*rtree,tp); tree_release(&tp); tp = make_ident(ident->ident_class); add_element(*rtree,tp); tree_release(&tp); } return try; } bool parse_nt( non_terminal_p non_term, tree_p *rtree) { or_rule_p or; char *surr_nt = current_nt; char *nt = non_term->name; solution_p sol = find_solution(file_pos, nt); DEBUG_ENTER_P1("parse_nt(%s)", nt); DEBUG_NL; if (sol->success == s_success) { DEBUG_EXIT_P1("parse_nt(%s) SUCCESS", nt); DEBUG_NL; tree_assign(rtree, sol->result); restore_pos(&sol->sp); return TRUE; } else if (sol->success == s_fail) { DEBUG_EXIT_P1("parse_nt(%s) FAIL", nt); DEBUG_NL; return FALSE; } current_nt = nt; if (debug_nt) { printf("%*.*s", depth, depth, ""); printf("Enter: %s\n", nt); depth += 2; } for ( or = non_term->first; or != NULL; or = or->next ) if (parse_rule(or->rule, (val_p)NULL, or->tree_name, rtree)) break; if (or != NULL) { for(;;) { val_t val; val.prev = NULL; val.last = NULL; tree_move(&val.last, rtree); for ( or = non_term->recursive; or != NULL; or = or->next ) if (parse_rule(or->rule, &val, or->tree_name, rtree)) { tree_release(&val.last); break; } if (or == NULL) { tree_move(rtree, &val.last); break; } tree_release(&val.last); } DEBUG_EXIT_P1("parse_nt(%s) =", nt); DEBUG_PT(*rtree); DEBUG_NL; if (debug_nt) { depth -= 2; printf("%*.*s", depth, depth, ""); printf("Parsed: %s\n", nt); } current_nt = surr_nt; tree_assign(&sol->result, *rtree); sol->success = s_success; save_pos(&sol->sp); return TRUE; } DEBUG_EXIT_P1("parse_nt(%s) - failed", nt); DEBUG_NL; if (debug_nt) { depth -= 2; printf("%*.*s", depth, depth, ""); printf("Failed: %s\n", nt); } current_nt = surr_nt; sol->success = s_fail; return FALSE; } bool parse_or(or_rule_p or, tree_p *rtree) { DEBUG_ENTER("parse_or: "); DEBUG_PO(or); DEBUG_NL; for ( ; or != NULL; or = or->next ) if (parse_rule(or->rule, (val_p)NULL, or->tree_name, rtree)) { DEBUG_EXIT("parse_or = "); DEBUG_PT(*rtree); DEBUG_NL; return TRUE; } DEBUG_EXIT("parse_or: failed"); DEBUG_NL; return FALSE; } bool parse_rule(rule_p rule, val_p prev_parts, char *tree_name, tree_p *rtree) { bool optional, sequential; char *chain_sym; longword *last_fail_pos; scan_pos_t sp; bool try = FALSE; val_t val; DEBUG_ENTER("parse_rule: "); DEBUG_PR(rule); DEBUG_NL; /* At the end of the rule: */ if (rule == NULL) { if (tree_name != NULL) { if ( prev_parts != NULL && prev_parts->prev == NULL && is_a_list(prev_parts->last) && 0) { tree_assign(rtree, prev_parts->last); (*rtree)->type = tree_name; } else { /* make tree of previous elements: */ *rtree = make_tree(tree_name); while (prev_parts) { insert_element(*rtree, prev_parts->last); prev_parts = prev_parts->prev; } } } else { /* group previous elements into a list, if more than one: */ if (prev_parts == NULL) *rtree = NULL; else if (prev_parts->prev == NULL) tree_assign(rtree, prev_parts->last); else { *rtree = make_list(); while (prev_parts) { insert_element(*rtree, prev_parts->last); prev_parts = prev_parts->prev; } } } DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; return TRUE; } optional = rule->optional, sequential = rule->sequential; chain_sym = rule->chain_symbol; /* Did we fail the last time at this position? */ if (rule->last_fail_pos == file_pos) { DEBUG_EXIT("parse_rule - BREAK "); DEBUG_NL; return FALSE; } last_fail_pos = &rule->last_fail_pos; save_pos(&sp); current_rule = rule; /* Try to accept first symbol */ { tree_p t = NULL; word s_line = cur_line, s_column = cur_column; switch( rule->kind ) { case RK_T_EOF: try = accept_eof(); if (try) { if (parse_rule(rule->next, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; return TRUE; } } break; case RK_TERM: try = parse_term(rule->info.terminal, &t); break; case RK_IDENT: try = parse_ident(rule->info.ident, &t); break; case RK_T_OPENCONTEXT: try = TRUE; t = make_tree(tt_opencontext); break; case RK_T_CLOSECONTEXT: try = TRUE; t = make_tree(tt_closecontext); break; case RK_NT: try = parse_nt(rule->info.non_terminal, &t); break; case RK_LIT: { if (accept_literal(rule->info.str_value)) { if (sequential) { tree_p seq = make_list(); if (parse_seq(rule, chain_sym, seq, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; tree_release(&t); tree_release(&seq); return TRUE; } tree_release(&seq); } else if (parse_rule(rule->next, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; tree_release(&t); return TRUE; } } break; } case RK_OR_RULE: try = parse_or(rule->info.or_rule, &t); break; default: try = FALSE; } if (t != NULL) { t->line = s_line; t->column = s_column; } if (try) { /* We succeded in parsing the first element */ if (sequential) { tree_p seq = make_list(); add_element(seq, t); if (parse_seq(rule, chain_sym, seq, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; tree_release(&t); tree_release(&seq); return TRUE; } tree_release(&seq); } else { val.last = NULL; tree_assign(&val.last, t); val.prev = prev_parts; prev_parts = &val; if (parse_rule(rule->next, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; tree_release(&val.last); tree_release(&t); return TRUE; } tree_release(&val.last); } } tree_release(&t); } if (optional) { restore_pos(&sp); /* First element was optional: */ val.last = NULL; val.prev = prev_parts; prev_parts = &val; if (parse_rule(rule->next, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_rule = "); DEBUG_PT(*rtree); DEBUG_NL; return TRUE; } } restore_pos(&sp); *last_fail_pos = file_pos; DEBUG_EXIT_P1("parse_rule - failed at %ld", file_pos); DEBUG_NL; return FALSE; } bool parse_seq(rule_p rule, char *chain_sym, tree_p seq, val_p prev_parts, char *tree_name, tree_p *rtree) { scan_pos_t sp; bool try = FALSE; val_t val; DEBUG_ENTER("parse_seq: "); DEBUG_PR(rule); DEBUG_NL; save_pos(&sp); current_rule = rule; /* Try to accept first symbol */ if (chain_sym == NULL || accept_literal(chain_sym)) { tree_p t = NULL; word s_line = cur_line, s_column = cur_column; switch( rule->kind ) { case RK_T_EOF: try = accept_eof(); if (try) t = make_str_atom(string("EOF")); break; case RK_TERM: try = parse_term(rule->info.terminal, &t); break; case RK_IDENT: try = parse_ident(rule->info.ident, &t); break; case RK_NT: try = parse_nt(rule->info.non_terminal, &t); break; case RK_LIT: try = accept_literal(rule->info.str_value); break; case RK_OR_RULE: try = parse_or(rule->info.or_rule, &t); break; default: try = FALSE; } if (t != NULL) { t->line = s_line; t->column = s_column; } if (try) { /* We succeded in parsing the first element */ add_element(seq, t); if (parse_seq(rule, chain_sym, seq, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_seq = "); DEBUG_PT(*rtree); DEBUG_NL; tree_release(&t); return TRUE; } drop_last_element(seq); } tree_release(&t); } restore_pos(&sp); val.last = NULL; tree_assign(&val.last, seq); val.prev = prev_parts; prev_parts = &val; if (parse_rule(rule->next, prev_parts, tree_name, rtree)) { DEBUG_EXIT("parse_seq = "); DEBUG_PT(*rtree); DEBUG_NL; tree_release(&val.last); return TRUE; } restore_pos(&sp); DEBUG_EXIT_P1("parse_seq - failed at %ld", file_pos); DEBUG_NL; tree_release(&val.last); return FALSE; } /* Printing routines for the internal representation */ void print_or(FILE *f, or_rule_p or) { bool first = TRUE; for (; or; or = or->next) { if (!first) fprintf(f, "|"); first = FALSE; print_rule(f, or->rule); if (or->tree_name != NULL) fprintf(f, "[%s]", or->tree_name); } } void print_rule(FILE *f, rule_p rule) { bool optional, sequential; char *chain_sym; if (rule == NULL) return; optional = rule->optional, sequential = rule->sequential; chain_sym = rule->chain_symbol; switch(rule->kind) { case RK_T_EOF: fprintf(f, " "); break; case RK_TERM: fprintf(f, "<%s> ", rule->info.terminal->name); break; case RK_IDENT: fprintf(f, "<%s> ", rule->info.ident->terminal->name); break; case RK_NT: fprintf(f, "%s ", rule->info.non_terminal->name); break; case RK_LIT: fprintf(f, "\"%s\" ", rule->info.str_value); break; case RK_OR_RULE: fprintf(f, "("); print_or(f, rule->info.or_rule); fprintf(f, ")"); break; } if (sequential) { if (chain_sym == NULL) fprintf(f, "SEQ "); else if (strcmp( chain_sym, ",")) fprintf(f, "LIST "); else fprintf(f, "CHAIN %s ", chain_sym); } if (optional) { fprintf(f, "OPT "); } print_rule(f, rule->next); } /* Error reporting ~~~~~~~~~~~~~~~ In case the input cannot be parsed, the parser produces a list of all expected terminal symbols together with the grammar rule they occured in, at the position the parser could not continue. The following section implements this list. */ #define MAX_EXP_SYM 200 typedef struct { char *sym; char *in_nt; rule_p rule; char *is_keyword; } expect_t; expect_t expect[MAX_EXP_SYM]; char *exp_in_nt[MAX_EXP_SYM]; static void expected_string(char *s, char *is_keyword) { if (file_pos < f_file_pos) return; if (file_pos > f_file_pos) { nr_exp_syms = 0; f_file_pos = file_pos; f_line = cur_line; f_column = cur_column; } if (nr_exp_syms >= MAX_EXP_SYM) return; expect[nr_exp_syms].sym = s; expect[nr_exp_syms].in_nt = current_nt; expect[nr_exp_syms].rule = current_rule; expect[nr_exp_syms].is_keyword = is_keyword; nr_exp_syms++; } void print_last_pos() { int i; printf("%d.%d Expected:\n", f_line, f_column); for (i = 0; i < nr_exp_syms; i++) { bool unique = TRUE; int j; for (j = 0; unique && j < i; j++) if (expect[i].rule == expect[j].rule) unique = FALSE; if (unique) { printf(" "); if (expect[i].rule) print_rule(stdout, expect[i].rule); else printf("%s ", expect[i].sym); printf(":"); if (expect[i].in_nt) printf(" in %s", expect[i].in_nt); if (expect[i].is_keyword != NULL) printf(" ('%s' is a keyword)", expect[i].is_keyword); printf("\n"); } } } /* Initialization of the IParse grammar ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Below the procedure that initializes the Abstract Program Tree for the input grammar of IParse itself is given. This was generated by calling the procedure print_tree_to_c above. */ void init_IParse_grammar( tree_p *root ) { /* Generated by IParse version 1.1a of October 8, 2001. */ tree_p tt[100]; int v = 0; #define ID(s) tt[v]=make_ident(string(s)); #define LITERAL(s) tt[v]=make_str_atom(string(s)); *keyword_state |= 1; #define INT(i) tt[v]=make_int_atom(i); #define DOUBLE(i) tt[v]=make_double_atom(i); #define CHAR(i) tt[v]=make_char_atom(i); #define TREE(n) tt[v]=make_tree(string(n)); v++; #define LIST tt[v]=make_list(); v++; #define NONE tt[v]=NULL; #define SEP add_element(tt[v-1],tt[v]); tree_release(&tt[v]); #define CLOSE add_element(tt[v-1],tt[v]); tree_release(&tt[v]); v--; #define EMPTY_CLOSE v--; LIST TREE("nt_def") ID("root") SEP LIST TREE("rule") LIST TREE("seq") ID("nt_def") CLOSE SEP ID("eof") CLOSE SEP NONE CLOSE CLOSE CLOSE SEP TREE("nt_def") ID("nt_def") SEP LIST TREE("rule") LIST TREE("identalone") EMPTY_CLOSE SEP LITERAL(":") SEP ID("or_rule") SEP LITERAL(".") CLOSE SEP ID("nt_def") CLOSE CLOSE CLOSE SEP TREE("nt_def") ID("or_rule") SEP LIST TREE("rule") LIST TREE("chain") ID("rule") SEP LITERAL("|") CLOSE CLOSE SEP NONE CLOSE CLOSE CLOSE SEP TREE("nt_def") ID("rule") SEP LIST TREE("rule") LIST TREE("opt") TREE("seq") ID("opt_elem") CLOSE CLOSE SEP TREE("opt") LIST TREE("rule") LIST LITERAL("[") SEP TREE("identalone") EMPTY_CLOSE SEP LITERAL("]") CLOSE SEP NONE CLOSE CLOSE CLOSE CLOSE SEP ID("rule") CLOSE CLOSE CLOSE SEP TREE("nt_def") ID("opt_elem") SEP LIST TREE("rule") LIST ID("list_elem") SEP LITERAL("OPT") CLOSE SEP ID("opt") CLOSE SEP TREE("rule") LIST ID("list_elem") CLOSE SEP NONE CLOSE CLOSE CLOSE SEP TREE("nt_def") ID("list_elem") SEP LIST TREE("rule") LIST ID("prim_elem") SEP LITERAL("SEQ") CLOSE SEP ID("seq") CLOSE SEP TREE("rule") LIST ID("prim_elem") SEP LITERAL("LIST") CLOSE SEP ID("list") CLOSE SEP TREE("rule") LIST ID("prim_elem") SEP LITERAL("CHAIN") SEP ID("string") CLOSE SEP ID("chain") CLOSE SEP TREE("rule") LIST ID("prim_elem") CLOSE SEP NONE CLOSE CLOSE CLOSE SEP TREE("nt_def") ID("prim_elem") SEP LIST TREE("rule") LIST ID("string") CLOSE SEP NONE CLOSE SEP TREE("rule") LIST TREE("identalone") EMPTY_CLOSE CLOSE SEP NONE CLOSE SEP TREE("rule") LIST LITERAL("ident") SEP LITERAL(">+") SEP TREE("identalone") EMPTY_CLOSE CLOSE SEP ID("identdefadd") CLOSE SEP TREE("rule") LIST LITERAL("ident") SEP LITERAL(">") SEP TREE("identalone") EMPTY_CLOSE CLOSE SEP ID("identdef") CLOSE SEP TREE("rule") LIST LITERAL("ident") SEP LITERAL("<") SEP TREE("identalone") EMPTY_CLOSE CLOSE SEP ID("identuse") CLOSE SEP TREE("rule") LIST LITERAL("ident") SEP LITERAL("!") SEP TREE("identalone") EMPTY_CLOSE CLOSE SEP ID("identfield") CLOSE SEP TREE("rule") LIST LITERAL("ident") CLOSE SEP ID("identalone") CLOSE SEP TREE("rule") LIST LITERAL("{") CLOSE SEP ID("opencontext") CLOSE SEP TREE("rule") LIST LITERAL("}") CLOSE SEP ID("closecontext") CLOSE SEP TREE("rule") LIST LITERAL("(") SEP ID("or_rule") SEP LITERAL(")") CLOSE SEP NONE CLOSE CLOSE CLOSE CLOSE *root = tt[0]; } /* Symbol tables ~~~~~~~~~~~~~ The following section of code deals with the storage of symbol tables, and resolving all identifiers with their definitions. */ typedef struct ident_def_t ident_def_t, *ident_def_p; typedef struct context_entry_t context_entry_t, *context_entry_p; struct ident_def_t { ident_def_p next; tree_p tree; }; struct context_entry_t { context_entry_p next; char *ident_name; char *ident_class; ident_def_p defs; }; struct context_t { context_p parent; context_entry_p entries; }; context_p cur_context = NULL; context_entry_p find_entry_in_context(context_p context, char *ident_name, char *ident_class) { context_entry_p *r_entry; for (r_entry = &context->entries; *r_entry != NULL; r_entry = &(*r_entry)->next) if ( (*r_entry)->ident_name == ident_name && (*r_entry)->ident_class == ident_class) return *r_entry; (*r_entry) = MALLOC(context_entry_t); (*r_entry)->next = NULL; (*r_entry)->ident_name = ident_name; (*r_entry)->ident_class = ident_class; (*r_entry)->defs = NULL; return *r_entry; } void build_contexts_for_tree(tree_p tree, tree_p parent_tree) { if (tree == NULL) return; if (tree->type == tt_identdef || tree->type == tt_identdefadd) { char *ident_name = tree->c.parts->first->c.str_value; char *ident_class = tree->c.parts->next->first->c.str_value; context_entry_p entry; ident_def_p *r_def; tree_p def_tree; if (cur_context == NULL) { printf("Error at %d.%d: no context for identifier '%s'\n", tree->line, tree->column, ident_name); return; } entry = find_entry_in_context(cur_context, ident_name, ident_class); r_def = &entry->defs; if (tree->type == tt_identdef && (*r_def) != NULL) { //printf("Error at %d.%d: redefinition of identifier '%s' at %d.%d\n", // tree->line, tree->column, ident_name, // (*r_def)->tree->line, (*r_def)->tree->column); printf("Error at %d.%d: redefinition of identifier '%s' at ", tree->line, tree->column, ident_name); if ((*r_def)->tree) printf("%d.%d\n", (*r_def)->tree->line, (*r_def)->tree->column); else printf("?.?\n"); return; } while ((*r_def) != NULL) r_def = &(*r_def)->next; if (parent_tree == NULL) def_tree = tree; // simply the identifier else { def_tree = parent_tree; if (nr_parts(parent_tree) == 2) { tree_p first_child = part(parent_tree, 1), second_child = part(parent_tree, 2); if (first_child == tree && second_child != NULL) def_tree = second_child; else if (second_child == tree && first_child != NULL) def_tree = first_child; } } (*r_def) = MALLOC(ident_def_t); (*r_def)->next = NULL; (*r_def)->tree = def_tree; } else if (type_can_have_parts(tree->type)) { tree_p parent_tree = tree->type == tt_list ? NULL : tree; list_p list; for (list = tree->c.parts; list != NULL; list = list->next) { tree_p elem_tree = list->first; if (elem_tree != NULL && elem_tree->type != NULL) { if (elem_tree->type == tt_opencontext) { context_p new_context = MALLOC(context_t); new_context->parent = cur_context; new_context->entries = NULL; elem_tree->c.context = new_context; cur_context = new_context; } else if (elem_tree->type == tt_closecontext) { if (cur_context != NULL) cur_context = cur_context->parent; else printf("Error at %d.%d: close context while no context is open\n", elem_tree->line, elem_tree->column); } else build_contexts_for_tree(elem_tree, parent_tree); } } } } void print_context(FILE *f, context_p context, bool compact) { context_entry_p entry; if (context == NULL || context->entries == NULL) return; fprintf(f, "<"); for (entry = context->entries; entry != NULL; entry = entry->next) { ident_def_p def; printf("%s:%s=", entry->ident_name, entry->ident_class); for (def = entry->defs; def != NULL; def = def->next) { if (def->tree) printf("%d.%d", def->tree->line, def->tree->column); if (def->next) fprintf(f, ","); } if (entry->next) fprintf(f,";"); } fprintf(f, ">"); } context_entry_p find_identifier(context_p context, char *ident_name, char *ident_class) { context_entry_p entry; if (context == NULL) return NULL; for (entry = context->entries; entry != NULL; entry = entry->next) if ( entry->ident_name == ident_name && entry->ident_class == ident_class) return entry; return find_identifier(context->parent, ident_name, ident_class); } void search_for_context_in_tree(tree_p tree, context_p *rcontext, int *rcount) { int context_depth = 0; if (tree == NULL) return; if (type_can_have_parts(tree->type)) { list_p list = tree->c.parts; for (; list != NULL; list = list->next) { tree_p child_tree = list->first; if (child_tree != NULL && child_tree->type != NULL) { if (child_tree->type == tt_opencontext) { if (context_depth == 0) { *rcontext = child_tree->c.context; (*rcount)++; } context_depth++; } else if (child_tree->type == tt_closecontext) context_depth--; else if (context_depth == 0) search_for_context_in_tree(child_tree, rcontext, rcount); } } } } context_p find_local_context(ident_def_p defs) { context_p result = NULL; int count = 0; if (defs == NULL || defs->next != NULL) return NULL; search_for_context_in_tree(defs->tree, &result, &count); if (count == 1) return result; return NULL; } bool verify_info = 0; void verify_identifiers_in_list(list_p list); void verify_identifiers_in_tree(context_p *local_context, tree_p tree) { if (tree == NULL) return; if (tree->type == tt_identuse || tree->type == tt_identfield) { context_p used_context = tree->type == tt_identuse ? cur_context : *local_context; char *ident_name = tree->c.parts->first->c.str_value; char *ident_class = tree->c.parts->next->first->c.str_value; context_entry_p context_entry; if (verify_info) { printf("%d.%d check %s of %s in ", tree->line, tree->column, ident_name, ident_class); print_context(stdout, used_context, TRUE); printf("\n"); } context_entry = find_identifier(used_context, ident_name, ident_class); if (context_entry == NULL) printf("Error at %d.%d: no definition for identifier '%s'\n", tree->line, tree->column, ident_name); else if (verify_info) { context_p new_local_context; { ident_def_p defs; for (defs = context_entry->defs; defs != NULL; defs = defs->next) { if (defs->tree != NULL && verify_info) printf("Uses definition from %d.%d\n", defs->tree->line, defs->tree->column); } } new_local_context = find_local_context(context_entry->defs); if (new_local_context != NULL) *local_context = new_local_context; else printf("Warning: no local context\n"); } } else if (type_can_have_parts(tree->type)) verify_identifiers_in_list(tree->c.parts); } void verify_identifiers_in_list(list_p list) { context_p local_context = NULL; for (; list != NULL; list = list->next) { tree_p tree = list->first; if (tree != NULL && tree->type != NULL) { if (tree->type == tt_opencontext) { cur_context = tree->c.context; if (verify_info) printf("%d.%d: open context\n", tree->line, tree->column); local_context = NULL; } else if (tree->type == tt_closecontext) { if (verify_info) printf("%d.%d: close context\n", tree->line, tree->column); if (cur_context != NULL) cur_context = cur_context->parent; local_context = NULL; } else verify_identifiers_in_tree(&local_context, tree); } } } /* Initialization ~~~~~~~~~~~~~~ */ void initialize() { /* init strings */ str_root = string("root"); str_nt_def = string("nt_def"); str_ident = string("ident"); str_identalone = string("identalone"); str_identdef = string("identdef"); str_identdefadd = string("identdefadd"); str_identuse = string("identuse"); str_identfield = string("identfield"); str_opencontext = string("opencontext"); str_closecontext = string("closecontext"); str_rule = string("rule"); str_opt = string("opt"); str_seq = string("seq"); str_list = string("list"); str_chain = string("chain"); str_prim_elem = string("prim_elem"); str_string = string("string"); str_cstring = string("cstring"); str_int = string("int"); str_double = string("double"); str_char = string("char"); str_eof = string("eof"); /* init terminal types */ add_literal_terminal(str_string, accept_string); add_string_terminal(str_cstring, accept_cstring); add_int_terminal(str_int, accept_int); add_double_terminal(str_double, accept_double); add_ident_terminal(str_ident, accept_ident); add_char_terminal(str_char, accept_char); /* for Transact-SQL: */ add_string_terminal(string("sql_string_single"), accept_single_sql_string); add_string_terminal(string("sql_string_double"), accept_double_sql_string); add_ident_terminal(string("sql_ident"), accept_sql_ident); add_ident_terminal(string("sql_vident"), accept_sql_var_ident); add_ident_terminal(string("sql_sysident"), accept_sql_sysvar_ident); add_ident_terminal(string("sql_label"), accept_sql_label_ident); } /* Command line interface ~~~~~~~~~~~~~~~~~~~~~~ The main procedure implements the command-line interface for IParse. */ int main(int argc, char *argv[]) { tree_p tree = NULL; int i; if (argc == 1) { printf("Usage: %s \n" "\n" " options\n" " -p print parse tree\n" " -o ouput tree to C file\n" " +ds debug scanning (full)\n" " +dss debug scanning (normal)\n" " -ds no debug scanning\n" " +dp debug parsing\n" " -dp no debug parsing\n" " +dn debug non-terminals\n" " -dn no debug non-terminals\n", argv[0]); return 0; } fprintf(stderr, "Iparser, Version: %s\n", VERSION); initialize(); init_IParse_grammar(&tree); for (i = 1; i < argc; i++) { char *arg = argv[i]; printf("Processing: %s\n", arg); if (!strcmp(arg, "+g")) grammar_level++; if (!strcmp(arg, "-p")) { printf("tree:\n"); print_tree(stdout, tree, FALSE); printf("\n--------------\n"); } else if (!strcmp(arg, "-pc")) { printf("tree:\n"); print_tree(stdout, tree, TRUE); printf("\n--------------\n"); } else if (!strcmp(arg, "-o") && i + 1 < argc) { char *file_name = argv[++i]; FILE *fout = !strcmp(file_name, "-") ? stdout : fopen(file_name, "w"); if (fout != NULL) { char *dot = strstr(file_name, "."); if (dot) *dot = '\0'; print_tree_to_c(fout, tree, file_name); fclose(fout); } else { printf("Cannot open: %s\n", file_name); return 0; } } else if (!strcmp(arg, "+ds")) debug_scan = 2; else if (!strcmp(arg, "+dss")) debug_scan = 1; else if (!strcmp(arg, "-ds")) debug_scan = 0; else if (!strcmp(arg, "+dp")) debug_parse = TRUE; else if (!strcmp(arg, "-dp")) debug_parse = FALSE; else if (!strcmp(arg, "+dn")) debug_nt = TRUE; else if (!strcmp(arg, "-dn")) debug_nt = FALSE; else { FILE *fin = fopen(arg, "r"); if (fin != NULL) { tree_p new_tree = NULL; init_scan(fin); make_all_nt(tree); tree_release(&tree); init_solutions(); if (!parse_nt(find_nt(str_root), &new_tree)) { print_last_pos(); return 0; } tree_assign(&tree, new_tree); tree_release(&new_tree); free_solutions(); keyword_flag *= 2; new_keyword_flag *= 2; grammar_level--; cur_context = NULL; build_contexts_for_tree(tree, tree); if (cur_context != NULL) printf("Error: Left open contexts\n"); { context_p last_identifier_context = NULL; cur_context = NULL; verify_identifiers_in_tree(&last_identifier_context, tree); } fclose(fin); } else { printf("Cannot open: %s\n", arg); return 0; } } } tree_release(&tree); if (alloced_trees != 0) printf("Error: alloced trees = %ld\n", alloced_trees); return 0; } /* Building a hard-code parser ~~~~~~~~~~~~~~~~~~~~~~~~~~~ IParse implements an interpretting parser. It can also be used to implement a parser for a fixed grammar. (Actually, that is the way how the parser of the input grammar is implemented.) In case your languages uses different lexical rules for the terminals, you will have to write your own scanning routines for all these terminals. For this you will have to modify this file. Read the sections on scanning for information how to do this. The next step for building a parser consist of writing the grammar, for example the file 'lang.gr'. To verify this grammar on some input file, for example 'test.lang' call IParse from the command line in the following manner: IParse lang.gr test.gr -p Once the grammar is correct, call IParse from the command line in the following manner: IParse lang.gr -o lang_grammar.c This will produce the procedure init_lang_grammar in the file lang_grammar.c. Now make a copy of this file and rename it to 'LangParse.c'. In this copy replace init_IParse_grammar with the code of init_lang_grammar. After 'LangParse.c' has been compiled, you can verify its working by calling it from the command line with: LangParse test.gr -p As a final step, the main procedure can be replaced by a procedure which parses a given input file to an Abstract Program Tree. An example of such a procedure is: bool parse_lang(FILE *fin, tree_p *r_tree) { tree_p lang_grammar_tree = NULL; bool okay; initialize(); init_lang_grammar(&lang_grammar_tree); make_all_nt(lang_grammar_tree); tree_release(&lang_grammar_tree); init_solutions(); init_scan(fin); okay = parse_nt(find_nt(str_root), r_tree); free_solutions(); fclose(fin); if (!okay) { print_last_pos(); return FALSE; } cur_context = NULL; build_contexts_for_tree(*r_tree, *r_tree); if (cur_context != NULL) printf("Error: Left open contexts\n"); { context_p last_identifier_context = NULL; cur_context = NULL; verify_identifiers_in_tree(&last_identifier_context, *r_tree); } fclose(fin); return TRUE; } This gives you a procedure which will parse a given file according to your grammar 'lang.gr' into an Abstract Program Tree, in which all identifiers have been resolved. Errors are reported on the standard output. If you want them to be reported differently, replace the procedure print_last_pos by something more fitting. If you would like to parse from a string buffer instead of from a file, you can replace init_scan by your own initialization routine. */