From 2659c7d81bb58ed7ec0349354a1b978f1290a29c Mon Sep 17 00:00:00 2001 From: Martin Stoilov Date: Mon, 20 Feb 2012 23:42:27 -0800 Subject: [PATCH] Work on documentation. --- doc/doxygen/rpa.cfg | 1 + rex/doc/example/tokenjs.rexcc | 20 ------ rex/doc/rexcc.txt | 14 ++++- rex/doc/rexgrep.txt | 140 +++++++++++++++++++++++++++++++++++++++++ rex/rexcompiler.c | 5 +- rex/rexdb.c | 8 +++ rex/rexdb.h | 1 + rex/rexdfa.h | 6 +- rexcc/rexcc.c | 70 ++++++++++++++++++++- rexcc/rexcc.h | 2 + rexcc/unix/main.c | 9 +++ rexgrep/unix/main.c | 11 ++-- rpa/doc/main.txt | 1 + tests/testrexcc/tokenjs.rexcc | 20 ------ 14 files changed, 256 insertions(+), 52 deletions(-) create mode 100644 rex/doc/rexgrep.txt diff --git a/doc/doxygen/rpa.cfg b/doc/doxygen/rpa.cfg index a310c1f..d855f86 100644 --- a/doc/doxygen/rpa.cfg +++ b/doc/doxygen/rpa.cfg @@ -109,6 +109,7 @@ INPUT = ../../rpa/rpadbex.h \ ../../rex/doc/rexdb.txt \ ../../rex/doc/rexdfa.txt \ ../../rex/doc/rexcc.txt \ + ../../rex/doc/rexgrep.txt \ ../../rex/rexdb.h \ ../../rex/rexdfa.h \ diff --git a/rex/doc/example/tokenjs.rexcc b/rex/doc/example/tokenjs.rexcc index 094e133..1c2d7c8 100644 --- a/rex/doc/example/tokenjs.rexcc +++ b/rex/doc/example/tokenjs.rexcc @@ -1,24 +1,4 @@ /* - * Regular Pattern Analyzer (RPA) - * Copyright (c) 2009-2010 Martin Stoilov - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Martin Stoilov - */ - -/* * To build: * # rexcc tokenjs.rexcc -o tokenjs.c * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c diff --git a/rex/doc/rexcc.txt b/rex/doc/rexcc.txt index a75c8fc..729a810 100644 --- a/rex/doc/rexcc.txt +++ b/rex/doc/rexcc.txt @@ -89,7 +89,19 @@ List of macros:

Example

- @ref tokenjs.rexcc - JavaScript tokenizer. - +@section rexcc_command_line rexcc parameters +@verbatim +# rexcc [OPTIONS] + OPTIONS: + -o Output .c file. + -d Dump regular expressions. + -D Dump DFA states. + -N Dump NFA states. + -s Include substates. + -t Display statistics. + -v Display version information. + -h, --help Display this help. +@endverbatim @example tokenjs.rexcc diff --git a/rex/doc/rexgrep.txt b/rex/doc/rexgrep.txt new file mode 100644 index 0000000..db4fc6a --- /dev/null +++ b/rex/doc/rexgrep.txt @@ -0,0 +1,140 @@ +/** \page rexgrep REX Grep +rexgrep is a program for matching and searching strings in files using +the REX library. It is intended as a front end for the REX library and provides some command +line switches to help with using the REX library. For example, rexgrep can display the generated +Deterministic Finite Automata (DFA) or Non-Deterministic Finite Automata (NFA) from a +regular expression. + +Example 1 - Display DFA: +@verbatim +# rexgrep -e "hel+o" -D +State 0 (): REX_STATETYPE_DEAD + [0x0 - 0xFFFFFFFF] -> 0 + +State 1 (): REX_STATETYPE_START + [0x0 - 0x67] -> 0 + 'h' -> 2 + [0x69 - 0xFFFFFFFF] -> 0 + +State 2 (): + [0x0 - 0x64] -> 0 + 'e' -> 3 + [0x66 - 0xFFFFFFFF] -> 0 + +State 3 (): + [0x0 - 0x6B] -> 0 + 'l' -> 4 + [0x6D - 0xFFFFFFFF] -> 0 + +State 4 (): + [0x0 - 0x6B] -> 0 + 'l' -> 4 + [m - n] -> 0 + 'o' -> 5 + [0x70 - 0xFFFFFFFF] -> 0 + +State 5 (): REX_STATETYPE_ACCEPT (5*) + [0x0 - 0xFFFFFFFF] -> 0 + +@endverbatim + +Example 2 - Display NFA. Note: if the regular expression characters are not ASCII, their hex value will be displayed. +Wide characters are encoded within the state the same way as ASCII characters; they are just displayed with their hex value. 
+@verbatim +# rexgrep -e "здравей" -N -D +State 0: REX_STATETYPE_START + 0x437 -> 1 + +State 1: + 0x434 -> 2 + +State 2: + 0x440 -> 3 + +State 3: + 0x430 -> 4 + +State 4: + 0x432 -> 5 + +State 5: + 0x435 -> 6 + +State 6: + 0x439 -> 7 + +State 7: REX_STATETYPE_ACCEPT + (none) + +@endverbatim + +Example 3 - Display DFA with the NFA substates. +@verbatim +# rexgrep -e "здравей" -S -D +State 0 (): REX_STATETYPE_DEAD + [0x0 - 0xFFFFFFFF] -> 0 + +State 1 (0): REX_STATETYPE_START + [0x0 - 0x436] -> 0 + 0x437 -> 2 + [0x438 - 0xFFFFFFFF] -> 0 + +State 2 (1): + [0x0 - 0x433] -> 0 + 0x434 -> 3 + [0x435 - 0xFFFFFFFF] -> 0 + +State 3 (2): + [0x0 - 0x43F] -> 0 + 0x440 -> 4 + [0x441 - 0xFFFFFFFF] -> 0 + +State 4 (3): + [0x0 - 0x42F] -> 0 + 0x430 -> 5 + [0x431 - 0xFFFFFFFF] -> 0 + +State 5 (4): + [0x0 - 0x431] -> 0 + 0x432 -> 6 + [0x433 - 0xFFFFFFFF] -> 0 + +State 6 (5): + [0x0 - 0x434] -> 0 + 0x435 -> 7 + [0x436 - 0xFFFFFFFF] -> 0 + +State 7 (6): + [0x0 - 0x438] -> 0 + 0x439 -> 8 + [0x43A - 0xFFFFFFFF] -> 0 + +State 8 (7*): REX_STATETYPE_ACCEPT (7*) + [0x0 - 0xFFFFFFFF] -> 0 + + +@endverbatim + +Here is a complete list of available command line options. +@verbatim +Usage: + rexgrep [OPTIONS] + OPTIONS: + -e patterns Regular Expression. + -f patternfile Read Regular Expressions from a file. + -b binfile Use DFA from binfile. + -c Compile DFA and save to binfile. Use -b option to specify the name of the file. + -o, --only-matching Show only the part of a line matching PATTERN + -l Line mode. + -N Use NFA. + -D Dump states. + -S Include DFA substates. + -q Quiet mode. + -t Display statistics. Works only when built in DEBUG mode. + -s string Search in string. + -v Display version information. + -h, --help Display this help. 
+@endverbatim + + +*/ \ No newline at end of file diff --git a/rex/rexcompiler.c b/rex/rexcompiler.c index 0eb7fb6..71e5658 100644 --- a/rex/rexcompiler.c +++ b/rex/rexcompiler.c @@ -600,7 +600,10 @@ long rex_compiler_expression_s(rexcompiler_t *co, rexdb_t *rexdb, const char *st long rex_compiler_addexpression(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, unsigned int size, rexuserdata_t userdata) { rexstate_t *sprev = NULL, *scur = NULL; - long cur = rex_compiler_expression(co, rexdb, str, size, userdata); + long cur; + if (r_array_empty(rexdb->states)) + prev = -1UL; + cur = rex_compiler_expression(co, rexdb, str, size, userdata); if (cur < 0) return -1; sprev = rex_db_getstate(rexdb, prev); diff --git a/rex/rexdb.c b/rex/rexdb.c index 006cfc2..b15415b 100644 --- a/rex/rexdb.c +++ b/rex/rexdb.c @@ -406,3 +406,11 @@ rexdfa_t *rex_db_todfa(rexdb_t *db, int withsubstates) R_ASSERT(ctx.naccsubstates == naccsubstates); return dfa; } + + +int rex_db_isempty(rexdb_t *db) +{ + if (!db) + return 0; + return r_array_length(db->states) ? 0 : 1; +} diff --git a/rex/rexdb.h b/rex/rexdb.h index d2b6773..8ee0b54 100644 --- a/rex/rexdb.h +++ b/rex/rexdb.h @@ -139,6 +139,7 @@ long rex_db_numtransitions(rexdb_t *rexdb); long rex_db_numstates(rexdb_t *rexdb); long rex_db_numsubstates(rexdb_t *rexdb); long rex_db_numaccsubstates(rexdb_t *rexdb); +int rex_db_isempty(rexdb_t *db); const char *rex_db_version(); /** diff --git a/rex/rexdfa.h b/rex/rexdfa.h index ee383c7..9154bfb 100644 --- a/rex/rexdfa.h +++ b/rex/rexdfa.h @@ -94,7 +94,7 @@ typedef enum { * @param __dfa__ Pointer to @ref rexdfa_t object * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_DEADSTATE, @ref REX_DFA_STARTSTATE * @param __ntrans__ Transition offset in the array of transitions for the specified state. This parameter - * must not exceed rexdfs_t::ntrans. + * must be from 0 to rexdfs_t::ntrans - 1. 
* @return Pointer to @ref rexdft_t transition */ #define REX_DFA_TRANSITION(__dfa__, __nstate__, __ntrans__) (&(__dfa__)->trans[(REX_DFA_STATE(__dfa__, __nstate__)->trans) + (__ntrans__)]) @@ -107,7 +107,7 @@ typedef enum { * @param __dfa__ Pointer to @ref rexdfa_t object * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_STARTSTATE * @param __nsubstate__ Sub-state offset in the array of sub-states for the specified state. This parameter - * must not exceed rexdfs_t::nsubstates. + * must be from 0 to rexdfs_t::nsubstates - 1. * @return Pointer to @ref rexdfss_t substate. */ #define REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__) ((__dfa__)->substates ? &(__dfa__)->substates[REX_DFA_STATE(__dfa__, __nstate__)->substates + (__nsubstate__)] : ((rexdfss_t*)0)) @@ -119,7 +119,7 @@ typedef enum { * @param __dfa__ Pointer to @ref rexdfa_t object * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_STARTSTATE * @param __naccsubstate__ Sub-state offset in the array of accepting sub-states for the specified state. This parameter - * must not exceed rexdfs_t::naccsubstates. + * must be from 0 to rexdfs_t::naccsubstates - 1. * @return Pointer to @ref rexdfss_t accepting substate. */ #define REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__) ((__dfa__)->accsubstates ? &(__dfa__)->accsubstates[REX_DFA_STATE(__dfa__, __nstate__)->accsubstates + (__naccsubstate__)] : ((rexdfss_t*)0)) diff --git a/rexcc/rexcc.c b/rexcc/rexcc.c index 4eddb2d..0915ae1 100644 --- a/rexcc/rexcc.c +++ b/rexcc/rexcc.c @@ -41,6 +41,7 @@ struct tokeninfo_s { struct parseinfo_s { rbuffer_t id; rbuffer_t regex; + int line; }; @@ -152,6 +153,32 @@ int rex_cc_fprintf(FILE *out, int indent, const char *format, ...) 
} +static void rex_cc_output_gpl(FILE *out) +{ + static char *gpl = + "/*\n" + " * Regular Pattern Analyzer Toolkit(RPA/Tk)\n" + " * Copyright (c) 2009-2012 Martin Stoilov\n" + " *\n" + " * This program is free software: you can redistribute it and/or modify\n" + " * it under the terms of the GNU General Public License as published by\n" + " * the Free Software Foundation, either version 3 of the License, or\n" + " * (at your option) any later version.\n" + " *\n" + " * This program is distributed in the hope that it will be useful,\n" + " * but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" + " * GNU General Public License for more details.\n" + " *\n" + " * You should have received a copy of the GNU General Public License\n" + " * along with this program. If not, see .\n" + " *\n" + " * Martin Stoilov \n" + " */\n"; + rex_cc_fprintf(out, 0, "%s\n", gpl); +} + + static int rex_cc_output_statesubstates(rexcc_t *pCC, FILE *out, long nstate) { long i; @@ -294,8 +321,8 @@ static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out) int rex_cc_output(rexcc_t *pCC, FILE *outc) { - if (outc) { + rex_cc_output_gpl(outc); rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n"); if (pCC->prolog.size) { fwrite(pCC->prolog.s, 1, pCC->prolog.size, outc); @@ -319,6 +346,18 @@ int rex_cc_output(rexcc_t *pCC, FILE *outc) } +static int rex_cc_getlineno(rexcc_t *pCC, const char *input) +{ + int ret = 1; + + while (--input >= pCC->start) { + if (*input == '\n') + ret += 1; + } + return ret; +} + + int rex_cc_gettoken(rexcc_t *pCC) { ruint32 wc = 0; @@ -389,6 +428,7 @@ static int rex_cc_parseline(rexcc_t *pCC) /* * Unexpected char. 
*/ + fprintf(stdout, "Line %d, (%s) Unexpected Char.\n", rex_cc_getlineno(pCC, pCC->input), "Error"); return -1; } if (rex_cc_parseregex(pCC, &info) < 0) @@ -403,8 +443,9 @@ int rex_cc_parse(rexcc_t *pCC) pCC->prolog.s = pCC->input; pCC->prolog.size = 0; while (pCC->input + 3 < pCC->end) { - if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n'))) + if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n'))) { break; + } pCC->prolog.size += 1; pCC->input += 1; } @@ -416,7 +457,8 @@ int rex_cc_parse(rexcc_t *pCC) if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE || pCC->token == REXCC_TOKEN_REGEX) { rex_cc_gettoken(pCC); } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) { - rex_cc_parseline(pCC); + if (rex_cc_parseline(pCC) < 0) + return -1; } else if (pCC->token == REXCC_TOKEN_DELIMITER) { rex_cc_gettoken(pCC); return 0; @@ -424,6 +466,7 @@ int rex_cc_parse(rexcc_t *pCC) /* * Unexpected char */ + fprintf(stdout, "Line %d, (%s) Unexpected Char.\n", rex_cc_getlineno(pCC, pCC->input), "Error"); return -1; } pCC->epilog.s = pCC->input; @@ -438,6 +481,7 @@ int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text) int ret = 0, i; struct parseinfo_s *pi; + pCC->start = text->s; pCC->input = text->s; pCC->end = text->s + text->size; r_array_setlength(pCC->parseinfo, 0); @@ -445,6 +489,11 @@ int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text) for (i = 0; i < r_array_length(pCC->parseinfo); i++) { pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i); if (rex_cc_load_pattern(pCC, &pi->regex, i) < 0) { + fprintf(stdout, "Line %d, (%s) Syntax error: ", rex_cc_getlineno(pCC, pi->id.s), "Error"); + fwrite(pi->id.s, 1, pi->id.size, stdout); + fprintf(stdout, " "); + fwrite(pi->regex.s, 1, pi->regex.size, stdout); + fprintf(stdout, "\n"); return -1; } #if 0 @@ -457,3 +506,18 @@ int rex_cc_load_buffer(rexcc_t 
*pCC, rbuffer_t *text) } return ret; } + + +void rex_cc_parseinfodump(rexcc_t *pCC) +{ + long i; + struct parseinfo_s *pi; + + for (i = 0; i < r_array_length(pCC->parseinfo); i++) { + pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i); + fwrite(pi->id.s, 1, pi->id.size, stdout); + fprintf(stdout, " "); + fwrite(pi->regex.s, 1, pi->regex.size, stdout); + fprintf(stdout, "\n"); + } +} diff --git a/rexcc/rexcc.h b/rexcc/rexcc.h index 5f74f57..852df9c 100644 --- a/rexcc/rexcc.h +++ b/rexcc/rexcc.h @@ -46,6 +46,7 @@ typedef struct rexcc_s { char *temp; int token; char *input; + char *start; char *end; char *tokenptr; int tokenlen; @@ -58,6 +59,7 @@ int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata); int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text); int rex_cc_output(rexcc_t *pCC, FILE *outc); rexdfa_t * rex_cc_tokensdfa(); +void rex_cc_parseinfodump(rexcc_t *pCC); #ifdef __cplusplus diff --git a/rexcc/unix/main.c b/rexcc/unix/main.c index 3923cf2..877e0af 100644 --- a/rexcc/unix/main.c +++ b/rexcc/unix/main.c @@ -91,6 +91,7 @@ int usage(int argc, const char *argv[]) fprintf(stderr, "Usage: \n %s [OPTIONS] \n", argv[0]); fprintf(stderr, " OPTIONS:\n"); fprintf(stderr, "\t-o Output .c file.\n"); + fprintf(stderr, "\t-d Dump regular expressions.\n"); fprintf(stderr, "\t-D Dump DFA states.\n"); fprintf(stderr, "\t-N Dump NFA states.\n"); fprintf(stderr, "\t-s Include substates.\n"); @@ -207,6 +208,11 @@ int main(int argc, const char *argv[]) } } for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-d") == 0) { + dumponly = 3; + } + } + for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { rbuffer_t *text = rex_buffer_map_file(argv[i]); if (text) { @@ -221,6 +227,9 @@ int main(int argc, const char *argv[]) rex_db_destroy(tempdb); if (pCC->dfa && !dumponly) rex_cc_output(pCC, cfile); + if (dumponly == 3) { + rex_cc_parseinfodump(pCC); + } } r_buffer_destroy(text); } else { diff --git a/rexgrep/unix/main.c b/rexgrep/unix/main.c index 
b8c574e..48a983d 100644 --- a/rexgrep/unix/main.c +++ b/rexgrep/unix/main.c @@ -55,8 +55,8 @@ int usage(int argc, const char *argv[]) fprintf(stderr, "\t-D Dump states.\n"); fprintf(stderr, "\t-S Include DFA substates.\n"); fprintf(stderr, "\t-q Quiet mode.\n"); - fprintf(stderr, "\t-t Display time elapsed.\n"); - fprintf(stderr, "\t-s string Scan string.\n"); + fprintf(stderr, "\t-t Display statistics. Works only when built in DEBUG mode.\n"); + fprintf(stderr, "\t-s string Search in string.\n"); fprintf(stderr, "\t-v Display version information.\n"); fprintf(stderr, "\t-h, --help Display this help.\n"); @@ -251,7 +251,7 @@ int main(int argc, const char *argv[]) fclose(pfile); } - if (!pGrep->dfa && pGrep->usedfa) { + if (!pGrep->dfa && !rex_db_isempty(pGrep->nfa) && pGrep->usedfa) { rexdb_t *dfadb = rex_db_createdfa(pGrep->nfa, pGrep->startuid); pGrep->dfa = rex_db_todfa(dfadb, pGrep->withsubstates); rex_db_destroy(dfadb); @@ -297,7 +297,10 @@ int main(int argc, const char *argv[]) fclose(pfile); goto end; } - + if (!pGrep->dfa && pGrep->usedfa) + goto end; + if (rex_db_isempty(pGrep->nfa) && !pGrep->usedfa) + goto end; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-s") == 0) { if (++i < argc) { diff --git a/rpa/doc/main.txt b/rpa/doc/main.txt index 78d8413..f246ce4 100644 --- a/rpa/doc/main.txt +++ b/rpa/doc/main.txt @@ -13,6 +13,7 @@ - @subpage rexdb - @subpage rexdfa - @subpage rexcc + - @subpage rexgrep */ diff --git a/tests/testrexcc/tokenjs.rexcc b/tests/testrexcc/tokenjs.rexcc index 094e133..1c2d7c8 100644 --- a/tests/testrexcc/tokenjs.rexcc +++ b/tests/testrexcc/tokenjs.rexcc @@ -1,24 +1,4 @@ /* - * Regular Pattern Analyzer (RPA) - * Copyright (c) 2009-2010 Martin Stoilov - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Martin Stoilov - */ - -/* * To build: * # rexcc tokenjs.rexcc -o tokenjs.c * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c -- 1.7.9.5