From ee833a5b2ebf9c473d343f5386e7a1888c7ffb44 Mon Sep 17 00:00:00 2001 From: Martin Stoilov Date: Sun, 19 Feb 2012 23:02:30 -0800 Subject: [PATCH] Work on REX documentation. --- doc/doxygen/rpa.cfg | 1 + rex/doc/example/js-tokenizer.c | 4 +- rex/doc/example/tokenjs.rexcc | 131 ++++++++++++++++++++++++++++++++++++++++ rex/doc/rex_main.txt | 2 +- rex/doc/rexcc.txt | 99 ++++++++++++++++++++++++++++++ rex/rexdb.h | 4 +- rex/rexdfa.h | 58 ++++++++++++++++++ rpa/doc/main.txt | 1 + tests/testrexcc/tokenjs.rexcc | 131 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 427 insertions(+), 4 deletions(-) create mode 100644 rex/doc/example/tokenjs.rexcc create mode 100644 rex/doc/rexcc.txt create mode 100644 tests/testrexcc/tokenjs.rexcc diff --git a/doc/doxygen/rpa.cfg b/doc/doxygen/rpa.cfg index 40c104f..a310c1f 100644 --- a/doc/doxygen/rpa.cfg +++ b/doc/doxygen/rpa.cfg @@ -108,6 +108,7 @@ INPUT = ../../rpa/rpadbex.h \ ../../rex/doc/rex_main.txt \ ../../rex/doc/rexdb.txt \ ../../rex/doc/rexdfa.txt \ + ../../rex/doc/rexcc.txt \ ../../rex/rexdb.h \ ../../rex/rexdfa.h \ diff --git a/rex/doc/example/js-tokenizer.c b/rex/doc/example/js-tokenizer.c index 1c75b2d..af53552 100644 --- a/rex/doc/example/js-tokenizer.c +++ b/rex/doc/example/js-tokenizer.c @@ -24,6 +24,8 @@ * * To run: * # echo "function add(a,b) { var c = a + b; return c; } print('здравей means hello');" | ./js-tokenizer + * + * Your terminal must use UTF8 encoding, something like: LANG=en_US.utf8 */ #include @@ -94,7 +96,7 @@ int main(int argc, char *argv[]) int token; if (!setlocale(LC_CTYPE, "")) { - printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n"); + printf("Failed to set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n"); return 1; } diff --git a/rex/doc/example/tokenjs.rexcc b/rex/doc/example/tokenjs.rexcc new file mode 100644 index 0000000..094e133 --- /dev/null +++ b/rex/doc/example/tokenjs.rexcc @@ -0,0 +1,131 @@ +/* + * Regular Pattern Analyzer (RPA) + * Copyright (c) 2009-2010 Martin Stoilov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Martin Stoilov + */ + +/* + * To build: + * # rexcc tokenjs.rexcc -o tokenjs.c + * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c + * + * To run: + * # echo "function add(a,b) { var c = a + b; return c; }" | ./tokenjs + */ + +#include +#include +#include + +#define TOKEN_SELF 256 +#define TOKEN_IDENTIFIER 257 +#define TOKEN_SPACE 258 +#define TOKEN_KEYWORD 259 +#define TOKEN_OPERATOR 260 +#define TOKEN_STRING 261 +#define TOKEN_DECIMAL 262 + + +%% +TOKEN_KEYWORD instanceof | typeof | break | do | new | var | case | else | \ + return | void | catch | finally | continue | for | \ + switch | while | this | with |debugger | function | throw | default | \ + if | try | delete | in | class | enum | extends | import | const | export | \ + implements | let | private | public | static | interface | package | protected + +TOKEN_IDENTIFIER ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \ + [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \ + [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])* + +TOKEN_OPERATOR === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \ + >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \ + [+] | - | [*] | % | [+][+] | -- | / | /= + +TOKEN_DECIMAL [1-9][0-9]* + +TOKEN_STRING '[^']*'|"[^"]*" + +TOKEN_SPACE [\t\r\n ]+ + +TOKEN_SELF [^\t\r\n+'" ] + + +%% + + +rexdfa_t *dfa = &ccdfa; + + +int get_token(wint_t *buffer, int size) +{ + rexdfss_t *acc_ss = NULL; + rexuint_t nstate = REX_DFA_STARTSTATE; + int ret = -1, i = 0; + wint_t wc; + + while ((wc = fgetwc(stdin)) != WEOF) { + if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) { + ungetc(wc, stdin); + break; + } + if (i + 1 < size) { + buffer[i++] = wc; + } + if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) { + /* + * The DFA is in accepting state, lets find out what exactly is + * being accepted. + * The token ID is recorder in the substate's userdata + * + * Note: There are may be more than one accepting substate, + * but we only check the first one (at offset 0). A real implementation + * might need to check the rest of the accepting substates(and userdata) + * to decide which one to use. + * + * Note: Some of the conflicts might be resolved simply be reordering + * the regular expressions. For example TOKEN_KEYWORD such as + * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because + * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first. + * + * Note: We will not break out of the loop here. We will keep going + * in order to find the longest match. + */ + acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0); + ret = (int) acc_ss->userdata; + if (ret == TOKEN_SELF) + ret = wc; + } + } + buffer[i++] = '\0'; + return ret; +} + +int main(int argc, char *argv[]) +{ + wint_t buffer[4000]; + int token; + + if (!setlocale(LC_CTYPE, "")) { + printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n"); + return 1; + } + while ((token = get_token(buffer, sizeof(buffer)/sizeof(buffer[0]))) > 0) { + if (token != TOKEN_SPACE) + fwprintf(stdout, L"token(%3d): %ls\n", token, buffer); + } + return 0; +} diff --git a/rex/doc/rex_main.txt b/rex/doc/rex_main.txt index e090d53..f8e6dd7 100644 --- a/rex/doc/rex_main.txt +++ b/rex/doc/rex_main.txt @@ -36,6 +36,6 @@ while (*str) { REX doesn't provide API for matching or searching directly, it is up to the user to decide how to implement whatever functionality they need using the automaton. -The JavaScript tokenizer example @ref js-tokenizer.c is a simple demonstration how to use the REX library for lexical analysis. +The JavaScript tokenizer example @ref js-tokenizer.c is a simple demonstration how to use the REX library for lexical analysis of UTF8 encoded text. */ \ No newline at end of file diff --git a/rex/doc/rexcc.txt b/rex/doc/rexcc.txt new file mode 100644 index 0000000..a75c8fc --- /dev/null +++ b/rex/doc/rexcc.txt @@ -0,0 +1,99 @@ +/** \page rexcc REX C code generator. +rexcc is a code generator for C language. It generates C code from regular expressions and +initializes Deterministic Finite Automata(DFA) rexdfa_t object. The rexcc program reads +user specified input file, for a description of the code to generate. It will produce a +C file or it will output the generated code to the standard output. + +

Input file format

+The rexcc input file consists of three sections, separated by a line containing only `%%'. + +@verbatim +C code prolog +%% +regular expressions +%% +C code epilog +@endverbatim + + +

C code prolog

+This section is used to include any header files or definitions that are required by the rest of the C code. + +

regular expressions

+This section is used to specify the regular expressions that will be used to generate and initialize the +Deterministic Finite Automata (DFA). This section contain series of regular expression definitions of the form: + +@verbatim +userdata regex +@endverbatim + +where userdata must be a user defined data of type @ref rexuserdata_t and regex must be a regular expression. +Both must be separated by space or tab. + +

C code epilog

+This section is used to add any C code that uses the @ref rexdfa_t object generated from the rules specified in +the previous section. The name of the generated variable of type @ref rexdfa_t is always `ccdfa' and it is declared +as static. If you need to access it outside of the generated file you should add code in this section that will +make such access possible. For example: + +@verbatim +rexdfa_t *mydfa = &ccdfa; +@endverbatim + +Or using accessor function: +@verbatim +rexdfa_t *GetMyDfaPtr() +{ + return &ccdfa; +} +@endverbatim + +

Example

+ +@code +#include "mydefinitions.h" +#define IDENTIFIER 257 + +%% +IDENTIFIER [A-Za-z_][A-Za-z_0-9]* +"keyword" while|do +256 [ \n\r\t] +%% + +/* All userdata used in the previous section, can be cast to rexuserdata_t. */ + +rexdfa_t *get_simple_dfa() +{ + return &ccdfa; +} + +@endcode + +The userdata specified for eache regular expression is used to identify that regular expression when the +automata arrives at an accepting state. + +@section build_rexcc_code Building the generated code +The code generated with rexcc doesn't require to be linked with the REX library, but it includes the header file @ref rexdfa.h. +This file provides the definitions of the DFA related structures used by the generated code and it also provides macros for +accessing the states and substates of the DFA. You must add the path to the @ref rexdfa.h header file to your default search path. + +List of macros: + - @ref REX_DFA_NEXT - Get the next state in the DFA for the specified input. + - @ref REX_DFA_STATE - Get a pointer to @ref rexdfa_t state. + - @ref REX_DFA_TRANSITION - Get a pointer to @ref rexdft_t transition. + - @ref REX_DFA_SUBSTATE - Get a pointer to @ref rexdfss_t substate. This works only if rexcc is instructed to generate the substates. + - @ref REX_DFA_ACCSUBSTATE - Get a pointer to @ref rexdfss_t accepting substate. + + + +

Example

+ - @ref tokenjs.rexcc - JavaScript tokenizer. + + + +@example tokenjs.rexcc + + + + +*/ \ No newline at end of file diff --git a/rex/rexdb.h b/rex/rexdb.h index 94a0f7f..d2b6773 100644 --- a/rex/rexdb.h +++ b/rex/rexdb.h @@ -108,8 +108,8 @@ void rex_db_destroy(rexdb_t *rexdb); * @param nfa NFA object. * @param prev This is the previous start state of the automata, returned from a previous call to this function. * If this is the first call to this function prev is ignored. - * @param str Regular expression string. - * @param size The size of the string to be parsed. + * @param str UTF8 encoded regular expression string. + * @param size The size of the regular expression string. * @param userdata The value of this parameter is stored in the accepting state of the NFA(which also becomes * a sub-state in an accepting DFA state). You can use this value to identify which of the many regular expressions * compiled into the automaton is actually matching. A DFA state can have multiple sub-states, this means it can have diff --git a/rex/rexdfa.h b/rex/rexdfa.h index 7518c1a..ee383c7 100644 --- a/rex/rexdfa.h +++ b/rex/rexdfa.h @@ -75,10 +75,68 @@ typedef enum { #define REX_DFA_DEADSTATE (0) /**< DFA Dead State ID, In rexdfa_t object the state at offset 0 is always the dead state */ #define REX_DFA_STARTSTATE (1) /**< DFA Start State ID, In rexdfa_t object the start state is always at offset 1 */ + +/** + * @def REX_DFA_STATE(__dfa__, __nstate__) + * + * Get a pointer to @ref rexdfa_t state. + * @param __dfa__ Pointer to @ref rexdfa_t object + * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_DEADSTATE, @ref REX_DFA_STARTSTATE + * @return Pointer to @ref rexdfa_t + */ #define REX_DFA_STATE(__dfa__, __nstate__) (&(__dfa__)->states[__nstate__]) + +/** + * @def REX_DFA_TRANSITION(__dfa__, __nstate__, __ntrans__) + * Get a pointer to @ref rexdft_t transition. This macro is used internally to find + * a transition to the next state. + * + * @param __dfa__ Pointer to @ref rexdfa_t object + * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_DEADSTATE, @ref REX_DFA_STARTSTATE + * @param __ntrans__ Transition offset in the array of transitions for the specified state. This parameter + * must not exceed rexdfs_t::ntrans. + * @return Pointer to @ref rexdft_t transition + */ #define REX_DFA_TRANSITION(__dfa__, __nstate__, __ntrans__) (&(__dfa__)->trans[(REX_DFA_STATE(__dfa__, __nstate__)->trans) + (__ntrans__)]) + +/** + * @def REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__) + * Get a pointer to @ref rexdfss_t sub-state. This macro would only work if the DFA + * is generated with its NFA sub-states. + * + * @param __dfa__ Pointer to @ref rexdfa_t object + * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_STARTSTATE + * @param __nsubstate__ Sub-state offset in the array of sub-states for the specified state. This parameter + * must not exceed rexdfs_t::nsubstates. + * @return Pointer to @ref rexdfss_t substate. + */ #define REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__) ((__dfa__)->substates ? &(__dfa__)->substates[REX_DFA_STATE(__dfa__, __nstate__)->substates + (__nsubstate__)] : ((rexdfss_t*)0)) + +/** + * @def REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__) + * Get a pointer to @ref rexdfss_t accepting sub-state. + * + * @param __dfa__ Pointer to @ref rexdfa_t object + * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_STARTSTATE + * @param __naccsubstate__ Sub-state offset in the array of accepting sub-states for the specified state. This parameter + * must not exceed rexdfs_t::naccsubstates. + * @return Pointer to @ref rexdfss_t accepting substate. + */ #define REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__) ((__dfa__)->accsubstates ? &(__dfa__)->accsubstates[REX_DFA_STATE(__dfa__, __nstate__)->accsubstates + (__naccsubstate__)] : ((rexdfss_t*)0)) + + +/** + * @def REX_DFA_NEXT(__dfa__, __nstate__, __input__) + * + * Get the next state ID in the DFA for the specified input. The macro will + * search through the transitions of the current state to find the next + * state of the DFA for the specified input. + * + * @param __dfa__ Pointer to @ref rexdfa_t object + * @param __nstate__ Current state of the DFA + * @param __input__ Current input + * @return The next state of the DFA for the specified input + */ #define REX_DFA_NEXT(__dfa__, __nstate__, __input__) \ ({ \ rexdft_t *t; \ diff --git a/rpa/doc/main.txt b/rpa/doc/main.txt index 0b18b27..78d8413 100644 --- a/rpa/doc/main.txt +++ b/rpa/doc/main.txt @@ -12,6 +12,7 @@ - @subpage rex_main "Regular Expressions (REX library)" - @subpage rexdb - @subpage rexdfa + - @subpage rexcc */ diff --git a/tests/testrexcc/tokenjs.rexcc b/tests/testrexcc/tokenjs.rexcc new file mode 100644 index 0000000..094e133 --- /dev/null +++ b/tests/testrexcc/tokenjs.rexcc @@ -0,0 +1,131 @@ +/* + * Regular Pattern Analyzer (RPA) + * Copyright (c) 2009-2010 Martin Stoilov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Martin Stoilov + */ + +/* + * To build: + * # rexcc tokenjs.rexcc -o tokenjs.c + * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c + * + * To run: + * # echo "function add(a,b) { var c = a + b; return c; }" | ./tokenjs + */ + +#include +#include +#include + +#define TOKEN_SELF 256 +#define TOKEN_IDENTIFIER 257 +#define TOKEN_SPACE 258 +#define TOKEN_KEYWORD 259 +#define TOKEN_OPERATOR 260 +#define TOKEN_STRING 261 +#define TOKEN_DECIMAL 262 + + +%% +TOKEN_KEYWORD instanceof | typeof | break | do | new | var | case | else | \ + return | void | catch | finally | continue | for | \ + switch | while | this | with |debugger | function | throw | default | \ + if | try | delete | in | class | enum | extends | import | const | export | \ + implements | let | private | public | static | interface | package | protected + +TOKEN_IDENTIFIER ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \ + [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \ + [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])* + +TOKEN_OPERATOR === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \ + >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \ + [+] | - | [*] | % | [+][+] | -- | / | /= + +TOKEN_DECIMAL [1-9][0-9]* + +TOKEN_STRING '[^']*'|"[^"]*" + +TOKEN_SPACE [\t\r\n ]+ + +TOKEN_SELF [^\t\r\n+'" ] + + +%% + + +rexdfa_t *dfa = &ccdfa; + + +int get_token(wint_t *buffer, int size) +{ + rexdfss_t *acc_ss = NULL; + rexuint_t nstate = REX_DFA_STARTSTATE; + int ret = -1, i = 0; + wint_t wc; + + while ((wc = fgetwc(stdin)) != WEOF) { + if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) { + ungetc(wc, stdin); + break; + } + if (i + 1 < size) { + buffer[i++] = wc; + } + if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) { + /* + * The DFA is in accepting state, lets find out what exactly is + * being accepted. + * The token ID is recorder in the substate's userdata + * + * Note: There are may be more than one accepting substate, + * but we only check the first one (at offset 0). A real implementation + * might need to check the rest of the accepting substates(and userdata) + * to decide which one to use. + * + * Note: Some of the conflicts might be resolved simply be reordering + * the regular expressions. For example TOKEN_KEYWORD such as + * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because + * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first. + * + * Note: We will not break out of the loop here. We will keep going + * in order to find the longest match. + */ + acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0); + ret = (int) acc_ss->userdata; + if (ret == TOKEN_SELF) + ret = wc; + } + } + buffer[i++] = '\0'; + return ret; +} + +int main(int argc, char *argv[]) +{ + wint_t buffer[4000]; + int token; + + if (!setlocale(LC_CTYPE, "")) { + printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n"); + return 1; + } + while ((token = get_token(buffer, sizeof(buffer)/sizeof(buffer[0]))) > 0) { + if (token != TOKEN_SPACE) + fwprintf(stdout, L"token(%3d): %ls\n", token, buffer); + } + return 0; +} -- 1.7.9.5