../../rex/doc/rex_main.txt \
../../rex/doc/rexdb.txt \
../../rex/doc/rexdfa.txt \
+ ../../rex/doc/rexcc.txt \
../../rex/rexdb.h \
../../rex/rexdfa.h \
*
* To run:
* # echo "function add(a,b) { var c = a + b; return c; } print('здравей means hello');" | ./js-tokenizer
+ *
+ * Your terminal must use UTF8 encoding, something like: LANG=en_US.utf8
*/
#include <stdio.h>
int token;
if (!setlocale(LC_CTYPE, "")) {
- printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
+ printf("Failed to set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
return 1;
}
--- /dev/null
+/*
+ * Regular Pattern Analyzer (RPA)
+ * Copyright (c) 2009-2010 Martin Stoilov
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Martin Stoilov <martin@rpasearch.com>
+ */
+
+/*
+ * To build:
+ * # rexcc tokenjs.rexcc -o tokenjs.c
+ * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c
+ *
+ * To run:
+ * # echo "function add(a,b) { var c = a + b; return c; }" | ./tokenjs
+ */
+
+#include <stdio.h>
+#include <wchar.h>
+#include <locale.h>
+
+#define TOKEN_SELF 256
+#define TOKEN_IDENTIFIER 257
+#define TOKEN_SPACE 258
+#define TOKEN_KEYWORD 259
+#define TOKEN_OPERATOR 260
+#define TOKEN_STRING 261
+#define TOKEN_DECIMAL 262
+
+
+%%
+TOKEN_KEYWORD instanceof | typeof | break | do | new | var | case | else | \
+ return | void | catch | finally | continue | for | \
+ switch | while | this | with |debugger | function | throw | default | \
+ if | try | delete | in | class | enum | extends | import | const | export | \
+ implements | let | private | public | static | interface | package | protected
+
+TOKEN_IDENTIFIER ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \
+ [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \
+ [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*
+
+TOKEN_OPERATOR === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \
+ >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \
+ [+] | - | [*] | % | [+][+] | -- | / | /=
+
+TOKEN_DECIMAL [1-9][0-9]*
+
+TOKEN_STRING '[^']*'|"[^"]*"
+
+TOKEN_SPACE [\t\r\n ]+
+
+TOKEN_SELF [^\t\r\n+'" ]
+
+
+%%
+
+
+rexdfa_t *dfa = &ccdfa;
+
+
+/*
+ * Read one token from stdin using the generated DFA.
+ *
+ * Wide characters are fed to the automaton until it reaches the dead state
+ * or the input ends. The character that led to the dead state is pushed
+ * back so that the next call starts with it. The token ID is refreshed every
+ * time an accepting state is reached, so the last (longest) match wins.
+ *
+ * buffer - receives the consumed characters; it is 0-terminated whenever
+ *          size > 0. Characters that do not fit are consumed but not stored.
+ * size   - capacity of buffer, in elements.
+ *
+ * Returns the token ID (the matched character itself for TOKEN_SELF), or -1
+ * if no accepting state was reached.
+ *
+ * Note: characters consumed after the last accepting state are not pushed
+ * back; they stay in buffer even though they are not part of the match.
+ */
+int get_token(wint_t *buffer, int size)
+{
+	rexdfss_t *acc_ss = NULL;
+	rexuint_t nstate = REX_DFA_STARTSTATE;
+	int ret = -1, i = 0;
+	wint_t wc;
+
+	while ((wc = fgetwc(stdin)) != WEOF) {
+		if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) {
+			/*
+			 * stdin became wide-oriented with the first fgetwc() call, so the
+			 * push back must use the wide variant. ungetc() would also truncate
+			 * any character that does not fit in a single byte.
+			 */
+			ungetwc(wc, stdin);
+			break;
+		}
+		if (i + 1 < size) {
+			buffer[i++] = wc;
+		}
+		if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
+			/*
+			 * The DFA is in an accepting state, let's find out what exactly is
+			 * being accepted.
+			 * The token ID is recorded in the substate's userdata.
+			 *
+			 * Note: There may be more than one accepting substate,
+			 * but we only check the first one (at offset 0). A real implementation
+			 * might need to check the rest of the accepting substates (and userdata)
+			 * to decide which one to use.
+			 *
+			 * Note: Some of the conflicts might be resolved simply by reordering
+			 * the regular expressions. For example TOKEN_KEYWORD such as
+			 * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because
+			 * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first.
+			 *
+			 * Note: We will not break out of the loop here. We will keep going
+			 * in order to find the longest match.
+			 */
+			acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
+			/* The macro yields a null pointer when the DFA has no accepting substates. */
+			if (acc_ss) {
+				ret = (int) acc_ss->userdata;
+				if (ret == TOKEN_SELF)
+					ret = (int) wc;
+			}
+		}
+	}
+	/* i never exceeds size - 1 when size > 0, so the terminator always fits. */
+	if (size > 0)
+		buffer[i] = L'\0';
+	return ret;
+}
+
+/*
+ * Tokenize stdin and print every token except white space.
+ * The loop ends as soon as get_token() returns a value that is not positive.
+ */
+int main(int argc, char *argv[])
+{
+	wint_t text[4000];
+	const int capacity = sizeof(text)/sizeof(text[0]);
+	int id;
+
+	if (setlocale(LC_CTYPE, "") == NULL) {
+		printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
+		return 1;
+	}
+	for (;;) {
+		id = get_token(text, capacity);
+		if (id <= 0)
+			break;
+		if (id == TOKEN_SPACE)
+			continue;
+		fwprintf(stdout, L"token(%3d): %ls\n", id, text);
+	}
+	return 0;
+}
REX doesn't provide API for matching or searching directly, it is up to the user to decide how to
implement whatever functionality they need using the automaton.
-The JavaScript tokenizer example @ref js-tokenizer.c is a simple demonstration how to use the REX library for lexical analysis.
+The JavaScript tokenizer example @ref js-tokenizer.c is a simple demonstration of how to use the REX library for lexical analysis of UTF8 encoded text.
*/
\ No newline at end of file
--- /dev/null
+/** \page rexcc REX C code generator.
+rexcc is a code generator for the C language. It generates C code from regular expressions; the generated
+code initializes a Deterministic Finite Automaton (DFA) object of type rexdfa_t. The rexcc program reads
+a user specified input file that describes the code to generate. It will either produce a
+C file or write the generated code to the standard output.
+
+<h2>Input file format</h2>
+The rexcc input file consists of three sections, separated by a line containing only `%%'.
+
+@verbatim
+C code prolog
+%%
+regular expressions
+%%
+C code epilog
+@endverbatim
+
+
+<h3>C code prolog</h3>
+This section is used to include any header files or definitions that are required by the rest of the C code.
+
+<h3>regular expressions</h3>
+This section is used to specify the regular expressions that will be used to generate and initialize the
+Deterministic Finite Automaton (DFA). This section contains a series of regular expression definitions of the form:
+
+@verbatim
+userdata regex
+@endverbatim
+
+where userdata must be a user defined data of type @ref rexuserdata_t and regex must be a regular expression.
+Both must be separated by space or tab.
+
+<h3>C code epilog</h3>
+This section is used to add any C code that uses the @ref rexdfa_t object generated from the rules specified in
+the previous section. The name of the generated variable of type @ref rexdfa_t is always `ccdfa' and it is declared
+as static. If you need to access it outside of the generated file you should add code in this section that will
+make such access possible. For example:
+
+@verbatim
+rexdfa_t *mydfa = &ccdfa;
+@endverbatim
+
+Or using accessor function:
+@verbatim
+rexdfa_t *GetMyDfaPtr()
+{
+ return &ccdfa;
+}
+@endverbatim
+
+<h2>Example</h2>
+
+@code
+#include "mydefinitions.h"
+#define IDENTIFIER 257
+
+%%
+IDENTIFIER [A-Za-z_][A-Za-z_0-9]*
+"keyword" while|do
+256 [ \n\r\t]
+%%
+
+/* All userdata used in the previous section, can be cast to rexuserdata_t. */
+
+rexdfa_t *get_simple_dfa()
+{
+ return &ccdfa;
+}
+
+@endcode
+
+The userdata specified for each regular expression is used to identify that regular expression when the
+automaton arrives at an accepting state.
+
+@section build_rexcc_code Building the generated code
+The code generated with rexcc does not need to be linked with the REX library, but it includes the header file @ref rexdfa.h.
+This file provides the definitions of the DFA related structures used by the generated code and it also provides macros for
+accessing the states and substates of the DFA. You must add the path to the @ref rexdfa.h header file to your include search path.
+
+List of macros:
+ - @ref REX_DFA_NEXT - Get the next state in the DFA for the specified input.
+ - @ref REX_DFA_STATE - Get a pointer to @ref rexdfs_t state.
+ - @ref REX_DFA_TRANSITION - Get a pointer to @ref rexdft_t transition.
+ - @ref REX_DFA_SUBSTATE - Get a pointer to @ref rexdfss_t substate. This works only if rexcc is instructed to generate the substates.
+ - @ref REX_DFA_ACCSUBSTATE - Get a pointer to @ref rexdfss_t accepting substate.
+
+
+
+<h2>Example</h2>
+ - @ref tokenjs.rexcc - JavaScript tokenizer.
+
+
+
+@example tokenjs.rexcc
+
+
+
+
+*/
\ No newline at end of file
* @param nfa NFA object.
* @param prev This is the previous start state of the automata, returned from a previous call to this function.
* If this is the first call to this function prev is ignored.
- * @param str Regular expression string.
- * @param size The size of the string to be parsed.
+ * @param str UTF8 encoded regular expression string.
+ * @param size The size of the regular expression string.
* @param userdata The value of this parameter is stored in the accepting state of the NFA(which also becomes
* a sub-state in an accepting DFA state). You can use this value to identify which of the many regular expressions
* compiled into the automaton is actually matching. A DFA state can have multiple sub-states, this means it can have
#define REX_DFA_DEADSTATE (0) /**< DFA Dead State ID, In rexdfa_t object the state at offset 0 is always the dead state */
#define REX_DFA_STARTSTATE (1) /**< DFA Start State ID, In rexdfa_t object the start state is always at offset 1 */
+
+/**
+ * @def REX_DFA_STATE(__dfa__, __nstate__)
+ *
+ * Get a pointer to @ref rexdfs_t state.
+ * @param __dfa__ Pointer to @ref rexdfa_t object
+ * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_DEADSTATE, @ref REX_DFA_STARTSTATE
+ * @return Pointer to @ref rexdfs_t state
+ */
#define REX_DFA_STATE(__dfa__, __nstate__) (&(__dfa__)->states[__nstate__])
+
+/**
+ * @def REX_DFA_TRANSITION(__dfa__, __nstate__, __ntrans__)
+ * Get a pointer to @ref rexdft_t transition. This macro is used internally to find
+ * a transition to the next state.
+ *
+ * @param __dfa__ Pointer to @ref rexdfa_t object
+ * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_DEADSTATE, @ref REX_DFA_STARTSTATE
+ * @param __ntrans__ Transition offset in the array of transitions for the specified state. This parameter
+ * must not exceed rexdfs_t::ntrans.
+ * @return Pointer to @ref rexdft_t transition
+ */
#define REX_DFA_TRANSITION(__dfa__, __nstate__, __ntrans__) (&(__dfa__)->trans[(REX_DFA_STATE(__dfa__, __nstate__)->trans) + (__ntrans__)])
+
+/**
+ * @def REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__)
+ * Get a pointer to @ref rexdfss_t sub-state. This macro would only work if the DFA
+ * is generated with its NFA sub-states.
+ *
+ * @param __dfa__ Pointer to @ref rexdfa_t object
+ * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_STARTSTATE
+ * @param __nsubstate__ Sub-state offset in the array of sub-states for the specified state. This parameter
+ * must not exceed rexdfs_t::nsubstates.
+ * @return Pointer to @ref rexdfss_t substate.
+ */
#define REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__) ((__dfa__)->substates ? &(__dfa__)->substates[REX_DFA_STATE(__dfa__, __nstate__)->substates + (__nsubstate__)] : ((rexdfss_t*)0))
+
+/**
+ * @def REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__)
+ * Get a pointer to @ref rexdfss_t accepting sub-state.
+ *
+ * @param __dfa__ Pointer to @ref rexdfa_t object
+ * @param __nstate__ State ID returned from @ref REX_DFA_NEXT or @ref REX_DFA_STARTSTATE
+ * @param __naccsubstate__ Sub-state offset in the array of accepting sub-states for the specified state. This parameter
+ * must not exceed rexdfs_t::naccsubstates.
+ * @return Pointer to @ref rexdfss_t accepting substate.
+ */
#define REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__) ((__dfa__)->accsubstates ? &(__dfa__)->accsubstates[REX_DFA_STATE(__dfa__, __nstate__)->accsubstates + (__naccsubstate__)] : ((rexdfss_t*)0))
+
+
+/**
+ * @def REX_DFA_NEXT(__dfa__, __nstate__, __input__)
+ *
+ * Get the next state ID in the DFA for the specified input. The macro will
+ * search through the transitions of the current state to find the next
+ * state of the DFA for the specified input.
+ *
+ * @param __dfa__ Pointer to @ref rexdfa_t object
+ * @param __nstate__ Current state of the DFA
+ * @param __input__ Current input
+ * @return The next state of the DFA for the specified input
+ */
#define REX_DFA_NEXT(__dfa__, __nstate__, __input__) \
({ \
rexdft_t *t; \
- @subpage rex_main "Regular Expressions (REX library)"
- @subpage rexdb
- @subpage rexdfa
+ - @subpage rexcc
*/
--- /dev/null
+/*
+ * Regular Pattern Analyzer (RPA)
+ * Copyright (c) 2009-2010 Martin Stoilov
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Martin Stoilov <martin@rpasearch.com>
+ */
+
+/*
+ * To build:
+ * # rexcc tokenjs.rexcc -o tokenjs.c
+ * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c
+ *
+ * To run:
+ * # echo "function add(a,b) { var c = a + b; return c; }" | ./tokenjs
+ */
+
+#include <stdio.h>
+#include <wchar.h>
+#include <locale.h>
+
+#define TOKEN_SELF 256
+#define TOKEN_IDENTIFIER 257
+#define TOKEN_SPACE 258
+#define TOKEN_KEYWORD 259
+#define TOKEN_OPERATOR 260
+#define TOKEN_STRING 261
+#define TOKEN_DECIMAL 262
+
+
+%%
+TOKEN_KEYWORD instanceof | typeof | break | do | new | var | case | else | \
+ return | void | catch | finally | continue | for | \
+ switch | while | this | with |debugger | function | throw | default | \
+ if | try | delete | in | class | enum | extends | import | const | export | \
+ implements | let | private | public | static | interface | package | protected
+
+TOKEN_IDENTIFIER ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \
+ [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \
+ [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*
+
+TOKEN_OPERATOR === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \
+ >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \
+ [+] | - | [*] | % | [+][+] | -- | / | /=
+
+TOKEN_DECIMAL [1-9][0-9]*
+
+TOKEN_STRING '[^']*'|"[^"]*"
+
+TOKEN_SPACE [\t\r\n ]+
+
+TOKEN_SELF [^\t\r\n+'" ]
+
+
+%%
+
+
+rexdfa_t *dfa = &ccdfa;
+
+
+/*
+ * Read one token from stdin using the generated DFA.
+ *
+ * Wide characters are fed to the automaton until it reaches the dead state
+ * or the input ends. The character that led to the dead state is pushed
+ * back so that the next call starts with it. The token ID is refreshed every
+ * time an accepting state is reached, so the last (longest) match wins.
+ *
+ * buffer - receives the consumed characters; it is 0-terminated whenever
+ *          size > 0. Characters that do not fit are consumed but not stored.
+ * size   - capacity of buffer, in elements.
+ *
+ * Returns the token ID (the matched character itself for TOKEN_SELF), or -1
+ * if no accepting state was reached.
+ *
+ * Note: characters consumed after the last accepting state are not pushed
+ * back; they stay in buffer even though they are not part of the match.
+ */
+int get_token(wint_t *buffer, int size)
+{
+	rexdfss_t *acc_ss = NULL;
+	rexuint_t nstate = REX_DFA_STARTSTATE;
+	int ret = -1, i = 0;
+	wint_t wc;
+
+	while ((wc = fgetwc(stdin)) != WEOF) {
+		if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) {
+			/*
+			 * stdin became wide-oriented with the first fgetwc() call, so the
+			 * push back must use the wide variant. ungetc() would also truncate
+			 * any character that does not fit in a single byte.
+			 */
+			ungetwc(wc, stdin);
+			break;
+		}
+		if (i + 1 < size) {
+			buffer[i++] = wc;
+		}
+		if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
+			/*
+			 * The DFA is in an accepting state, let's find out what exactly is
+			 * being accepted.
+			 * The token ID is recorded in the substate's userdata.
+			 *
+			 * Note: There may be more than one accepting substate,
+			 * but we only check the first one (at offset 0). A real implementation
+			 * might need to check the rest of the accepting substates (and userdata)
+			 * to decide which one to use.
+			 *
+			 * Note: Some of the conflicts might be resolved simply by reordering
+			 * the regular expressions. For example TOKEN_KEYWORD such as
+			 * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because
+			 * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first.
+			 *
+			 * Note: We will not break out of the loop here. We will keep going
+			 * in order to find the longest match.
+			 */
+			acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
+			/* The macro yields a null pointer when the DFA has no accepting substates. */
+			if (acc_ss) {
+				ret = (int) acc_ss->userdata;
+				if (ret == TOKEN_SELF)
+					ret = (int) wc;
+			}
+		}
+	}
+	/* i never exceeds size - 1 when size > 0, so the terminator always fits. */
+	if (size > 0)
+		buffer[i] = L'\0';
+	return ret;
+}
+
+/*
+ * Tokenize stdin and print every token except white space.
+ * The loop ends as soon as get_token() returns a value that is not positive.
+ */
+int main(int argc, char *argv[])
+{
+	wint_t text[4000];
+	const int capacity = sizeof(text)/sizeof(text[0]);
+	int id;
+
+	if (setlocale(LC_CTYPE, "") == NULL) {
+		printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
+		return 1;
+	}
+	for (;;) {
+		id = get_token(text, capacity);
+		if (id <= 0)
+			break;
+		if (id == TOKEN_SPACE)
+			continue;
+		fwprintf(stdout, L"token(%3d): %ls\n", id, text);
+	}
+	return 0;
+}