RPA Toolkit
added rexcc test/example
author Martin Stoilov <martin@rpasearch.com>
Fri, 17 Feb 2012 04:50:42 +0000 (20:50 -0800)
committer Martin Stoilov <martin@rpasearch.com>
Fri, 17 Feb 2012 04:50:42 +0000 (20:50 -0800)
rex/rexcompiler.c
rexcc/rexcc.c
tests/testrexcc/tokenjs.rex [new file with mode: 0644]

index 98b89de..0eb7fb6 100644 (file)
@@ -77,15 +77,6 @@ static int rex_compiler_getchar(rexcompiler_t *co)
        ruint32 wc = REX_TOKEN_EOF;
        int inc = 0;
 
-again:
-       if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
-               co->ptr += 2;
-               goto again;
-       }
-       if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') {
-               co->ptr += 3;
-               goto again;
-       }
        inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
        if (inc <= 0)
                return REX_TOKEN_EOF;
@@ -159,8 +150,18 @@ static void rex_compiler_adjustescapedtoken(rexcompiler_t *co)
 
 static int rex_compiler_getnbtok(rexcompiler_t *co)
 {
+again: /* restart point: after a line continuation, skip rex_compiler_isblank() characters again */
        while (co->ptr < co->end && rex_compiler_isblank(*co->ptr))
                co->ptr += 1;
+       if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') { /* backslash + LF: step over both and rescan */
+               co->ptr += 2;
+               goto again;
+       }
+       if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') { /* backslash + CR LF: step over all three and rescan */
+               co->ptr += 3;
+               goto again;
+       }
+
        return rex_compiler_gettok(co);
 
 }
@@ -168,8 +169,18 @@ static int rex_compiler_getnbtok(rexcompiler_t *co)
 
 static int rex_compiler_getnstok(rexcompiler_t *co)
 {
+again: /* restart point: after a line continuation, skip rex_compiler_isspace() characters again */
        while (co->ptr < co->end && rex_compiler_isspace(*co->ptr))
                co->ptr += 1;
+       if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') { /* backslash + LF: step over both and rescan */
+               co->ptr += 2;
+               goto again;
+       }
+       if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') { /* backslash + CR LF: step over all three and rescan */
+               co->ptr += 3;
+               goto again;
+       }
+
        return rex_compiler_gettok(co);
 }
 
index 4dcd324..e002637 100644 (file)
@@ -413,7 +413,7 @@ int rex_cc_parse(rexcc_t *pCC)
                return -1;
        rex_cc_gettoken(pCC);
        while (pCC->token) {
-               if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE) {
+               if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE || pCC->token == REXCC_TOKEN_REGEX) {
                        rex_cc_gettoken(pCC);
                } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) {
                        rex_cc_parseline(pCC);
diff --git a/tests/testrexcc/tokenjs.rex b/tests/testrexcc/tokenjs.rex
new file mode 100644 (file)
index 0000000..6466028
--- /dev/null
@@ -0,0 +1,102 @@
+#include <stdio.h>
+#include <wchar.h>
+#include <locale.h>
+
+#define TOKEN_SELF 256
+#define TOKEN_IDENTIFIER 257
+#define TOKEN_SPACE 258
+#define TOKEN_KEYWORD 259
+#define TOKEN_OPERATOR 260
+#define TOKEN_STRING 261
+#define TOKEN_DECIMAL 262
+
+
+%%
+TOKEN_KEYWORD          instanceof | typeof | break | do | new | var | case | else | \
+                                       return | void | catch | finally | continue | for | \
+                                       switch | while | this | with |debugger | function | throw | default | \
+                                       if | try | delete | in | class | enum | extends | import | const | export | \
+                                       implements | let | private | public | static | interface | package | protected
+
+TOKEN_IDENTIFIER       ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \
+                                       [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \
+                                       [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*
+
+TOKEN_OPERATOR         === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \
+                                       >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \
+                                       [+] | - | [*] | % | [+][+] | -- | / | /=
+
+TOKEN_DECIMAL          [1-9][0-9]*
+
+TOKEN_STRING           '[^']*'|"[^"]*"
+
+TOKEN_SPACE                    [\t\r\n ]+
+
+TOKEN_SELF                     [^\t\r\n+'" ]
+
+
+%%
+
+
+rexdfa_t *dfa = &ccdfa;
+
+
+/*
+ * Read the next token from stdin by feeding wide characters to the DFA
+ * until it reaches the dead state or the input ends.
+ *
+ * buffer - receives the consumed characters and is NUL-terminated when
+ *          size > 0. Characters that do not fit are dropped, not stored,
+ *          but scanning continues.
+ * size   - capacity of buffer, in elements.
+ *
+ * Returns the token ID of the last accepting state seen, the last read
+ * character itself when that ID is TOKEN_SELF, or -1 if no accepting
+ * state was reached (including immediate end of input).
+ */
+int get_token(wint_t *buffer, int size)
+{
+       rexdfss_t *acc_ss = NULL;
+       rexuint_t nstate = REX_DFA_STARTSTATE;
+       int ret = -1, i = 0;
+       wint_t wc;
+
+       while ((wc = fgetwc(stdin)) != WEOF) {
+               if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) {
+                       /*
+                        * fgetwc() made stdin wide-oriented, so the rejected
+                        * character must be pushed back with ungetwc(). ungetc()
+                        * is a byte function: it must not be applied to a
+                        * wide-oriented stream and would truncate wc to
+                        * unsigned char.
+                        */
+                       ungetwc(wc, stdin);
+                       break;
+               }
+               if (i + 1 < size) {
+                       /* Always keep one element free for the terminator. */
+                       buffer[i++] = wc;
+               }
+               if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
+                       /*
+                        * The DFA is in accepting state, lets find out what exactly is
+                        * being accepted.
+                        * The token ID is recorded in the substate's userdata
+                        *
+                        * Note: There may be more than one accepting substate,
+                        * but we only check the first one (at offset 0). A real implementation
+                        * might need to check the rest of the accepting substates(and userdata)
+                        * to decide which one to use.
+                        *
+                        * Note: Some of the conflicts might be resolved simply by reordering
+                        * the regular expressions. For example TOKEN_KEYWORD such as
+                        * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because
+                        * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first.
+                        *
+                        * Note: We will not break out of the loop here. We will keep going
+                        * in order to find the longest match.
+                        *
+                        * NOTE(review): characters consumed after the last accepting
+                        * state stay in buffer and only the single rejected character
+                        * is pushed back - confirm this is acceptable for the example.
+                        */
+                       acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
+                       ret = (int) acc_ss->userdata;
+                       if (ret == TOKEN_SELF)
+                               ret = wc;
+               }
+       }
+       /* i never exceeds size - 1 when size > 0; do not touch an empty buffer. */
+       if (size > 0)
+               buffer[i] = L'\0';
+       return ret;
+}
+
+/*
+ * Tokenize stdin and print every token other than TOKEN_SPACE in the
+ * form "token(ID): text".
+ * Returns 1 if the locale taken from the environment cannot be set,
+ * 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+       wint_t text[4000];
+       const int capacity = sizeof(text) / sizeof(text[0]);
+       int id;
+
+       if (setlocale(LC_CTYPE, "") == NULL) {
+               printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
+               return 1;
+       }
+       for (;;) {
+               id = get_token(text, capacity);
+               /* get_token() yields -1 when nothing was accepted; stop then. */
+               if (id <= 0)
+                       break;
+               if (id == TOKEN_SPACE)
+                       continue;
+               fwprintf(stdout, L"token(%3d): %ls\n", id, text);
+       }
+       return 0;
+}