ruint32 wc = REX_TOKEN_EOF;
int inc = 0;
-again:
- if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
- co->ptr += 2;
- goto again;
- }
- if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') {
- co->ptr += 3;
- goto again;
- }
inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
if (inc <= 0)
return REX_TOKEN_EOF;
static int rex_compiler_getnbtok(rexcompiler_t *co)
{
+again: /*
+ * Advance co->ptr past characters accepted by rex_compiler_isblank() and past
+ * backslash-newline line continuations, then return whatever
+ * rex_compiler_gettok() yields. Control comes back to this label after every
+ * continuation so that blanks at the start of the following line are skipped
+ * as well.
+ */
while (co->ptr < co->end && rex_compiler_isblank(*co->ptr))
co->ptr += 1;
+ if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') { /* backslash followed by LF; requires at least two bytes before co->end */
+ co->ptr += 2;
+ goto again;
+ }
+ if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') { /* backslash followed by CR LF; requires at least three bytes before co->end */
+ co->ptr += 3;
+ goto again;
+ }
+
return rex_compiler_gettok(co);
}
static int rex_compiler_getnstok(rexcompiler_t *co)
{
+again: /*
+ * Advance co->ptr past characters accepted by rex_compiler_isspace() and past
+ * backslash-newline line continuations, then return whatever
+ * rex_compiler_gettok() yields. Control comes back to this label after every
+ * continuation so that the whitespace scan is repeated on the text that
+ * follows it.
+ */
while (co->ptr < co->end && rex_compiler_isspace(*co->ptr))
co->ptr += 1;
+ if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') { /* backslash followed by LF; requires at least two bytes before co->end */
+ co->ptr += 2;
+ goto again;
+ }
+ if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') { /* backslash followed by CR LF; requires at least three bytes before co->end */
+ co->ptr += 3;
+ goto again;
+ }
+
return rex_compiler_gettok(co);
}
--- /dev/null
+#include <stdio.h>
+#include <wchar.h>
+#include <locale.h>
+
+#define TOKEN_SELF 256 /* get_token() returns the matched character itself instead of this id */
+#define TOKEN_IDENTIFIER 257
+#define TOKEN_SPACE 258 /* recognised by the lexer but not printed by main() */
+#define TOKEN_KEYWORD 259
+#define TOKEN_OPERATOR 260
+#define TOKEN_STRING 261
+#define TOKEN_DECIMAL 262
+
+
+%%
+TOKEN_KEYWORD instanceof | typeof | break | do | new | var | case | else | \
+ return | void | catch | finally | continue | for | \
+ switch | while | this | with |debugger | function | throw | default | \
+ if | try | delete | in | class | enum | extends | import | const | export | \
+ implements | let | private | public | static | interface | package | protected
+
+TOKEN_IDENTIFIER ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \
+ [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \
+ [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*
+
+TOKEN_OPERATOR === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \
+ >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \
+ [+] | - | [*] | % | [+][+] | -- | / | /=
+
+TOKEN_DECIMAL [1-9][0-9]*
+
+TOKEN_STRING '[^']*'|"[^"]*"
+
+TOKEN_SPACE [\t\r\n ]+
+
+TOKEN_SELF [^\t\r\n+'" ]
+
+
+%%
+
+
+rexdfa_t *dfa = &ccdfa; /* DFA walked by get_token(); NOTE(review): ccdfa is not defined in the visible source - presumably emitted by the generator from the rules above, confirm */
+
+
+/*
+ * Read one token from stdin by walking the DFA until it reaches the dead
+ * state or the input ends.
+ *
+ * buffer: receives the consumed characters, always L'\0'-terminated. At most
+ *         size - 1 characters are stored; further characters are still
+ *         consumed from stdin but dropped.
+ * size:   capacity of buffer in elements; must be at least 1.
+ *
+ * Returns the token id recorded for the last accepting state that was
+ * visited, the matched character itself when that id is TOKEN_SELF, or -1
+ * when no accepting state was reached (including immediate end of input) or
+ * the arguments are unusable.
+ *
+ * Limitation: every consumed character is appended to buffer, and only the
+ * single character that led to the dead state is pushed back. If the DFA
+ * passes through non-accepting states after its last accepting one, those
+ * extra characters stay in buffer and are not returned to the stream.
+ */
+int get_token(wint_t *buffer, int size)
+{
+	rexdfss_t *acc_ss = NULL;
+	rexuint_t nstate = REX_DFA_STARTSTATE;
+	int ret = -1, i = 0;
+	wint_t wc;
+
+	/* Without room for the terminator the final store would be out of bounds. */
+	if (buffer == NULL || size <= 0)
+		return -1;
+
+	while ((wc = fgetwc(stdin)) != WEOF) {
+		nstate = REX_DFA_NEXT(dfa, nstate, wc);
+		if (nstate == REX_DFA_DEADSTATE) {
+			/*
+			 * stdin is wide-oriented once fgetwc() has been used on it,
+			 * so the look-ahead character must be returned with ungetwc();
+			 * the byte-oriented ungetc() must not be mixed in.
+			 */
+			ungetwc(wc, stdin);
+			break;
+		}
+		if (i + 1 < size) {
+			buffer[i++] = wc;
+		}
+		if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
+			/*
+			 * The DFA is in an accepting state; find out what exactly is
+			 * being accepted.
+			 * The token ID is recorded in the substate's userdata.
+			 *
+			 * Note: There may be more than one accepting substate,
+			 * but we only check the first one (at offset 0). A real
+			 * implementation might need to check the rest of the accepting
+			 * substates (and their userdata) to decide which one to use.
+			 *
+			 * Note: Some of the conflicts might be resolved simply by
+			 * reordering the regular expressions. For example a
+			 * TOKEN_KEYWORD such as 'while', 'if', etc. also matches
+			 * TOKEN_IDENTIFIER, but because TOKEN_KEYWORD appears before
+			 * TOKEN_IDENTIFIER it is placed first.
+			 *
+			 * Note: We do not break out of the loop here. We keep going
+			 * in order to find the longest match.
+			 */
+			acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
+			ret = (int) acc_ss->userdata;
+			if (ret == TOKEN_SELF)
+				ret = (int) wc;
+		}
+	}
+	buffer[i] = L'\0';
+	return ret;
+}
+
+/*
+ * Program entry point: select the character-type locale from the
+ * environment, then print every token read from stdin except TOKEN_SPACE
+ * until get_token() returns a value that is not positive.
+ *
+ * Exits with 1 when the locale cannot be set, 0 otherwise.
+ *
+ * NOTE(review): the text buffer is an array of wint_t but is printed with
+ * %ls, which formally expects a wchar_t pointer - confirm the two types are
+ * interchangeable on every target platform.
+ */
+int main(int argc, char *argv[])
+{
+	wint_t text[4000];
+	int id;
+
+	if (setlocale(LC_CTYPE, "") == NULL) {
+		printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
+		return 1;
+	}
+	for (;;) {
+		id = get_token(text, sizeof(text)/sizeof(text[0]));
+		if (id <= 0)
+			break;
+		/* Whitespace runs are tokenised but deliberately not reported. */
+		if (id == TOKEN_SPACE)
+			continue;
+		fwprintf(stdout, L"token(%3d): %ls\n", id, text);
+	}
+	return 0;
+}