/* tokenjs.rexcc */
/*
 * To build:
 * # rexcc tokenjs.rexcc -o tokenjs.c
 * # gcc -I/usr/include/rpatk -o tokenjs tokenjs.c
 *
 * To run:
 * # echo "function add(a,b) { var c = a + b; return c; }" | ./tokenjs
 */

#include <stdio.h>
#include <wchar.h>
#include <locale.h>
#include "rex/rexdfa.h"

/*
 * Token ids. Each name below is also the name of one rule in the rule
 * section that follows; get_token() reads the id back from the userdata
 * of the accepting substate and returns it to the caller.
 *
 * TOKEN_SELF is special: get_token() never returns this id, it returns
 * the matched character itself instead.
 */
#define TOKEN_SELF 256
#define TOKEN_IDENTIFIER 257
#define TOKEN_SPACE 258
#define TOKEN_KEYWORD 259
#define TOKEN_OPERATOR 260
#define TOKEN_STRING 261
#define TOKEN_DECIMAL 262


%%
TOKEN_KEYWORD           instanceof | typeof | break | do | new | var | case | else | \
                                        return | void | catch | finally | continue | for | \
                                        switch | while | this | with |debugger | function | throw | default | \
                                        if | try | delete | in | class | enum | extends | import | const | export | \
                                        implements | let | private | public | static | interface | package | protected

TOKEN_IDENTIFIER        ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \
                                        [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \
                                        [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*

TOKEN_OPERATOR          === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \
                                        >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \
                                        [+] | - | [*] | % | [+][+] | -- | / | /=

TOKEN_DECIMAL           [1-9][0-9]*

TOKEN_STRING            '[^']*'|"[^"]*"

TOKEN_SPACE                     [\t\r\n ]+

TOKEN_SELF                      [^\t\r\n+'" ]


%%


/*
 * DFA walked by get_token(). It points at ccdfa, which is not defined in
 * the visible text of this file (presumably emitted by rexcc from the
 * rules above -- confirm against the generated tokenjs.c).
 */
rexdfa_t *dfa = &ccdfa;


/*
 * Read the next token from stdin using the longest-match rule.
 *
 * Wide characters are fed to the DFA one at a time until the DFA enters
 * the dead state or the input ends. The character that drove the DFA into
 * the dead state is pushed back so that the next call starts with it.
 *
 * buffer - receives the consumed characters and is always '\0'-terminated
 *          when size > 0. Characters that do not fit are still consumed
 *          from stdin but are not stored. May be NULL, in which case
 *          nothing is stored.
 * size   - capacity of buffer in elements, including the terminator.
 *
 * Returns the userdata (a TOKEN_* id) of the first accepting substate of
 * the last accepting state visited, the matched character itself when that
 * id is TOKEN_SELF, or -1 if no accepting state was reached.
 */
int get_token(wint_t *buffer, int size)
{
        rexdfss_t *acc_ss = NULL;
        rexuint_t nstate = REX_DFA_STARTSTATE;
        int ret = -1, i = 0;
        wint_t wc;

        /* Treat a missing buffer as one with no capacity. */
        if (buffer == NULL)
                size = 0;

        while ((wc = fgetwc(stdin)) != WEOF) {
                REX_DFA_NEXT(dfa, nstate, wc, &nstate);
                if (nstate == REX_DFA_DEADSTATE) {
                        /* wc belongs to the next token; hand it back. */
                        ungetwc(wc, stdin);
                        break;
                }
                /* Always keep one slot free for the terminator. */
                if (i + 1 < size) {
                        buffer[i++] = wc;
                }
                if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
                        /*
                         * The DFA is in an accepting state, let's find out what
                         * exactly is being accepted.
                         * The token ID is recorded in the substate's userdata.
                         *
                         * Note: There may be more than one accepting substate,
                         * but we only check the first one (at offset 0). A real
                         * implementation might need to check the rest of the
                         * accepting substates (and userdata) to decide which
                         * one to use.
                         *
                         * Note: Some of the conflicts might be resolved simply
                         * by reordering the regular expressions. For example
                         * TOKEN_KEYWORD such as 'while', 'if', etc. also match
                         * TOKEN_IDENTIFIER, but because TOKEN_KEYWORD appears
                         * before TOKEN_IDENTIFIER it is placed first.
                         *
                         * Note: We will not break out of the loop here. We will
                         * keep going in order to find the longest match.
                         */
                        acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
                        ret = (int) acc_ss->userdata;
                        if (ret == TOKEN_SELF)
                                ret = wc;
                }
        }
        /* Only terminate when there is room; size <= 0 means no storage at all. */
        if (size > 0)
                buffer[i] = '\0';
        return ret;
}

/*
 * Tokenize stdin and print every token except whitespace to stdout, one per
 * line, as "token(<id>): <text>".
 *
 * Returns 0 when get_token() stops yielding positive token ids, or 1 if no
 * usable locale could be selected.
 */
int main(int argc, char *argv[])
{
        wint_t buffer[4000];
        int token;

        (void) argc;
        (void) argv;

        /*
         * Prefer the UTF-8 locale this program was written for. If it is not
         * installed, fall back to whatever the environment (LANG, LC_CTYPE,
         * LC_ALL) selects instead of refusing to run.
         */
        if (!setlocale(LC_ALL, "en_US.UTF8") && !setlocale(LC_ALL, "")) {
                /* Diagnostics go to stderr so they do not mix with token output. */
                fprintf(stderr, "Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
                return 1;
        }
        /* get_token() returns -1 at end of input or when nothing matches. */
        while ((token = get_token(buffer, sizeof(buffer)/sizeof(buffer[0]))) > 0) {
                if (token != TOKEN_SPACE)
                        fwprintf(stdout, L"token(%3d): %ls\n", token, buffer);
        }
        return 0;
}