+++ /dev/null
-/*
- * Regular Pattern Analyzer (RPA)
- * Copyright (c) 2009-2010 Martin Stoilov
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * Martin Stoilov <martin@rpasearch.com>
- */
-
-/*
- * To build:
- * # rexcc tokenjs.rex -o tokenjs.c
- * # gcc -I/usr/include/rpatk/rex -o tokenjs tokenjs.c
- *
- * To run:
- * # echo "function add(a,b) { var c = a + b; return c; }" | ./tokenjs
- */
-
-#include <stdio.h>
-#include <wchar.h>
-#include <locale.h>
-
-#define TOKEN_SELF 256
-#define TOKEN_IDENTIFIER 257
-#define TOKEN_SPACE 258
-#define TOKEN_KEYWORD 259
-#define TOKEN_OPERATOR 260
-#define TOKEN_STRING 261
-#define TOKEN_DECIMAL 262
-
-
-%%
-TOKEN_KEYWORD instanceof | typeof | break | do | new | var | case | else | \
- return | void | catch | finally | continue | for | \
- switch | while | this | with |debugger | function | throw | default | \
- if | try | delete | in | class | enum | extends | import | const | export | \
- implements | let | private | public | static | interface | package | protected
-
-TOKEN_IDENTIFIER ([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | \
- [#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | \
- [#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*
-
-TOKEN_OPERATOR === | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | \
- >>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | \
- [+] | - | [*] | % | [+][+] | -- | / | /=
-
-TOKEN_DECIMAL [1-9][0-9]*
-
-TOKEN_STRING '[^']*'|"[^"]*"
-
-TOKEN_SPACE [\t\r\n ]+
-
-TOKEN_SELF [^\t\r\n+'" ]
-
-
-%%
-
-
-rexdfa_t *dfa = &ccdfa;
-
-
-int get_token(wint_t *buffer, int size)
-{
- rexdfss_t *acc_ss = NULL;
- rexuint_t nstate = REX_DFA_STARTSTATE;
- int ret = -1, i = 0;
- wint_t wc;
-
- while ((wc = fgetwc(stdin)) != WEOF) {
- if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) {
- ungetc(wc, stdin);
- break;
- }
- if (i + 1 < size) {
- buffer[i++] = wc;
- }
- if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
- /*
- * The DFA is in accepting state, lets find out what exactly is
- * being accepted.
- * The token ID is recorder in the substate's userdata
- *
- * Note: There are may be more than one accepting substate,
- * but we only check the first one (at offset 0). A real implementation
- * might need to check the rest of the accepting substates(and userdata)
- * to decide which one to use.
- *
- * Note: Some of the conflicts might be resolved simply be reordering
- * the regular expressions. For example TOKEN_KEYWORD such as
- * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because
- * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first.
- *
- * Note: We will not break out of the loop here. We will keep going
- * in order to find the longest match.
- */
- acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
- ret = (int) acc_ss->userdata;
- if (ret == TOKEN_SELF)
- ret = wc;
- }
- }
- buffer[i++] = '\0';
- return ret;
-}
-
-int main(int argc, char *argv[])
-{
- wint_t buffer[4000];
- int token;
-
- if (!setlocale(LC_CTYPE, "")) {
- printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
- return 1;
- }
- while ((token = get_token(buffer, sizeof(buffer)/sizeof(buffer[0]))) > 0) {
- if (token != TOKEN_SPACE)
- fwprintf(stdout, L"token(%3d): %ls\n", token, buffer);
- }
- return 0;
-}