RPA Toolkit
d67ceac21c7b6c6341b992685bdf5993fe83e11c
[rpatk.git] / rex / doc / example / js-tokenizer.c
1 /*
2  *  Regular Pattern Analyzer Toolkit (RPA/Tk)
3  *  Copyright (c) 2009-2012 Martin Stoilov
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  *  Martin Stoilov <martin@rpasearch.com>
19  */
20
21 /*
22  * To build:
23  * # gcc -I/usr/include/rpatk -o js-tokenizer js-tokenizer.c -lrex -lrlib
24  *
25  * To run:
26  * # echo "function add(a,b) { var c = a + b; return c; } print('здравей means hello');" | ./js-tokenizer
27  */
28
29 #include <stdio.h>
30 #include <wchar.h>
31 #include <locale.h>
32 #include "rex/rexdb.h"
33 #include "rex/rexdfa.h"
34
35 #define TOKEN_SELF 256
36 #define TOKEN_IDENTIFIER 257
37 #define TOKEN_SPACE 258
38 #define TOKEN_KEYWORD 259
39 #define TOKEN_OPERATOR 260
40 #define TOKEN_STRING 261
41 #define TOKEN_DECIMAL 262
42
43 int get_token(rexdfa_t *dfa, wint_t *buffer, int size)
44 {
45         rexdfss_t *acc_ss = NULL;
46         rexuint_t nstate = REX_DFA_STARTSTATE;
47         int ret = -1, i = 0;
48         wint_t wc;
49         
50         while ((wc = fgetwc(stdin)) != WEOF) {
51                 if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) == REX_DFA_DEADSTATE) {
52                         ungetc(wc, stdin);
53                         break;
54                 }
55                 if (i + 1 < size) {
56                         buffer[i++] = wc;
57                 }
58                 if (REX_DFA_STATE(dfa, nstate)->type == REX_STATETYPE_ACCEPT) {
59                         /*
60                          * The DFA is in accepting state, lets find out what exactly is
61                          * being accepted.
62                          * The token ID is recorder in the substate's userdata
63                          *
64                          * Note: There are may be more than one accepting substate,
65                          * but we only check the first one (at offset 0). A real implementation
66                          * might need to check the rest of the accepting substates(and userdata)
67                          * to decide which one to use.
68                          *
69                          * Note: Some of the conflicts might be resolved simply be reordering
70                          * the regular expressions. For example TOKEN_KEYWORD such as 
71                          * 'while', 'if', etc. also match TOKEN_IDENTIFIER, but because
72                          * TOKEN_KEYWORD appears before TOKEN_IDENTIFIER it is placed first.
73                          *
74                          * Note: We will not break out of the loop here. We will keep going
75                          * in order to find the longest match.
76                          */
77                         acc_ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
78                         ret = (int) acc_ss->userdata;
79                         if (ret == TOKEN_SELF)
80                                 ret = wc;
81                 }
82         }
83         buffer[i++] = '\0';
84         return ret;
85 }
86
87 int main(int argc, char *argv[])
88 {
89         rexdb_t *nfadb;
90         rexdb_t *dfadb;
91         rexdfa_t *dfa;
92         long startstate = 0;
93         wint_t buffer[4000];
94         int token;
95         
96         if (!setlocale(LC_CTYPE, "")) {
97                 printf("Can not set the specified locale, please check LANG, LC_CTYPE, LC_ALL.\n");
98                 return 1;
99     }
100
101         /*
102          * Create empty automaton of type NFA.
103          */
104         nfadb = rex_db_create(REXDB_TYPE_NFA);
105
106         /*
107          * Load the automaton with regular expressions, defining JavaScript language tokens.
108          */
109         startstate = rex_db_addexpression_s(nfadb, startstate,
110                                                         "instanceof | typeof | break | do | new | var | case | else | "
111                                                         "return | void | catch | finally | continue | for | "
112                                                         "switch | while | this | with |debugger | function | throw | default | "
113                                                         "if | try | delete | in | class | enum | extends | import | const | export | "
114                                                         "implements | let | private | public | static | interface | package | protected",
115                                                         TOKEN_KEYWORD);
116         startstate = rex_db_addexpression_s(nfadb, startstate,
117                                                         "([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | [#0x0100-#0x0232] | [#0x0061-#0x007A] | "
118                                                         "[#0x00C0-#0x00DE] | $ | _ )([#0x0041-#0x005A] | [#0x00C0-#0x00DE] | "
119                                                         "[#0x0100-#0x0232] | [#0x0061-#0x007A] | [#0x00C0-#0x00DE] | $ | _ | [0-9] | [#0x0660-#0x0669])*",
120                                                         TOKEN_IDENTIFIER);
121         startstate = rex_db_addexpression_s(nfadb, startstate,
122                                                         "=== | !== | >= | <= | == | != | << | >>> | >> | & | ^= | ^ | ! | ~ | && | [|][|] | [?] | : | "
123                                                         ">>= | >>>= | &= | [|]= | = | [+]= | -= | [*]= | %= | <<= | [.] | ; | , | < | > | [|] | "
124                                                         "[+] | - | [*] | % | [+][+] | -- | / | /=",
125                                                         TOKEN_OPERATOR);
126         startstate = rex_db_addexpression_s(nfadb, startstate, "[1-9][0-9]*", TOKEN_DECIMAL);
127         startstate = rex_db_addexpression_s(nfadb, startstate, "'[^']*'|\"[^\"]*\"", TOKEN_STRING);
128         startstate = rex_db_addexpression_s(nfadb, startstate, "[\\t\\r\\n ]+", TOKEN_SPACE);
129         startstate = rex_db_addexpression_s(nfadb, startstate, "[^\\t\\r\\n'\" ]", TOKEN_SELF);
130
131         /*
132          * Construct the DFA from the NFA
133          */
134         dfadb = rex_db_createdfa(nfadb, startstate);
135
136         /*
137          * At this point you can start using the dfadb for matching, but it is better
138          * to generate the more compact representation rexdfa_t.
139          */
140         dfa = rex_db_todfa(dfadb, 0);
141
142         /*
143          * Destroy the rexdb_t objects. We don't need them any more.
144          */
145         rex_db_destroy(nfadb);
146         rex_db_destroy(dfadb);
147
148         /*
149          * Read the tokens and print them.
150          */
151         while ((token = get_token(dfa, buffer, sizeof(buffer)/sizeof(buffer[0]))) > 0) {
152                 if (token != TOKEN_SPACE)
153                         fwprintf(stdout, L"token(%3d): %ls\n", token, buffer);
154         }
155         rex_dfa_destroy(dfa);
156         return 0;
157 }