RPA Toolkit
Fixed REX_DFA_NEXT macro.
[rpatk.git] / rexcc / rexcc.c
1 /*
2  *  Regular Pattern Analyzer Toolkit (RPA/Tk)
3  *  Copyright (c) 2009-2012 Martin Stoilov
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  *  Martin Stoilov <martin@rpasearch.com>
19  */
20
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdarg.h>
26 #include <time.h>
27
28 #include "rlib/rutf.h"
29 #include "rlib/rmem.h"
30 #include "rexcc.h"
31
32
33 static rexdfa_t *tokens_dfa = NULL;
34 struct tokeninfo_s {
35         int id;
36         const char *name;
37         const char *regex;
38 };
39
40
41 struct parseinfo_s {
42         rbuffer_t id;
43         rbuffer_t regex;
44         int line;
45 };
46
47
48 #define REXCC_TOKEN_NONE                        0
49 #define REXCC_TOKEN_DELIMITER           1
50 #define REXCC_TOKEN_IDENTIFIER          2
51 #define REXCC_TOKEN_SPACE                       3
52 #define REXCC_TOKEN_CR                          4
53 #define REXCC_TOKEN_REGEX                       5
54
55
56 static struct tokeninfo_s tokens[] = {
57                 {REXCC_TOKEN_DELIMITER,         "delimiter",    "%%[ \\t]?[\\r]?[\\n]"},
58                 {REXCC_TOKEN_IDENTIFIER,        "identifier",   "([^\\t\\r\\n\'\" ]+|\"([^\"\\n]|\\\\\")*\")+|'.+'"},
59                 {REXCC_TOKEN_SPACE,                     "space",                "[ \\t]+"},
60                 {REXCC_TOKEN_CR,                        "cr ",                  "[\\r]?[\\n]"},
61                 {REXCC_TOKEN_REGEX,                     "regex",                "[ \\t](\\\\[\\r]?[\\n]|.)+\\r?\\n"},
62                 {0, NULL, NULL},
63 };
64
65
66
67 rexdfa_t * rex_cc_tokensdfa()
68 {
69         if (!tokens_dfa) {
70                 long start = -1;
71                 struct tokeninfo_s *ti = tokens;
72                 rexdb_t *nfadb, *dfadb;
73                 nfadb = rex_db_create(REXDB_TYPE_NFA);
74                 while (ti->regex) {
75                         start = rex_db_addexpression_s(nfadb, start, ti->regex, (rexuserdata_t)ti);
76                         if (start < 0) {
77                                 rex_db_destroy(nfadb);
78                                 return NULL;
79                         }
80                         ++ti;
81                 }
82                 dfadb = rex_db_createdfa(nfadb, start);
83                 if (dfadb) {
84                         tokens_dfa = rex_db_todfa(dfadb, 0);
85                 }
86                 rex_db_destroy(dfadb);
87                 rex_db_destroy(nfadb);
88         }
89         return tokens_dfa;
90 }
91
92
93 rexcc_t *rex_cc_create()
94 {
95         rexcc_t *pCC;
96         
97         pCC = (rexcc_t *)r_malloc(sizeof(*pCC));
98         if (!pCC)
99                 return (void *)0;
100         r_memset(pCC, 0, sizeof(*pCC));
101         pCC->parseinfo = r_array_create(sizeof(struct parseinfo_s));
102         pCC->nfa = rex_db_create(REXDB_TYPE_NFA);
103         pCC->startuid = -1L;
104         return pCC;
105 }
106
107
108 void rex_cc_destroy(rexcc_t *pCC)
109 {
110         if (!pCC)
111                 return;
112         r_array_destroy(pCC->parseinfo);
113         rex_db_destroy(pCC->nfa);
114         rex_dfa_destroy(pCC->dfa);
115         r_free(pCC->temp);
116         r_free(pCC);
117         if (tokens_dfa) {
118                 rex_dfa_destroy(tokens_dfa);
119                 tokens_dfa = NULL;
120         }
121 }
122
123
124 int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata)
125 {
126         pCC->startuid = rex_db_addexpression(pCC->nfa, pCC->startuid, buf->s, buf->size, userdata);
127         if (pCC->startuid < 0) {
128                 return -1;
129         }
130         return 0;
131 }
132
133
134 int rex_cc_vfprintf(FILE *out, int indent, const char *format, va_list ap)
135 {
136         while (indent > 0) {
137                 fprintf(out, "\t");
138                 --indent;
139         }
140         return vfprintf(out, format, ap);
141 }
142
143
144 int rex_cc_fprintf(FILE *out, int indent, const char *format, ...)
145 {
146         va_list ap;
147         int ret;
148
149         va_start(ap, format);
150         ret = rex_cc_vfprintf(out, indent, format, ap);
151         va_end(ap);
152         return ret;
153 }
154
155
156 static void rex_cc_output_gpl(FILE *out)
157 {
158         static char *gpl =
159                         "/*\n"
160                         " *  Regular Pattern Analyzer Toolkit(RPA/Tk)\n"
161                         " *  Copyright (c) 2009-2012 Martin Stoilov\n"
162                         " *\n"
163                         " *  This program is free software: you can redistribute it and/or modify\n"
164                         " *  it under the terms of the GNU General Public License as published by\n"
165                         " *  the Free Software Foundation, either version 3 of the License, or\n"
166                         " *  (at your option) any later version.\n"
167                         " *\n"
168                         " *  This program is distributed in the hope that it will be useful,\n"
169                         " *  but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
170                         " *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
171                         " *  GNU General Public License for more details.\n"
172                         " *\n"
173                         " *  You should have received a copy of the GNU General Public License\n"
174                         " *  along with this program.  If not, see <http://www.gnu.org/licenses/>.\n"
175                         " *\n"
176                         " *  Martin Stoilov <martin@rpasearch.com>\n"
177                         " */\n";
178         rex_cc_fprintf(out, 0, "%s\n", gpl);
179 }
180
181
182 static int rex_cc_output_statesubstates(rexcc_t *pCC, FILE *out, long nstate)
183 {
184         long i;
185         rexdfa_t *dfa = pCC->dfa;
186         rexdfs_t *s = rex_dfa_state(dfa, nstate);
187         rexdfss_t *ss;
188         struct parseinfo_s *pi;
189
190         for (i = 0; i < s->nsubstates; i++) {
191                 ss = rex_dfa_substate(dfa, nstate, i);
192                 if (ss->type == REX_STATETYPE_ACCEPT) {
193                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
194                         pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
195                         r_memset(pCC->temp, 0, pi->id.size + 1);
196                         r_memcpy(pCC->temp, pi->id.s, pi->id.size);
197                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
198                 } else {
199                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)ss->type, (unsigned long)ss->uid, (unsigned long)0);
200                 }
201         }
202         return 0;
203 }
204
205
206 static int rex_cc_output_substates(rexcc_t *pCC, FILE *out)
207 {
208         long i;
209         rexdfa_t *dfa = pCC->dfa;
210
211         rex_cc_fprintf(out, 0, "static rexdfss_t substates[] = {\n");
212         for (i = 0; i < dfa->nstates; i++)
213                 rex_cc_output_statesubstates(pCC, out, i);
214         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
215         rex_cc_fprintf(out, 0, "};\n");
216         return 0;
217 }
218
219
220 static int rex_cc_output_stateaccsubstates(rexcc_t *pCC, FILE *out, long nstate)
221 {
222         long i;
223         rexdfa_t *dfa = pCC->dfa;
224         rexdfs_t *s = rex_dfa_state(dfa, nstate);
225         rexdfss_t *ss;
226         struct parseinfo_s *pi;
227
228         for (i = 0; i < s->naccsubstates; i++) {
229                 ss = rex_dfa_accsubstate(dfa, nstate, i);
230                 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
231                 pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
232                 r_memset(pCC->temp, 0, pi->id.size + 1);
233                 r_memcpy(pCC->temp, pi->id.s, pi->id.size);
234                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
235         }
236         return 0;
237 }
238
239
240 static int rex_cc_output_accsubstates(rexcc_t *pCC, FILE *out)
241 {
242         long i;
243         rexdfa_t *dfa = pCC->dfa;
244
245         rex_cc_fprintf(out, 0, "static rexdfss_t accsubstates[] = {\n");
246         for (i = 0; i < dfa->nstates; i++)
247                 rex_cc_output_stateaccsubstates(pCC, out, i);
248         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
249         rex_cc_fprintf(out, 0, "};\n");
250         return 0;
251 }
252
253
254 static int rex_cc_output_statetransitions(rexcc_t *pCC, FILE *out, long nstate)
255 {
256         long i;
257         rexdfa_t *dfa = pCC->dfa;
258         rexdfs_t *s = rex_dfa_state(dfa, nstate);
259         rexdft_t *t;
260         for (i = 0; i < s->ntrans; i++) {
261                 t = rex_dfa_transition(dfa, nstate, i);
262                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)t->lowin, (unsigned long)t->highin, (unsigned long)t->state);
263         }
264         return 0;
265 }
266
267
268 static int rex_cc_output_transitions(rexcc_t *pCC, FILE *out)
269 {
270         long i;
271         rexdfa_t *dfa = pCC->dfa;
272
273         rex_cc_fprintf(out, 0, "static rexdft_t transitions[] = {\n");
274         for (i = 0; i < dfa->nstates; i++)
275                 rex_cc_output_statetransitions(pCC, out, i);
276         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
277         rex_cc_fprintf(out, 0, "};\n");
278         return 0;
279 }
280
281
282 static int rex_cc_output_states(rexcc_t *pCC, FILE *out)
283 {
284         long i;
285         rexdfs_t *s;
286         rexdfa_t *dfa = pCC->dfa;
287
288         rex_cc_fprintf(out, 0, "static rexdfs_t states[] = {\n");
289         for (i = 0; i < dfa->nstates; i++) {
290                 s = rex_dfa_state(dfa, i);
291                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n",
292                                 (unsigned long)s->type, (unsigned long)s->trans, (unsigned long)s->ntrans, (unsigned long)s->accsubstates,
293                                 (unsigned long)s->naccsubstates, (unsigned long)s->substates, (unsigned long)s->nsubstates);
294
295         }
296         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
297         rex_cc_fprintf(out, 0, "};\n");
298         return 0;
299 }
300
301
302 static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
303 {
304         rexdfa_t *dfa = pCC->dfa;
305
306         rex_cc_fprintf(out, 0, "static rexdfa_t ccdfa = {\n");
307         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nstates);
308         rex_cc_fprintf(out, 1, "%s,\n", "states");
309         rex_cc_fprintf(out, 1, "%lu,\n", dfa->ntrans);
310         rex_cc_fprintf(out, 1, "%s,\n", "transitions");
311         rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
312         rex_cc_fprintf(out, 1, "%s,\n", "accsubstates");
313         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nsubstates);
314         rex_cc_fprintf(out, 1, "%s,\n", "substates");
315         rex_cc_fprintf(out, 1, "{0, },\n");
316         rex_cc_fprintf(out, 0, "};\n");
317         return 0;
318 }
319
320
321
322 int rex_cc_output(rexcc_t *pCC, FILE *outc)
323 {
324         if (outc) {
325                 rex_cc_output_gpl(outc);
326                 rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n");
327                 if (pCC->prolog.size) {
328                         fwrite(pCC->prolog.s, 1, pCC->prolog.size, outc);
329                         rex_cc_fprintf(outc, 0, "\n");
330                 }
331                 rex_cc_output_accsubstates(pCC, outc);
332                 rex_cc_fprintf(outc, 0, "\n\n");
333                 rex_cc_output_substates(pCC, outc);
334                 rex_cc_fprintf(outc, 0, "\n\n");
335                 rex_cc_output_transitions(pCC, outc);
336                 rex_cc_fprintf(outc, 0, "\n\n");
337                 rex_cc_output_states(pCC, outc);
338                 rex_cc_fprintf(outc, 0, "\n\n");
339                 rex_cc_output_dfa(pCC, outc);
340                 if (pCC->epilog.size) {
341                         rex_cc_fprintf(outc, 0, "\n");
342                         fwrite(pCC->epilog.s, 1, pCC->epilog.size, outc);
343                 }
344         }
345         return 0;
346 }
347
348
349 static int rex_cc_getlineno(rexcc_t *pCC, const char *input)
350 {
351         int ret = 1;
352
353         while (--input >= pCC->start) {
354                 if (*input == '\n')
355                         ret += 1;
356         }
357         return ret;
358 }
359
360
361 int rex_cc_gettoken(rexcc_t *pCC)
362 {
363         ruint32 wc = 0;
364         int inc, ret = 0;
365         long nstate = REX_DFA_STARTSTATE;
366         const char *input = pCC->input;
367         rexdfa_t *dfa = rex_cc_tokensdfa();
368         rexdfs_t *s = NULL;
369         rexdfss_t *ss = NULL;
370
371         pCC->token = 0;
372         pCC->tokenlen = 0;
373         if (!dfa) {
374                 /*
375                  * Error
376                  */
377                 return -1;
378         }
379         while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)pCC->end)) > 0) {
380                 REX_DFA_NEXT(dfa, nstate, wc, &nstate);
381                 if (nstate == 0)
382                         break;
383                 input += inc;
384                 s = REX_DFA_STATE(dfa, nstate);
385                 ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
386                 if (s->type == REX_STATETYPE_ACCEPT) {
387                         pCC->token = ((struct tokeninfo_s *)ss->userdata)->id;
388                         ret = input - pCC->input;
389                         if (ss && ((struct tokeninfo_s *)ss->userdata)->id == 1) {
390                                 break;
391                         }
392                 }
393         }
394         if (ret) {
395                 pCC->tokenptr = pCC->input;
396                 pCC->tokenlen = ret;
397                 pCC->input += ret;
398         }
399         return ret;
400 }
401
402
403 static int rex_cc_parseid(rexcc_t *pCC, struct parseinfo_s *info)
404 {
405         info->id.s = pCC->tokenptr;
406         info->id.size = pCC->tokenlen;
407         rex_cc_gettoken(pCC);
408         return 0;
409 }
410
411
412 static int rex_cc_parseregex(rexcc_t *pCC, struct parseinfo_s *info)
413 {
414         info->regex.s = pCC->tokenptr;
415         info->regex.size = pCC->tokenlen;
416         rex_cc_gettoken(pCC);
417         return 0;
418 }
419
420
421 static int rex_cc_parseline(rexcc_t *pCC)
422 {
423         struct parseinfo_s info;
424
425         r_memset(&info, 0, sizeof(info));
426         if (rex_cc_parseid(pCC, &info) < 0)
427                 return -1;
428         if (pCC->token != REXCC_TOKEN_REGEX) {
429                 /*
430                  * Unexpected char.
431                  */
432                 fprintf(stdout, "Line %d, (%s) Unexpected Char.\n", rex_cc_getlineno(pCC, pCC->input), "Error");
433                 return -1;
434         }
435         if (rex_cc_parseregex(pCC, &info) < 0)
436                 return -1;
437         r_array_add(pCC->parseinfo, &info);
438         return 0;
439 }
440
441
442 int rex_cc_parse(rexcc_t *pCC)
443 {
444         pCC->prolog.s = pCC->input;
445         pCC->prolog.size = 0;
446         while (pCC->input + 3 < pCC->end) {
447                 if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n'))) {
448                         break;
449                 }
450                 pCC->prolog.size += 1;
451                 pCC->input += 1;
452         }
453         rex_cc_gettoken(pCC);
454         if (pCC->token != REXCC_TOKEN_DELIMITER)
455                 return -1;
456         rex_cc_gettoken(pCC);
457         while (pCC->token) {
458                 if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE || pCC->token == REXCC_TOKEN_REGEX) {
459                         rex_cc_gettoken(pCC);
460                 } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) {
461                         if (rex_cc_parseline(pCC) < 0)
462                                 return -1;
463                 } else if (pCC->token == REXCC_TOKEN_DELIMITER) {
464                         rex_cc_gettoken(pCC);
465                         return 0;
466                 } else {
467                         /*
468                          * Unexpected char
469                          */
470                         fprintf(stdout, "Line %d, (%s) Unexpected Char.\n", rex_cc_getlineno(pCC, pCC->input), "Error");
471                         return -1;
472                 }
473                 pCC->epilog.s = pCC->input;
474                 pCC->epilog.size = pCC->end - pCC->input;
475         }
476         return -1;
477 }
478
479
480 int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text)
481 {
482         int ret = 0, i;
483         struct parseinfo_s *pi;
484
485         pCC->start = text->s;
486         pCC->input = text->s;
487         pCC->end = text->s + text->size;
488         r_array_setlength(pCC->parseinfo, 0);
489         if (rex_cc_parse(pCC) == 0) {
490                 for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
491                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
492                         if (rex_cc_load_pattern(pCC, &pi->regex, i) < 0) {
493                                 fprintf(stdout, "Line %d, (%s) Syntax error: ", rex_cc_getlineno(pCC, pi->id.s), "Error");
494                                 fwrite(pi->id.s, 1, pi->id.size, stdout);
495                                 fprintf(stdout, "  ");
496                                 fwrite(pi->regex.s, 1, pi->regex.size, stdout);
497                                 fprintf(stdout, "\n");
498                                 return -1;
499                         }
500 #if 0
501                         fwrite(pi->id.s, 1, pi->id.size, stdout);
502                         fprintf(stdout, " : ");
503                         fwrite(pi->regex.s, 1, pi->regex.size, stdout);
504                         fprintf(stdout, "\n");
505 #endif
506                 }
507         }
508         return ret;
509 }
510
511
512 void rex_cc_parseinfodump(rexcc_t *pCC)
513 {
514         long i;
515         struct parseinfo_s *pi;
516
517         for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
518                 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
519                 fwrite(pi->id.s, 1, pi->id.size, stdout);
520                 fprintf(stdout, "  ");
521                 fwrite(pi->regex.s, 1, pi->regex.size, stdout);
522                 fprintf(stdout, "\n");
523         }
524 }