RPA Toolkit
Work on documentation.
[rpatk.git] / rexcc / rexcc.c
1 /*
2  *  Regular Pattern Analyzer Toolkit (RPA/Tk)
3  *  Copyright (c) 2009-2012 Martin Stoilov
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  *  Martin Stoilov <martin@rpasearch.com>
19  */
20
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdarg.h>
26 #include <time.h>
27
28 #include "rlib/rutf.h"
29 #include "rlib/rmem.h"
30 #include "rexcc.h"
31
32
33 static rexdfa_t *tokens_dfa = NULL;
34 struct tokeninfo_s {
35         int id;
36         const char *name;
37         const char *regex;
38 };
39
40
41 struct parseinfo_s {
42         rbuffer_t id;
43         rbuffer_t regex;
44         int line;
45 };
46
47
48 #define REXCC_TOKEN_NONE                        0
49 #define REXCC_TOKEN_DELIMITER           1
50 #define REXCC_TOKEN_IDENTIFIER          2
51 #define REXCC_TOKEN_SPACE                       3
52 #define REXCC_TOKEN_CR                          4
53 #define REXCC_TOKEN_REGEX                       5
54
55
56 static struct tokeninfo_s tokens[] = {
57                 {REXCC_TOKEN_DELIMITER,         "delimiter",    "%%[ \\t]?[\\r]?[\\n]"},
58                 {REXCC_TOKEN_IDENTIFIER,        "identifier",   "([^\\t\\r\\n\'\" ]+|\"([^\"\\n]|\\\\\")*\")+|'.+'"},
59                 {REXCC_TOKEN_SPACE,                     "space",                "[ \\t]+"},
60                 {REXCC_TOKEN_CR,                        "cr ",                  "[\\r]?[\\n]"},
61                 {REXCC_TOKEN_REGEX,                     "regex",                "[ \\t](\\\\[\\r]?[\\n]|.)+\\r?\\n"},
62                 {0, NULL, NULL},
63 };
64
65
66
67 rexdfa_t * rex_cc_tokensdfa()
68 {
69         if (!tokens_dfa) {
70                 long start = -1;
71                 struct tokeninfo_s *ti = tokens;
72                 rexdb_t *nfadb, *dfadb;
73                 nfadb = rex_db_create(REXDB_TYPE_NFA);
74                 while (ti->regex) {
75                         start = rex_db_addexpression_s(nfadb, start, ti->regex, (rexuserdata_t)ti);
76                         if (start < 0) {
77                                 rex_db_destroy(nfadb);
78                                 return NULL;
79                         }
80                         ++ti;
81                 }
82                 dfadb = rex_db_createdfa(nfadb, start);
83                 if (dfadb) {
84                         tokens_dfa = rex_db_todfa(dfadb, 0);
85                 }
86                 rex_db_destroy(dfadb);
87                 rex_db_destroy(nfadb);
88         }
89         return tokens_dfa;
90 }
91
92
93 rexcc_t *rex_cc_create()
94 {
95         rexcc_t *pCC;
96         
97         pCC = (rexcc_t *)r_malloc(sizeof(*pCC));
98         if (!pCC)
99                 return (void *)0;
100         r_memset(pCC, 0, sizeof(*pCC));
101         pCC->parseinfo = r_array_create(sizeof(struct parseinfo_s));
102         pCC->nfa = rex_db_create(REXDB_TYPE_NFA);
103         pCC->startuid = -1L;
104         return pCC;
105 }
106
107
108 void rex_cc_destroy(rexcc_t *pCC)
109 {
110         if (!pCC)
111                 return;
112         r_array_destroy(pCC->parseinfo);
113         rex_db_destroy(pCC->nfa);
114         rex_dfa_destroy(pCC->dfa);
115         r_free(pCC->temp);
116         r_free(pCC);
117         if (tokens_dfa) {
118                 rex_dfa_destroy(tokens_dfa);
119                 tokens_dfa = NULL;
120         }
121 }
122
123
124 int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata)
125 {
126         pCC->startuid = rex_db_addexpression(pCC->nfa, pCC->startuid, buf->s, buf->size, userdata);
127         if (pCC->startuid < 0) {
128                 return -1;
129         }
130         return 0;
131 }
132
133
134 int rex_cc_vfprintf(FILE *out, int indent, const char *format, va_list ap)
135 {
136         while (indent > 0) {
137                 fprintf(out, "\t");
138                 --indent;
139         }
140         return vfprintf(out, format, ap);
141 }
142
143
144 int rex_cc_fprintf(FILE *out, int indent, const char *format, ...)
145 {
146         va_list ap;
147         int ret;
148
149         va_start(ap, format);
150         ret = rex_cc_vfprintf(out, indent, format, ap);
151         va_end(ap);
152         return ret;
153 }
154
155
156 static void rex_cc_output_gpl(FILE *out)
157 {
158         static char *gpl =
159                         "/*\n"
160                         " *  Regular Pattern Analyzer Toolkit(RPA/Tk)\n"
161                         " *  Copyright (c) 2009-2012 Martin Stoilov\n"
162                         " *\n"
163                         " *  This program is free software: you can redistribute it and/or modify\n"
164                         " *  it under the terms of the GNU General Public License as published by\n"
165                         " *  the Free Software Foundation, either version 3 of the License, or\n"
166                         " *  (at your option) any later version.\n"
167                         " *\n"
168                         " *  This program is distributed in the hope that it will be useful,\n"
169                         " *  but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
170                         " *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
171                         " *  GNU General Public License for more details.\n"
172                         " *\n"
173                         " *  You should have received a copy of the GNU General Public License\n"
174                         " *  along with this program.  If not, see <http://www.gnu.org/licenses/>.\n"
175                         " *\n"
176                         " *  Martin Stoilov <martin@rpasearch.com>\n"
177                         " */\n";
178         rex_cc_fprintf(out, 0, "%s\n", gpl);
179 }
180
181
182 static int rex_cc_output_statesubstates(rexcc_t *pCC, FILE *out, long nstate)
183 {
184         long i;
185         rexdfa_t *dfa = pCC->dfa;
186         rexdfs_t *s = rex_dfa_state(dfa, nstate);
187         rexdfss_t *ss;
188         struct parseinfo_s *pi;
189
190         for (i = 0; i < s->nsubstates; i++) {
191                 ss = rex_dfa_substate(dfa, nstate, i);
192                 if (ss->type == REX_STATETYPE_ACCEPT) {
193                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
194                         pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
195                         r_memset(pCC->temp, 0, pi->id.size + 1);
196                         r_memcpy(pCC->temp, pi->id.s, pi->id.size);
197                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
198                 } else {
199                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)ss->type, (unsigned long)ss->uid, (unsigned long)0);
200                 }
201         }
202         return 0;
203 }
204
205
206 static int rex_cc_output_substates(rexcc_t *pCC, FILE *out)
207 {
208         long i;
209         rexdfa_t *dfa = pCC->dfa;
210
211         rex_cc_fprintf(out, 0, "static rexdfss_t substates[] = {\n");
212         for (i = 0; i < dfa->nstates; i++)
213                 rex_cc_output_statesubstates(pCC, out, i);
214         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
215         rex_cc_fprintf(out, 0, "};\n");
216         return 0;
217 }
218
219
220 static int rex_cc_output_stateaccsubstates(rexcc_t *pCC, FILE *out, long nstate)
221 {
222         long i;
223         rexdfa_t *dfa = pCC->dfa;
224         rexdfs_t *s = rex_dfa_state(dfa, nstate);
225         rexdfss_t *ss;
226         struct parseinfo_s *pi;
227
228         for (i = 0; i < s->naccsubstates; i++) {
229                 ss = rex_dfa_accsubstate(dfa, nstate, i);
230                 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
231                 pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
232                 r_memset(pCC->temp, 0, pi->id.size + 1);
233                 r_memcpy(pCC->temp, pi->id.s, pi->id.size);
234                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
235         }
236         return 0;
237 }
238
239
240 static int rex_cc_output_accsubstates(rexcc_t *pCC, FILE *out)
241 {
242         long i;
243         rexdfa_t *dfa = pCC->dfa;
244
245         rex_cc_fprintf(out, 0, "static rexdfss_t accsubstates[] = {\n");
246         for (i = 0; i < dfa->nstates; i++)
247                 rex_cc_output_stateaccsubstates(pCC, out, i);
248         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
249         rex_cc_fprintf(out, 0, "};\n");
250         return 0;
251 }
252
253
254 static int rex_cc_output_statetransitions(rexcc_t *pCC, FILE *out, long nstate)
255 {
256         long i;
257         rexdfa_t *dfa = pCC->dfa;
258         rexdfs_t *s = rex_dfa_state(dfa, nstate);
259         rexdft_t *t;
260         for (i = 0; i < s->ntrans; i++) {
261                 t = rex_dfa_transition(dfa, nstate, i);
262                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)t->lowin, (unsigned long)t->highin, (unsigned long)t->state);
263         }
264         return 0;
265 }
266
267
268 static int rex_cc_output_transitions(rexcc_t *pCC, FILE *out)
269 {
270         long i;
271         rexdfa_t *dfa = pCC->dfa;
272
273         rex_cc_fprintf(out, 0, "static rexdft_t transitions[] = {\n");
274         for (i = 0; i < dfa->nstates; i++)
275                 rex_cc_output_statetransitions(pCC, out, i);
276         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
277         rex_cc_fprintf(out, 0, "};\n");
278         return 0;
279 }
280
281
282 static int rex_cc_output_states(rexcc_t *pCC, FILE *out)
283 {
284         long i;
285         rexdfs_t *s;
286         rexdfa_t *dfa = pCC->dfa;
287
288         rex_cc_fprintf(out, 0, "static rexdfs_t states[] = {\n");
289         for (i = 0; i < dfa->nstates; i++) {
290                 s = rex_dfa_state(dfa, i);
291                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n",
292                                 (unsigned long)s->type, (unsigned long)s->trans, (unsigned long)s->ntrans, (unsigned long)s->accsubstates,
293                                 (unsigned long)s->naccsubstates, (unsigned long)s->substates, (unsigned long)s->nsubstates);
294
295         }
296         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
297         rex_cc_fprintf(out, 0, "};\n");
298         return 0;
299 }
300
301
302 static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
303 {
304         rexdfa_t *dfa = pCC->dfa;
305
306         rex_cc_fprintf(out, 0, "static rexdfa_t ccdfa = {\n");
307         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nstates);
308         rex_cc_fprintf(out, 1, "%s,\n", "states");
309         rex_cc_fprintf(out, 1, "%lu,\n", dfa->ntrans);
310         rex_cc_fprintf(out, 1, "%s,\n", "transitions");
311         rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
312         rex_cc_fprintf(out, 1, "%s,\n", "accsubstates");
313         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nsubstates);
314         rex_cc_fprintf(out, 1, "%s,\n", "substates");
315         rex_cc_fprintf(out, 1, "{0, },\n");
316         rex_cc_fprintf(out, 0, "};\n");
317         return 0;
318 }
319
320
321
322 int rex_cc_output(rexcc_t *pCC, FILE *outc)
323 {
324         if (outc) {
325                 rex_cc_output_gpl(outc);
326                 rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n");
327                 if (pCC->prolog.size) {
328                         fwrite(pCC->prolog.s, 1, pCC->prolog.size, outc);
329                         rex_cc_fprintf(outc, 0, "\n");
330                 }
331                 rex_cc_output_accsubstates(pCC, outc);
332                 rex_cc_fprintf(outc, 0, "\n\n");
333                 rex_cc_output_substates(pCC, outc);
334                 rex_cc_fprintf(outc, 0, "\n\n");
335                 rex_cc_output_transitions(pCC, outc);
336                 rex_cc_fprintf(outc, 0, "\n\n");
337                 rex_cc_output_states(pCC, outc);
338                 rex_cc_fprintf(outc, 0, "\n\n");
339                 rex_cc_output_dfa(pCC, outc);
340                 if (pCC->epilog.size) {
341                         rex_cc_fprintf(outc, 0, "\n");
342                         fwrite(pCC->epilog.s, 1, pCC->epilog.size, outc);
343                 }
344         }
345         return 0;
346 }
347
348
349 static int rex_cc_getlineno(rexcc_t *pCC, const char *input)
350 {
351         int ret = 1;
352
353         while (--input >= pCC->start) {
354                 if (*input == '\n')
355                         ret += 1;
356         }
357         return ret;
358 }
359
360
361 int rex_cc_gettoken(rexcc_t *pCC)
362 {
363         ruint32 wc = 0;
364         int inc, ret = 0;
365         long nstate = REX_DFA_STARTSTATE;
366         const char *input = pCC->input;
367         rexdfa_t *dfa = rex_cc_tokensdfa();
368         rexdfs_t *s = NULL;
369         rexdfss_t *ss = NULL;
370
371         pCC->token = 0;
372         pCC->tokenlen = 0;
373         if (!dfa) {
374                 /*
375                  * Error
376                  */
377                 return -1;
378         }
379         while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)pCC->end)) > 0) {
380                 if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) <= 0)
381                         break;
382                 input += inc;
383                 s = REX_DFA_STATE(dfa, nstate);
384                 ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
385                 if (s->type == REX_STATETYPE_ACCEPT) {
386                         pCC->token = ((struct tokeninfo_s *)ss->userdata)->id;
387                         ret = input - pCC->input;
388                         if (ss && ((struct tokeninfo_s *)ss->userdata)->id == 1) {
389                                 break;
390                         }
391                 }
392         }
393         if (ret) {
394                 pCC->tokenptr = pCC->input;
395                 pCC->tokenlen = ret;
396                 pCC->input += ret;
397         }
398         return ret;
399 }
400
401
402 static int rex_cc_parseid(rexcc_t *pCC, struct parseinfo_s *info)
403 {
404         info->id.s = pCC->tokenptr;
405         info->id.size = pCC->tokenlen;
406         rex_cc_gettoken(pCC);
407         return 0;
408 }
409
410
411 static int rex_cc_parseregex(rexcc_t *pCC, struct parseinfo_s *info)
412 {
413         info->regex.s = pCC->tokenptr;
414         info->regex.size = pCC->tokenlen;
415         rex_cc_gettoken(pCC);
416         return 0;
417 }
418
419
420 static int rex_cc_parseline(rexcc_t *pCC)
421 {
422         struct parseinfo_s info;
423
424         r_memset(&info, 0, sizeof(info));
425         if (rex_cc_parseid(pCC, &info) < 0)
426                 return -1;
427         if (pCC->token != REXCC_TOKEN_REGEX) {
428                 /*
429                  * Unexpected char.
430                  */
431                 fprintf(stdout, "Line %d, (%s) Unexpected Char.\n", rex_cc_getlineno(pCC, pCC->input), "Error");
432                 return -1;
433         }
434         if (rex_cc_parseregex(pCC, &info) < 0)
435                 return -1;
436         r_array_add(pCC->parseinfo, &info);
437         return 0;
438 }
439
440
441 int rex_cc_parse(rexcc_t *pCC)
442 {
443         pCC->prolog.s = pCC->input;
444         pCC->prolog.size = 0;
445         while (pCC->input + 3 < pCC->end) {
446                 if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n'))) {
447                         break;
448                 }
449                 pCC->prolog.size += 1;
450                 pCC->input += 1;
451         }
452         rex_cc_gettoken(pCC);
453         if (pCC->token != REXCC_TOKEN_DELIMITER)
454                 return -1;
455         rex_cc_gettoken(pCC);
456         while (pCC->token) {
457                 if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE || pCC->token == REXCC_TOKEN_REGEX) {
458                         rex_cc_gettoken(pCC);
459                 } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) {
460                         if (rex_cc_parseline(pCC) < 0)
461                                 return -1;
462                 } else if (pCC->token == REXCC_TOKEN_DELIMITER) {
463                         rex_cc_gettoken(pCC);
464                         return 0;
465                 } else {
466                         /*
467                          * Unexpected char
468                          */
469                         fprintf(stdout, "Line %d, (%s) Unexpected Char.\n", rex_cc_getlineno(pCC, pCC->input), "Error");
470                         return -1;
471                 }
472                 pCC->epilog.s = pCC->input;
473                 pCC->epilog.size = pCC->end - pCC->input;
474         }
475         return -1;
476 }
477
478
479 int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text)
480 {
481         int ret = 0, i;
482         struct parseinfo_s *pi;
483
484         pCC->start = text->s;
485         pCC->input = text->s;
486         pCC->end = text->s + text->size;
487         r_array_setlength(pCC->parseinfo, 0);
488         if (rex_cc_parse(pCC) == 0) {
489                 for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
490                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
491                         if (rex_cc_load_pattern(pCC, &pi->regex, i) < 0) {
492                                 fprintf(stdout, "Line %d, (%s) Syntax error: ", rex_cc_getlineno(pCC, pi->id.s), "Error");
493                                 fwrite(pi->id.s, 1, pi->id.size, stdout);
494                                 fprintf(stdout, "  ");
495                                 fwrite(pi->regex.s, 1, pi->regex.size, stdout);
496                                 fprintf(stdout, "\n");
497                                 return -1;
498                         }
499 #if 0
500                         fwrite(pi->id.s, 1, pi->id.size, stdout);
501                         fprintf(stdout, " : ");
502                         fwrite(pi->regex.s, 1, pi->regex.size, stdout);
503                         fprintf(stdout, "\n");
504 #endif
505                 }
506         }
507         return ret;
508 }
509
510
511 void rex_cc_parseinfodump(rexcc_t *pCC)
512 {
513         long i;
514         struct parseinfo_s *pi;
515
516         for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
517                 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
518                 fwrite(pi->id.s, 1, pi->id.size, stdout);
519                 fprintf(stdout, "  ");
520                 fwrite(pi->regex.s, 1, pi->regex.size, stdout);
521                 fprintf(stdout, "\n");
522         }
523 }