RPA Toolkit
4eddb2db38479e782e67ea685d29930477686dd7
[rpatk.git] / rexcc / rexcc.c
1 /*
2  *  Regular Pattern Analyzer Toolkit (RPA/Tk)
3  *  Copyright (c) 2009-2012 Martin Stoilov
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  *  Martin Stoilov <martin@rpasearch.com>
19  */
20
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdarg.h>
26 #include <time.h>
27
28 #include "rlib/rutf.h"
29 #include "rlib/rmem.h"
30 #include "rexcc.h"
31
32
33 static rexdfa_t *tokens_dfa = NULL;
34 struct tokeninfo_s {
35         int id;
36         const char *name;
37         const char *regex;
38 };
39
40
41 struct parseinfo_s {
42         rbuffer_t id;
43         rbuffer_t regex;
44 };
45
46
47 #define REXCC_TOKEN_NONE                        0
48 #define REXCC_TOKEN_DELIMITER           1
49 #define REXCC_TOKEN_IDENTIFIER          2
50 #define REXCC_TOKEN_SPACE                       3
51 #define REXCC_TOKEN_CR                          4
52 #define REXCC_TOKEN_REGEX                       5
53
54
55 static struct tokeninfo_s tokens[] = {
56                 {REXCC_TOKEN_DELIMITER,         "delimiter",    "%%[ \\t]?[\\r]?[\\n]"},
57                 {REXCC_TOKEN_IDENTIFIER,        "identifier",   "([^\\t\\r\\n\'\" ]+|\"([^\"\\n]|\\\\\")*\")+|'.+'"},
58                 {REXCC_TOKEN_SPACE,                     "space",                "[ \\t]+"},
59                 {REXCC_TOKEN_CR,                        "cr ",                  "[\\r]?[\\n]"},
60                 {REXCC_TOKEN_REGEX,                     "regex",                "[ \\t](\\\\[\\r]?[\\n]|.)+\\r?\\n"},
61                 {0, NULL, NULL},
62 };
63
64
65
66 rexdfa_t * rex_cc_tokensdfa()
67 {
68         if (!tokens_dfa) {
69                 long start = -1;
70                 struct tokeninfo_s *ti = tokens;
71                 rexdb_t *nfadb, *dfadb;
72                 nfadb = rex_db_create(REXDB_TYPE_NFA);
73                 while (ti->regex) {
74                         start = rex_db_addexpression_s(nfadb, start, ti->regex, (rexuserdata_t)ti);
75                         if (start < 0) {
76                                 rex_db_destroy(nfadb);
77                                 return NULL;
78                         }
79                         ++ti;
80                 }
81                 dfadb = rex_db_createdfa(nfadb, start);
82                 if (dfadb) {
83                         tokens_dfa = rex_db_todfa(dfadb, 0);
84                 }
85                 rex_db_destroy(dfadb);
86                 rex_db_destroy(nfadb);
87         }
88         return tokens_dfa;
89 }
90
91
92 rexcc_t *rex_cc_create()
93 {
94         rexcc_t *pCC;
95         
96         pCC = (rexcc_t *)r_malloc(sizeof(*pCC));
97         if (!pCC)
98                 return (void *)0;
99         r_memset(pCC, 0, sizeof(*pCC));
100         pCC->parseinfo = r_array_create(sizeof(struct parseinfo_s));
101         pCC->nfa = rex_db_create(REXDB_TYPE_NFA);
102         pCC->startuid = -1L;
103         return pCC;
104 }
105
106
107 void rex_cc_destroy(rexcc_t *pCC)
108 {
109         if (!pCC)
110                 return;
111         r_array_destroy(pCC->parseinfo);
112         rex_db_destroy(pCC->nfa);
113         rex_dfa_destroy(pCC->dfa);
114         r_free(pCC->temp);
115         r_free(pCC);
116         if (tokens_dfa) {
117                 rex_dfa_destroy(tokens_dfa);
118                 tokens_dfa = NULL;
119         }
120 }
121
122
123 int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata)
124 {
125         pCC->startuid = rex_db_addexpression(pCC->nfa, pCC->startuid, buf->s, buf->size, userdata);
126         if (pCC->startuid < 0) {
127                 return -1;
128         }
129         return 0;
130 }
131
132
133 int rex_cc_vfprintf(FILE *out, int indent, const char *format, va_list ap)
134 {
135         while (indent > 0) {
136                 fprintf(out, "\t");
137                 --indent;
138         }
139         return vfprintf(out, format, ap);
140 }
141
142
143 int rex_cc_fprintf(FILE *out, int indent, const char *format, ...)
144 {
145         va_list ap;
146         int ret;
147
148         va_start(ap, format);
149         ret = rex_cc_vfprintf(out, indent, format, ap);
150         va_end(ap);
151         return ret;
152 }
153
154
155 static int rex_cc_output_statesubstates(rexcc_t *pCC, FILE *out, long nstate)
156 {
157         long i;
158         rexdfa_t *dfa = pCC->dfa;
159         rexdfs_t *s = rex_dfa_state(dfa, nstate);
160         rexdfss_t *ss;
161         struct parseinfo_s *pi;
162
163         for (i = 0; i < s->nsubstates; i++) {
164                 ss = rex_dfa_substate(dfa, nstate, i);
165                 if (ss->type == REX_STATETYPE_ACCEPT) {
166                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
167                         pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
168                         r_memset(pCC->temp, 0, pi->id.size + 1);
169                         r_memcpy(pCC->temp, pi->id.s, pi->id.size);
170                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
171                 } else {
172                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)ss->type, (unsigned long)ss->uid, (unsigned long)0);
173                 }
174         }
175         return 0;
176 }
177
178
179 static int rex_cc_output_substates(rexcc_t *pCC, FILE *out)
180 {
181         long i;
182         rexdfa_t *dfa = pCC->dfa;
183
184         rex_cc_fprintf(out, 0, "static rexdfss_t substates[] = {\n");
185         for (i = 0; i < dfa->nstates; i++)
186                 rex_cc_output_statesubstates(pCC, out, i);
187         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
188         rex_cc_fprintf(out, 0, "};\n");
189         return 0;
190 }
191
192
193 static int rex_cc_output_stateaccsubstates(rexcc_t *pCC, FILE *out, long nstate)
194 {
195         long i;
196         rexdfa_t *dfa = pCC->dfa;
197         rexdfs_t *s = rex_dfa_state(dfa, nstate);
198         rexdfss_t *ss;
199         struct parseinfo_s *pi;
200
201         for (i = 0; i < s->naccsubstates; i++) {
202                 ss = rex_dfa_accsubstate(dfa, nstate, i);
203                 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
204                 pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
205                 r_memset(pCC->temp, 0, pi->id.size + 1);
206                 r_memcpy(pCC->temp, pi->id.s, pi->id.size);
207                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
208         }
209         return 0;
210 }
211
212
213 static int rex_cc_output_accsubstates(rexcc_t *pCC, FILE *out)
214 {
215         long i;
216         rexdfa_t *dfa = pCC->dfa;
217
218         rex_cc_fprintf(out, 0, "static rexdfss_t accsubstates[] = {\n");
219         for (i = 0; i < dfa->nstates; i++)
220                 rex_cc_output_stateaccsubstates(pCC, out, i);
221         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
222         rex_cc_fprintf(out, 0, "};\n");
223         return 0;
224 }
225
226
227 static int rex_cc_output_statetransitions(rexcc_t *pCC, FILE *out, long nstate)
228 {
229         long i;
230         rexdfa_t *dfa = pCC->dfa;
231         rexdfs_t *s = rex_dfa_state(dfa, nstate);
232         rexdft_t *t;
233         for (i = 0; i < s->ntrans; i++) {
234                 t = rex_dfa_transition(dfa, nstate, i);
235                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)t->lowin, (unsigned long)t->highin, (unsigned long)t->state);
236         }
237         return 0;
238 }
239
240
241 static int rex_cc_output_transitions(rexcc_t *pCC, FILE *out)
242 {
243         long i;
244         rexdfa_t *dfa = pCC->dfa;
245
246         rex_cc_fprintf(out, 0, "static rexdft_t transitions[] = {\n");
247         for (i = 0; i < dfa->nstates; i++)
248                 rex_cc_output_statetransitions(pCC, out, i);
249         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
250         rex_cc_fprintf(out, 0, "};\n");
251         return 0;
252 }
253
254
255 static int rex_cc_output_states(rexcc_t *pCC, FILE *out)
256 {
257         long i;
258         rexdfs_t *s;
259         rexdfa_t *dfa = pCC->dfa;
260
261         rex_cc_fprintf(out, 0, "static rexdfs_t states[] = {\n");
262         for (i = 0; i < dfa->nstates; i++) {
263                 s = rex_dfa_state(dfa, i);
264                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n",
265                                 (unsigned long)s->type, (unsigned long)s->trans, (unsigned long)s->ntrans, (unsigned long)s->accsubstates,
266                                 (unsigned long)s->naccsubstates, (unsigned long)s->substates, (unsigned long)s->nsubstates);
267
268         }
269         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
270         rex_cc_fprintf(out, 0, "};\n");
271         return 0;
272 }
273
274
275 static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
276 {
277         rexdfa_t *dfa = pCC->dfa;
278
279         rex_cc_fprintf(out, 0, "static rexdfa_t ccdfa = {\n");
280         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nstates);
281         rex_cc_fprintf(out, 1, "%s,\n", "states");
282         rex_cc_fprintf(out, 1, "%lu,\n", dfa->ntrans);
283         rex_cc_fprintf(out, 1, "%s,\n", "transitions");
284         rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
285         rex_cc_fprintf(out, 1, "%s,\n", "accsubstates");
286         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nsubstates);
287         rex_cc_fprintf(out, 1, "%s,\n", "substates");
288         rex_cc_fprintf(out, 1, "{0, },\n");
289         rex_cc_fprintf(out, 0, "};\n");
290         return 0;
291 }
292
293
294
295 int rex_cc_output(rexcc_t *pCC, FILE *outc)
296 {
297
298         if (outc) {
299                 rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n");
300                 if (pCC->prolog.size) {
301                         fwrite(pCC->prolog.s, 1, pCC->prolog.size, outc);
302                         rex_cc_fprintf(outc, 0, "\n");
303                 }
304                 rex_cc_output_accsubstates(pCC, outc);
305                 rex_cc_fprintf(outc, 0, "\n\n");
306                 rex_cc_output_substates(pCC, outc);
307                 rex_cc_fprintf(outc, 0, "\n\n");
308                 rex_cc_output_transitions(pCC, outc);
309                 rex_cc_fprintf(outc, 0, "\n\n");
310                 rex_cc_output_states(pCC, outc);
311                 rex_cc_fprintf(outc, 0, "\n\n");
312                 rex_cc_output_dfa(pCC, outc);
313                 if (pCC->epilog.size) {
314                         rex_cc_fprintf(outc, 0, "\n");
315                         fwrite(pCC->epilog.s, 1, pCC->epilog.size, outc);
316                 }
317         }
318         return 0;
319 }
320
321
322 int rex_cc_gettoken(rexcc_t *pCC)
323 {
324         ruint32 wc = 0;
325         int inc, ret = 0;
326         long nstate = REX_DFA_STARTSTATE;
327         const char *input = pCC->input;
328         rexdfa_t *dfa = rex_cc_tokensdfa();
329         rexdfs_t *s = NULL;
330         rexdfss_t *ss = NULL;
331
332         pCC->token = 0;
333         pCC->tokenlen = 0;
334         if (!dfa) {
335                 /*
336                  * Error
337                  */
338                 return -1;
339         }
340         while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)pCC->end)) > 0) {
341                 if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) <= 0)
342                         break;
343                 input += inc;
344                 s = REX_DFA_STATE(dfa, nstate);
345                 ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
346                 if (s->type == REX_STATETYPE_ACCEPT) {
347                         pCC->token = ((struct tokeninfo_s *)ss->userdata)->id;
348                         ret = input - pCC->input;
349                         if (ss && ((struct tokeninfo_s *)ss->userdata)->id == 1) {
350                                 break;
351                         }
352                 }
353         }
354         if (ret) {
355                 pCC->tokenptr = pCC->input;
356                 pCC->tokenlen = ret;
357                 pCC->input += ret;
358         }
359         return ret;
360 }
361
362
363 static int rex_cc_parseid(rexcc_t *pCC, struct parseinfo_s *info)
364 {
365         info->id.s = pCC->tokenptr;
366         info->id.size = pCC->tokenlen;
367         rex_cc_gettoken(pCC);
368         return 0;
369 }
370
371
372 static int rex_cc_parseregex(rexcc_t *pCC, struct parseinfo_s *info)
373 {
374         info->regex.s = pCC->tokenptr;
375         info->regex.size = pCC->tokenlen;
376         rex_cc_gettoken(pCC);
377         return 0;
378 }
379
380
381 static int rex_cc_parseline(rexcc_t *pCC)
382 {
383         struct parseinfo_s info;
384
385         r_memset(&info, 0, sizeof(info));
386         if (rex_cc_parseid(pCC, &info) < 0)
387                 return -1;
388         if (pCC->token != REXCC_TOKEN_REGEX) {
389                 /*
390                  * Unexpected char.
391                  */
392                 return -1;
393         }
394         if (rex_cc_parseregex(pCC, &info) < 0)
395                 return -1;
396         r_array_add(pCC->parseinfo, &info);
397         return 0;
398 }
399
400
401 int rex_cc_parse(rexcc_t *pCC)
402 {
403         pCC->prolog.s = pCC->input;
404         pCC->prolog.size = 0;
405         while (pCC->input + 3 < pCC->end) {
406                 if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n')))
407                         break;
408                 pCC->prolog.size += 1;
409                 pCC->input += 1;
410         }
411         rex_cc_gettoken(pCC);
412         if (pCC->token != REXCC_TOKEN_DELIMITER)
413                 return -1;
414         rex_cc_gettoken(pCC);
415         while (pCC->token) {
416                 if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE || pCC->token == REXCC_TOKEN_REGEX) {
417                         rex_cc_gettoken(pCC);
418                 } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) {
419                         rex_cc_parseline(pCC);
420                 } else if (pCC->token == REXCC_TOKEN_DELIMITER) {
421                         rex_cc_gettoken(pCC);
422                         return 0;
423                 } else {
424                         /*
425                          * Unexpected char
426                          */
427                         return -1;
428                 }
429                 pCC->epilog.s = pCC->input;
430                 pCC->epilog.size = pCC->end - pCC->input;
431         }
432         return -1;
433 }
434
435
436 int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text)
437 {
438         int ret = 0, i;
439         struct parseinfo_s *pi;
440
441         pCC->input = text->s;
442         pCC->end = text->s + text->size;
443         r_array_setlength(pCC->parseinfo, 0);
444         if (rex_cc_parse(pCC) == 0) {
445                 for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
446                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
447                         if (rex_cc_load_pattern(pCC, &pi->regex, i) < 0) {
448                                 return -1;
449                         }
450 #if 0
451                         fwrite(pi->id.s, 1, pi->id.size, stdout);
452                         fprintf(stdout, " : ");
453                         fwrite(pi->regex.s, 1, pi->regex.size, stdout);
454                         fprintf(stdout, "\n");
455 #endif
456                 }
457         }
458         return ret;
459 }