RPA Toolkit
work on rexcc
[rpatk.git] / rexcc / rexcc.c
1 /*
2  *  Regular Pattern Analyzer (RPA)
3  *  Copyright (c) 2009-2010 Martin Stoilov
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  *  Martin Stoilov <martin@rpasearch.com>
19  */
20
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdarg.h>
26 #include <time.h>
27
28 /*
29  * Temporary here. Need to fix the encoding definitions.
30  */
31 #include "rpa/rpastat.h"
32
33 #include "rlib/rutf.h"
34 #include "rlib/rmem.h"
35 #include "rexcc.h"
36
37
38 static rexdfa_t *tokens_dfa = NULL;
39 struct tokeninfo_s {
40         int id;
41         const char *name;
42         const char *regex;
43 };
44
45
46 struct parseinfo_s {
47         rbuffer_t id;
48         rbuffer_t regex;
49 };
50
51
52 #define REXCC_TOKEN_NONE                        0
53 #define REXCC_TOKEN_DELIMITER           1
54 #define REXCC_TOKEN_IDENTIFIER          2
55 #define REXCC_TOKEN_SPACE                       3
56 #define REXCC_TOKEN_CR                          4
57 #define REXCC_TOKEN_REGEX                       5
58
59
60 static struct tokeninfo_s tokens[] = {
61                 {REXCC_TOKEN_DELIMITER,         "delimiter",    "%%[ \\t]?[\\r]?[\\n]"},
62                 {REXCC_TOKEN_IDENTIFIER,        "identifier",   "([^\\t\\r\\n\'\" ]+|\".*\")+|'.+'"},
63                 {REXCC_TOKEN_SPACE,                     "space",                "[ \\t]+"},
64                 {REXCC_TOKEN_CR,                        "cr ",                  "[\\r]?[\\n]"},
65                 {REXCC_TOKEN_REGEX,                     "regex",                "[ \\t](\\\\[\\r]?[\\n]|.)+\\r?\\n"},
66                 {0, NULL, NULL},
67 };
68
69
70
71 rexdfa_t * rex_cc_tokensdfa()
72 {
73         if (!tokens_dfa) {
74                 long start = -1;
75                 struct tokeninfo_s *ti = tokens;
76                 rexdb_t *nfadb, *dfadb;
77                 nfadb = rex_db_create(REXDB_TYPE_NFA);
78                 while (ti->regex) {
79                         start = rex_db_addexpression_s(nfadb, start, ti->regex, (rexuserdata_t)ti);
80                         if (start < 0) {
81                                 rex_db_destroy(nfadb);
82                                 return NULL;
83                         }
84                         ++ti;
85                 }
86
87 //              if (nfadb) {
88 //                      long j;
89 //                      rexdb_t *db = nfadb;
90 //                      for (j = 0; j < r_array_length(db->states); j++) {
91 //                              rex_db_dumpstate(db, j);
92 //                      }
93 //              }
94
95                 dfadb = rex_db_createdfa(nfadb, start);
96                 if (dfadb) {
97                         tokens_dfa = rex_db_todfa(dfadb, 0);
98                 }
99                 rex_db_destroy(dfadb);
100                 rex_db_destroy(nfadb);
101         }
102         return tokens_dfa;
103 }
104
105 rexcc_t *rex_cc_create()
106 {
107         rexcc_t *pCC;
108         
109         pCC = (rexcc_t *)r_malloc(sizeof(*pCC));
110         if (!pCC)
111                 return (void *)0;
112         r_memset(pCC, 0, sizeof(*pCC));
113         pCC->parseinfo = r_array_create(sizeof(struct parseinfo_s));
114         pCC->nfa = rex_db_create(REXDB_TYPE_NFA);
115         pCC->startuid = -1L;
116         return pCC;
117 }
118
119
120 void rex_cc_destroy(rexcc_t *pCC)
121 {
122         if (!pCC)
123                 return;
124         r_array_destroy(pCC->parseinfo);
125         rex_db_destroy(pCC->nfa);
126         rex_dfa_destroy(pCC->dfa);
127         r_free(pCC->temp);
128         r_free(pCC);
129 }
130
131
132 int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata)
133 {
134         pCC->startuid = rex_db_addexpression(pCC->nfa, pCC->startuid, buf->s, buf->size, userdata);
135         if (pCC->startuid < 0) {
136                 return -1;
137         }
138         return 0;
139 }
140
141
142 int rex_cc_vfprintf(FILE *out, int indent, const char *format, va_list ap)
143 {
144         while (indent > 0) {
145                 fprintf(out, "\t");
146                 --indent;
147         }
148         return vfprintf(out, format, ap);
149 }
150
151
152 int rex_cc_fprintf(FILE *out, int indent, const char *format, ...)
153 {
154         va_list ap;
155         int ret;
156
157         va_start(ap, format);
158         ret = rex_cc_vfprintf(out, indent, format, ap);
159         va_end(ap);
160         return ret;
161 }
162
163
164 static int rex_cc_output_statesubstates(rexcc_t *pCC, FILE *out, long nstate)
165 {
166         long i;
167         rexdfa_t *dfa = pCC->dfa;
168         rexdfs_t *s = rex_dfa_state(dfa, nstate);
169         rexdfss_t *ss;
170         struct parseinfo_s *pi;
171
172         for (i = 0; i < s->nsubstates; i++) {
173                 ss = rex_dfa_substate(dfa, nstate, i);
174                 if (ss->type == REX_STATETYPE_ACCEPT) {
175                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
176                         pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
177                         r_memset(pCC->temp, 0, pi->id.size + 1);
178                         r_memcpy(pCC->temp, pi->id.s, pi->id.size);
179                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", ss->type, ss->uid, pCC->temp);
180                 } else {
181                         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", ss->type, ss->uid, (unsigned long)0);
182                 }
183         }
184         return 0;
185 }
186
187
188 static int rex_cc_output_substates(rexcc_t *pCC, FILE *out)
189 {
190         long i;
191         rexdfa_t *dfa = pCC->dfa;
192
193         rex_cc_fprintf(out, 0, "static rexdfss_t substates[] = {\n");
194         for (i = 0; i < dfa->nstates; i++)
195                 rex_cc_output_statesubstates(pCC, out, i);
196         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0, 0, 0);
197         rex_cc_fprintf(out, 0, "};\n");
198
199         return 0;
200 }
201
202
203 static int rex_cc_output_stateaccsubstates(rexcc_t *pCC, FILE *out, long nstate)
204 {
205         long i;
206         rexdfa_t *dfa = pCC->dfa;
207         rexdfs_t *s = rex_dfa_state(dfa, nstate);
208         rexdfss_t *ss;
209         struct parseinfo_s *pi;
210
211         for (i = 0; i < s->naccsubstates; i++) {
212                 ss = rex_dfa_accsubstate(dfa, nstate, i);
213                 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
214                 pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
215                 r_memset(pCC->temp, 0, pi->id.size + 1);
216                 r_memcpy(pCC->temp, pi->id.s, pi->id.size);
217                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", ss->type, ss->uid, pCC->temp);
218         }
219         return 0;
220 }
221
222
223 static int rex_cc_output_accsubstates(rexcc_t *pCC, FILE *out)
224 {
225         long i;
226         rexdfa_t *dfa = pCC->dfa;
227
228         rex_cc_fprintf(out, 0, "static rexdfss_t accsubstates[] = {\n");
229         for (i = 0; i < dfa->nstates; i++)
230                 rex_cc_output_stateaccsubstates(pCC, out, i);
231         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0, 0, 0);
232         rex_cc_fprintf(out, 0, "};\n");
233
234         return 0;
235 }
236
237
238 static int rex_cc_output_statetransitions(rexcc_t *pCC, FILE *out, long nstate)
239 {
240         long i;
241         rexdfa_t *dfa = pCC->dfa;
242         rexdfs_t *s = rex_dfa_state(dfa, nstate);
243         rexdft_t *t;
244         for (i = 0; i < s->ntrans; i++) {
245                 t = rex_dfa_transition(dfa, nstate, i);
246                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", t->lowin, t->highin, t->state);
247         }
248         return 0;
249 }
250
251
252 static int rex_cc_output_transitions(rexcc_t *pCC, FILE *out)
253 {
254         long i;
255         rexdfa_t *dfa = pCC->dfa;
256
257         rex_cc_fprintf(out, 0, "static rexdft_t transitions[] = {\n");
258         for (i = 0; i < dfa->nstates; i++)
259                 rex_cc_output_statetransitions(pCC, out, i);
260         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0, 0, 0);
261         rex_cc_fprintf(out, 0, "};\n");
262
263         return 0;
264 }
265
266
267 static int rex_cc_output_states(rexcc_t *pCC, FILE *out)
268 {
269         long i;
270         rexdfs_t *s;
271         rexdfa_t *dfa = pCC->dfa;
272
273         rex_cc_fprintf(out, 0, "static rexdfs_t states[] = {\n");
274         for (i = 0; i < dfa->nstates; i++) {
275                 s = rex_dfa_state(dfa, i);
276                 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n",
277                                 s->type, s->trans, s->ntrans, s->accsubstates, s->naccsubstates, s->substates, s->nsubstates);
278
279         }
280         rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n", 0, 0, 0, 0, 0, 0, 0);
281         rex_cc_fprintf(out, 0, "};\n");
282
283         return 0;
284 }
285
286
287 static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
288 {
289         rexdfa_t *dfa = pCC->dfa;
290
291         rex_cc_fprintf(out, 0, "static rexdfa_t ccdfa = {\n");
292         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nstates);
293         rex_cc_fprintf(out, 1, "%s,\n", "states");
294         rex_cc_fprintf(out, 1, "%lu,\n", dfa->ntrans);
295         rex_cc_fprintf(out, 1, "%s,\n", "transitions");
296         rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
297         rex_cc_fprintf(out, 1, "%s,\n", "accsubstates");
298         rex_cc_fprintf(out, 1, "%lu,\n", dfa->nsubstates);
299         rex_cc_fprintf(out, 1, "%s,\n", "substates");
300         rex_cc_fprintf(out, 1, "{0, },\n");
301         rex_cc_fprintf(out, 0, "};\n");
302
303         return 0;
304 }
305
306
307
308 int rex_cc_output(rexcc_t *pCC, FILE *outc)
309 {
310
311         if (outc) {
312                 rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n");
313                 if (pCC->prolog.size) {
314                         fwrite(pCC->prolog.s, 1, pCC->prolog.size, outc);
315                         rex_cc_fprintf(outc, 0, "\n");
316                 }
317                 rex_cc_output_accsubstates(pCC, outc);
318                 rex_cc_fprintf(outc, 0, "\n\n");
319                 rex_cc_output_substates(pCC, outc);
320                 rex_cc_fprintf(outc, 0, "\n\n");
321                 rex_cc_output_transitions(pCC, outc);
322                 rex_cc_fprintf(outc, 0, "\n\n");
323                 rex_cc_output_states(pCC, outc);
324                 rex_cc_fprintf(outc, 0, "\n\n");
325                 rex_cc_output_dfa(pCC, outc);
326                 if (pCC->epilog.size) {
327                         rex_cc_fprintf(outc, 0, "\n");
328                         fwrite(pCC->epilog.s, 1, pCC->epilog.size, outc);
329                 }
330         }
331
332         return 0;
333 }
334
335
336 int rex_cc_gettoken(rexcc_t *pCC)
337 {
338         ruint32 wc = 0;
339         int inc, ret = 0;
340         long nstate = REX_DFA_STARTSTATE;
341         const char *input = pCC->input;
342         rexdfa_t *dfa = rex_cc_tokensdfa();
343         rexdfs_t *s = NULL;
344         rexdfss_t *ss = NULL;
345
346         pCC->token = 0;
347         pCC->tokenlen = 0;
348         if (!dfa) {
349                 /*
350                  * Error
351                  */
352                 return -1;
353         }
354         while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)pCC->end)) > 0) {
355                 if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) <= 0)
356                         break;
357                 input += inc;
358                 s = REX_DFA_STATE(dfa, nstate);
359                 ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
360                 if (s->type == REX_STATETYPE_ACCEPT) {
361                         pCC->token = ((struct tokeninfo_s *)ss->userdata)->id;
362                         ret = input - pCC->input;
363                         if (ss && ((struct tokeninfo_s *)ss->userdata)->id == 1) {
364                                 break;
365                         }
366                 }
367         }
368         if (ret) {
369                 pCC->tokenptr = pCC->input;
370                 pCC->tokenlen = ret;
371                 pCC->input += ret;
372         }
373         return ret;
374 }
375
376
377 static int rex_cc_parseid(rexcc_t *pCC, struct parseinfo_s *info)
378 {
379         info->id.s = pCC->tokenptr;
380         info->id.size = pCC->tokenlen;
381         rex_cc_gettoken(pCC);
382         return 0;
383 }
384
385
386 static int rex_cc_parseregex(rexcc_t *pCC, struct parseinfo_s *info)
387 {
388         info->regex.s = pCC->tokenptr;
389         info->regex.size = pCC->tokenlen;
390         rex_cc_gettoken(pCC);
391         return 0;
392 }
393
394
395 static int rex_cc_parseline(rexcc_t *pCC)
396 {
397         struct parseinfo_s info;
398
399         r_memset(&info, 0, sizeof(info));
400         if (rex_cc_parseid(pCC, &info) < 0)
401                 return -1;
402         if (pCC->token != REXCC_TOKEN_REGEX) {
403                 /*
404                  * Unexpected char.
405                  */
406                 return -1;
407         }
408         if (rex_cc_parseregex(pCC, &info) < 0)
409                 return -1;
410         r_array_add(pCC->parseinfo, &info);
411         return 0;
412 }
413
414
415 int rex_cc_parse(rexcc_t *pCC)
416 {
417         pCC->prolog.s = pCC->input;
418         pCC->prolog.size = 0;
419         while (pCC->input + 3 < pCC->end) {
420                 if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n')))
421                         break;
422                 pCC->prolog.size += 1;
423                 pCC->input += 1;
424         }
425         rex_cc_gettoken(pCC);
426         if (pCC->token != REXCC_TOKEN_DELIMITER)
427                 return -1;
428         rex_cc_gettoken(pCC);
429         while (pCC->token) {
430                 if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE) {
431                         rex_cc_gettoken(pCC);
432                 } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) {
433                         rex_cc_parseline(pCC);
434                 } else if (pCC->token == REXCC_TOKEN_DELIMITER) {
435                         rex_cc_gettoken(pCC);
436                         return 0;
437                 } else {
438                         /*
439                          * Unexpected char
440                          */
441                         return -1;
442                 }
443                 pCC->epilog.s = pCC->input;
444                 pCC->epilog.size = pCC->end - pCC->input;
445         }
446         return -1;
447 }
448
449
450 int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text)
451 {
452         int ret = 0, i;
453         struct parseinfo_s *pi;
454
455         pCC->input = text->s;
456         pCC->end = text->s + text->size;
457         r_array_setlength(pCC->parseinfo, 0);
458         if (rex_cc_parse(pCC) == 0) {
459                 for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
460                         pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
461                         if (rex_cc_load_pattern(pCC, &pi->regex, i) < 0) {
462                                 return -1;
463                         }
464
465 #if 0
466                         fwrite(pi->id.s, 1, pi->id.size, stdout);
467                         fprintf(stdout, " : ");
468                         fwrite(pi->regex.s, 1, pi->regex.size, stdout);
469                         fprintf(stdout, "\n");
470 #endif
471                 }
472         }
473         return ret;
474 }