2 * Regular Pattern Analyzer Toolkit (RPA/Tk)
3 * Copyright (c) 2009-2012 Martin Stoilov
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 * Martin Stoilov <martin@rpasearch.com>
28 #include "rlib/rutf.h"
29 #include "rlib/rmem.h"
33 static rexdfa_t *tokens_dfa = NULL;
47 #define REXCC_TOKEN_NONE 0
48 #define REXCC_TOKEN_DELIMITER 1
49 #define REXCC_TOKEN_IDENTIFIER 2
50 #define REXCC_TOKEN_SPACE 3
51 #define REXCC_TOKEN_CR 4
52 #define REXCC_TOKEN_REGEX 5
55 static struct tokeninfo_s tokens[] = {
56 {REXCC_TOKEN_DELIMITER, "delimiter", "%%[ \\t]?[\\r]?[\\n]"},
57 {REXCC_TOKEN_IDENTIFIER, "identifier", "([^\\t\\r\\n\'\" ]+|\"([^\"\\n]|\\\\\")*\")+|'.+'"},
58 {REXCC_TOKEN_SPACE, "space", "[ \\t]+"},
59 {REXCC_TOKEN_CR, "cr ", "[\\r]?[\\n]"},
60 {REXCC_TOKEN_REGEX, "regex", "[ \\t](\\\\[\\r]?[\\n]|.)+\\r?\\n"},
66 rexdfa_t * rex_cc_tokensdfa()
70 struct tokeninfo_s *ti = tokens;
71 rexdb_t *nfadb, *dfadb;
72 nfadb = rex_db_create(REXDB_TYPE_NFA);
74 start = rex_db_addexpression_s(nfadb, start, ti->regex, (rexuserdata_t)ti);
76 rex_db_destroy(nfadb);
81 dfadb = rex_db_createdfa(nfadb, start);
83 tokens_dfa = rex_db_todfa(dfadb, 0);
85 rex_db_destroy(dfadb);
86 rex_db_destroy(nfadb);
92 rexcc_t *rex_cc_create()
96 pCC = (rexcc_t *)r_malloc(sizeof(*pCC));
99 r_memset(pCC, 0, sizeof(*pCC));
100 pCC->parseinfo = r_array_create(sizeof(struct parseinfo_s));
101 pCC->nfa = rex_db_create(REXDB_TYPE_NFA);
107 void rex_cc_destroy(rexcc_t *pCC)
111 r_array_destroy(pCC->parseinfo);
112 rex_db_destroy(pCC->nfa);
113 rex_dfa_destroy(pCC->dfa);
117 rex_dfa_destroy(tokens_dfa);
123 int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata)
125 pCC->startuid = rex_db_addexpression(pCC->nfa, pCC->startuid, buf->s, buf->size, userdata);
126 if (pCC->startuid < 0) {
133 int rex_cc_vfprintf(FILE *out, int indent, const char *format, va_list ap)
139 return vfprintf(out, format, ap);
143 int rex_cc_fprintf(FILE *out, int indent, const char *format, ...)
148 va_start(ap, format);
149 ret = rex_cc_vfprintf(out, indent, format, ap);
155 static int rex_cc_output_statesubstates(rexcc_t *pCC, FILE *out, long nstate)
158 rexdfa_t *dfa = pCC->dfa;
159 rexdfs_t *s = rex_dfa_state(dfa, nstate);
161 struct parseinfo_s *pi;
163 for (i = 0; i < s->nsubstates; i++) {
164 ss = rex_dfa_substate(dfa, nstate, i);
165 if (ss->type == REX_STATETYPE_ACCEPT) {
166 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
167 pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
168 r_memset(pCC->temp, 0, pi->id.size + 1);
169 r_memcpy(pCC->temp, pi->id.s, pi->id.size);
170 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
172 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)ss->type, (unsigned long)ss->uid, (unsigned long)0);
179 static int rex_cc_output_substates(rexcc_t *pCC, FILE *out)
182 rexdfa_t *dfa = pCC->dfa;
184 rex_cc_fprintf(out, 0, "static rexdfss_t substates[] = {\n");
185 for (i = 0; i < dfa->nstates; i++)
186 rex_cc_output_statesubstates(pCC, out, i);
187 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
188 rex_cc_fprintf(out, 0, "};\n");
193 static int rex_cc_output_stateaccsubstates(rexcc_t *pCC, FILE *out, long nstate)
196 rexdfa_t *dfa = pCC->dfa;
197 rexdfs_t *s = rex_dfa_state(dfa, nstate);
199 struct parseinfo_s *pi;
201 for (i = 0; i < s->naccsubstates; i++) {
202 ss = rex_dfa_accsubstate(dfa, nstate, i);
203 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, ss->userdata);
204 pCC->temp = r_realloc(pCC->temp, pi->id.size + 1);
205 r_memset(pCC->temp, 0, pi->id.size + 1);
206 r_memcpy(pCC->temp, pi->id.s, pi->id.size);
207 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, (rexuserdata_t)(%s) },\n", (unsigned long)ss->type, (unsigned long)ss->uid, pCC->temp);
213 static int rex_cc_output_accsubstates(rexcc_t *pCC, FILE *out)
216 rexdfa_t *dfa = pCC->dfa;
218 rex_cc_fprintf(out, 0, "static rexdfss_t accsubstates[] = {\n");
219 for (i = 0; i < dfa->nstates; i++)
220 rex_cc_output_stateaccsubstates(pCC, out, i);
221 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
222 rex_cc_fprintf(out, 0, "};\n");
227 static int rex_cc_output_statetransitions(rexcc_t *pCC, FILE *out, long nstate)
230 rexdfa_t *dfa = pCC->dfa;
231 rexdfs_t *s = rex_dfa_state(dfa, nstate);
233 for (i = 0; i < s->ntrans; i++) {
234 t = rex_dfa_transition(dfa, nstate, i);
235 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", (unsigned long)t->lowin, (unsigned long)t->highin, (unsigned long)t->state);
241 static int rex_cc_output_transitions(rexcc_t *pCC, FILE *out)
244 rexdfa_t *dfa = pCC->dfa;
246 rex_cc_fprintf(out, 0, "static rexdft_t transitions[] = {\n");
247 for (i = 0; i < dfa->nstates; i++)
248 rex_cc_output_statetransitions(pCC, out, i);
249 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu },\n", 0UL, 0UL, 0UL);
250 rex_cc_fprintf(out, 0, "};\n");
255 static int rex_cc_output_states(rexcc_t *pCC, FILE *out)
259 rexdfa_t *dfa = pCC->dfa;
261 rex_cc_fprintf(out, 0, "static rexdfs_t states[] = {\n");
262 for (i = 0; i < dfa->nstates; i++) {
263 s = rex_dfa_state(dfa, i);
264 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n",
265 (unsigned long)s->type, (unsigned long)s->trans, (unsigned long)s->ntrans, (unsigned long)s->accsubstates,
266 (unsigned long)s->naccsubstates, (unsigned long)s->substates, (unsigned long)s->nsubstates);
269 rex_cc_fprintf(out, 1, "{ %16lu, %16lu, %16lu, %16lu, %16lu, %16lu , %16lu},\n", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
270 rex_cc_fprintf(out, 0, "};\n");
275 static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
277 rexdfa_t *dfa = pCC->dfa;
279 rex_cc_fprintf(out, 0, "static rexdfa_t ccdfa = {\n");
280 rex_cc_fprintf(out, 1, "%lu,\n", dfa->nstates);
281 rex_cc_fprintf(out, 1, "%s,\n", "states");
282 rex_cc_fprintf(out, 1, "%lu,\n", dfa->ntrans);
283 rex_cc_fprintf(out, 1, "%s,\n", "transitions");
284 rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
285 rex_cc_fprintf(out, 1, "%s,\n", "accsubstates");
286 rex_cc_fprintf(out, 1, "%lu,\n", dfa->nsubstates);
287 rex_cc_fprintf(out, 1, "%s,\n", "substates");
288 rex_cc_fprintf(out, 1, "{0, },\n");
289 rex_cc_fprintf(out, 0, "};\n");
295 int rex_cc_output(rexcc_t *pCC, FILE *outc)
299 rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n");
300 if (pCC->prolog.size) {
301 fwrite(pCC->prolog.s, 1, pCC->prolog.size, outc);
302 rex_cc_fprintf(outc, 0, "\n");
304 rex_cc_output_accsubstates(pCC, outc);
305 rex_cc_fprintf(outc, 0, "\n\n");
306 rex_cc_output_substates(pCC, outc);
307 rex_cc_fprintf(outc, 0, "\n\n");
308 rex_cc_output_transitions(pCC, outc);
309 rex_cc_fprintf(outc, 0, "\n\n");
310 rex_cc_output_states(pCC, outc);
311 rex_cc_fprintf(outc, 0, "\n\n");
312 rex_cc_output_dfa(pCC, outc);
313 if (pCC->epilog.size) {
314 rex_cc_fprintf(outc, 0, "\n");
315 fwrite(pCC->epilog.s, 1, pCC->epilog.size, outc);
322 int rex_cc_gettoken(rexcc_t *pCC)
326 long nstate = REX_DFA_STARTSTATE;
327 const char *input = pCC->input;
328 rexdfa_t *dfa = rex_cc_tokensdfa();
330 rexdfss_t *ss = NULL;
340 while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)pCC->end)) > 0) {
341 if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) <= 0)
344 s = REX_DFA_STATE(dfa, nstate);
345 ss = REX_DFA_ACCSUBSTATE(dfa, nstate, 0);
346 if (s->type == REX_STATETYPE_ACCEPT) {
347 pCC->token = ((struct tokeninfo_s *)ss->userdata)->id;
348 ret = input - pCC->input;
349 if (ss && ((struct tokeninfo_s *)ss->userdata)->id == 1) {
355 pCC->tokenptr = pCC->input;
363 static int rex_cc_parseid(rexcc_t *pCC, struct parseinfo_s *info)
365 info->id.s = pCC->tokenptr;
366 info->id.size = pCC->tokenlen;
367 rex_cc_gettoken(pCC);
372 static int rex_cc_parseregex(rexcc_t *pCC, struct parseinfo_s *info)
374 info->regex.s = pCC->tokenptr;
375 info->regex.size = pCC->tokenlen;
376 rex_cc_gettoken(pCC);
381 static int rex_cc_parseline(rexcc_t *pCC)
383 struct parseinfo_s info;
385 r_memset(&info, 0, sizeof(info));
386 if (rex_cc_parseid(pCC, &info) < 0)
388 if (pCC->token != REXCC_TOKEN_REGEX) {
394 if (rex_cc_parseregex(pCC, &info) < 0)
396 r_array_add(pCC->parseinfo, &info);
401 int rex_cc_parse(rexcc_t *pCC)
403 pCC->prolog.s = pCC->input;
404 pCC->prolog.size = 0;
405 while (pCC->input + 3 < pCC->end) {
406 if (*pCC->input == '%' && *(pCC->input+1) == '%' && (*(pCC->input+2) == '\n' || (*(pCC->input+2) == '\r' && *(pCC->input+3) == '\n')))
408 pCC->prolog.size += 1;
411 rex_cc_gettoken(pCC);
412 if (pCC->token != REXCC_TOKEN_DELIMITER)
414 rex_cc_gettoken(pCC);
416 if (pCC->token == REXCC_TOKEN_CR || pCC->token == REXCC_TOKEN_SPACE || pCC->token == REXCC_TOKEN_REGEX) {
417 rex_cc_gettoken(pCC);
418 } else if (pCC->token == REXCC_TOKEN_IDENTIFIER) {
419 rex_cc_parseline(pCC);
420 } else if (pCC->token == REXCC_TOKEN_DELIMITER) {
421 rex_cc_gettoken(pCC);
429 pCC->epilog.s = pCC->input;
430 pCC->epilog.size = pCC->end - pCC->input;
436 int rex_cc_load_buffer(rexcc_t *pCC, rbuffer_t *text)
439 struct parseinfo_s *pi;
441 pCC->input = text->s;
442 pCC->end = text->s + text->size;
443 r_array_setlength(pCC->parseinfo, 0);
444 if (rex_cc_parse(pCC) == 0) {
445 for (i = 0; i < r_array_length(pCC->parseinfo); i++) {
446 pi = (struct parseinfo_s *)r_array_slot(pCC->parseinfo, i);
447 if (rex_cc_load_pattern(pCC, &pi->regex, i) < 0) {
451 fwrite(pi->id.s, 1, pi->id.size, stdout);
452 fprintf(stdout, " : ");
453 fwrite(pi->regex.s, 1, pi->regex.size, stdout);
454 fprintf(stdout, "\n");