RPA Toolkit
b37bdb5cd6020fb03ec6e6a46f66a30a4a389913
[rpatk.git] / rex / rexcompiler.c
1 #include <ctype.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "rlib/rutf.h"
6 #include "rlib/rmem.h"
7 #include "rex/rexcompiler.h"
8 #include "rex/rextransition.h"
9 #include "rex/rexstate.h"
10 #include "rex/rexfragment.h"
11
12
13 struct rexcompiler_s {
14         rarray_t *stack;
15         rarray_t *temptrans;
16         const char *start;
17         const char *end;
18         const char *ptr;
19         const char *tokenptr;
20         int token;
21         rexchar_t numtoken;
22         unsigned long fromcount;
23         unsigned long tocount;
24         char tokenstr[REX_COMPILER_TOKENSIZE];
25 };
26
27
28 #define FPUSH(__co__, __fragment__) do { r_array_push((__co__)->stack, __fragment__, rexfragment_t*); } while (0)
29 #define FPOP(__co__) r_array_pop((__co__)->stack, rexfragment_t*)
30 #define FLEN(__co__) r_array_length((__co__)->stack)
31
32 static int rex_compiler_catexpression(rexcompiler_t *co, rexdb_t *rexdb);
33 static int rex_compiler_altexpression(rexcompiler_t *co, rexdb_t *rexdb);
34
35 enum {
36         REX_TOKEN_ERR = -1,
37         REX_TOKEN_EOF = -2,
38 };
39
40 rexcompiler_t *rex_compiler_create()
41 {
42         rexcompiler_t *co;
43
44         co = (rexcompiler_t *)r_malloc(sizeof(*co));
45         r_memset(co, 0, sizeof(*co));
46         co->stack = r_array_create(sizeof(rexfragment_t*));
47         co->temptrans = r_array_create(sizeof(rex_transition_t));
48         return co;
49 }
50
51
52 void rex_compiler_destroy(rexcompiler_t *co)
53 {
54         if (co) {
55                 r_array_destroy(co->stack);
56                 r_array_destroy(co->temptrans);
57                 r_free(co);
58         }
59
60 }
61
62
63 static int rex_compiler_isspace(int c)
64 {
65         return isspace(c);
66 }
67
68
69 static int rex_compiler_isblank(int c)
70 {
71         return isblank(c);
72 }
73
74
75 static int rex_compiler_getchar(rexcompiler_t *co)
76 {
77         ruint32 wc = REX_TOKEN_EOF;
78         int inc = 0;
79
80 again:
81         if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
82                 co->ptr += 2;
83                 goto again;
84         }
85         if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') {
86                 co->ptr += 3;
87                 goto again;
88         }
89         inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
90         if (inc <= 0)
91                 return REX_TOKEN_EOF;
92         if (wc == '\r' || wc == '\n') {
93                 return REX_TOKEN_EOF;
94         }
95         co->ptr += inc;
96         return wc;
97 }
98
99
100 static int rex_compiler_escapedchar(int c)
101 {
102         switch (c) {
103         case 'r':
104                 return '\r';
105         case 'n':
106                 return '\n';
107         case 't':
108                 return '\t';
109         case '0':
110                 return '\0';
111         case 'f':
112                 return '\f';
113         case 'v':
114                 return '\v';
115         case '\"':
116                 return '\"';
117         case '\'':
118                 return '\'';
119         default:
120                 return c;
121         }
122         return c;
123 }
124
125
126 static int rex_compiler_gettok(rexcompiler_t *co)
127 {
128         int c;
129
130         co->tokenptr = co->ptr;
131         c = rex_compiler_getchar(co);
132         if (c == REX_TOKEN_EOF) {
133                 co->tokenstr[0] = '\0';
134                 co->token = REX_TOKEN_EOF;
135                 return co->token;
136         } else {
137                 co->tokenstr[0] = c;
138                 co->tokenstr[1] = '\0';
139                 co->token = c;
140         }
141         return co->token;
142 }
143
144
145 /*
146  * Don't do anything if this is not an escaped token
147  */
148 static void rex_compiler_adjustescapedtoken(rexcompiler_t *co)
149 {
150         if (co->token == '\\') {
151                 rex_compiler_gettok(co);                /* eat '\\' */
152                 if (co->token == REX_TOKEN_EOF) {
153                         return;
154                 }
155                 co->token = rex_compiler_escapedchar(co->token);
156         }
157 }
158
159
160 static int rex_compiler_getnbtok(rexcompiler_t *co)
161 {
162         while (co->ptr < co->end && rex_compiler_isblank(*co->ptr))
163                 co->ptr += 1;
164         return rex_compiler_gettok(co);
165
166 }
167
168
169 static int rex_compiler_getnstok(rexcompiler_t *co)
170 {
171         while (co->ptr < co->end && rex_compiler_isspace(*co->ptr))
172                 co->ptr += 1;
173         return rex_compiler_gettok(co);
174 }
175
176
177 #define REX_CONV_BUFSIZE 128
178 int rex_compiler_numerictoken(rexcompiler_t *co, rexdb_t *rexdb)
179 {
180         char convbuf[REX_CONV_BUFSIZE + 1];
181         int i = 0;
182         int base = 0;
183
184         rex_compiler_gettok(co); /* Eat # */
185         if (co->token == 'x' || co->token == 'X') {
186                 rex_compiler_gettok(co);
187                 base = 16;
188         } else if (co->token == '0') {
189                 rex_compiler_gettok(co);
190                 convbuf[i++] = '0';
191                 if (co->token == 'x' || co->token == 'X') {
192                         rex_compiler_gettok(co);
193                         i = 0;
194                         base = 16;
195                 } else {
196                         base = 8;
197                 }
198         } else if (!isdigit(co->token)) {
199                 return -1;
200         }
201         while ((base == 16 && isxdigit(co->token)) || (base == 8 && isdigit(co->token) && co->token != '8' && co->token != '9') || isdigit(co->token)) {
202                 if (i >= REX_CONV_BUFSIZE)
203                         return -1;
204                 convbuf[i++] = co->token;
205                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
206                         break;
207         }
208         convbuf[i++] = 0;
209         co->numtoken = strtoul(convbuf, NULL, base);
210         return 0;
211 }
212
213
214 int rex_compiler_charclass(rexcompiler_t *co, rexdb_t *rexdb)
215 {
216         int negative = 0;
217         int low;
218         int high;
219         long i;
220         rex_transition_t *t;
221         rexstate_t *srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
222         rexfragment_t *frag = NULL;
223
224         rex_compiler_gettok(co);
225         if (co->token == '^') {
226                 negative = 1;
227                 rex_compiler_gettok(co);
228         }
229
230         while (1) {
231                 if (co->token == '[' || co->token == ']' || co->token == '-' ||  co->token == REX_TOKEN_EOF)
232                         return -1;
233                 if (co->token == '#') {
234                         if (rex_compiler_numerictoken(co, rexdb) < 0)
235                                 return -1;
236                         high = low = co->numtoken;
237                 } else {
238                         rex_compiler_adjustescapedtoken(co);
239                         high = low = co->token;
240                         rex_compiler_gettok(co);
241                 }
242                 if (co->token == '-') {
243                         rex_compiler_gettok(co);
244                         switch (co->token) {
245                         case '[':
246                         case ']':
247                         case '-':
248                         case REX_TOKEN_EOF:
249                         case REX_TOKEN_ERR:
250                                 return -1;
251                                 break;
252                         }
253                         if (co->token == '#') {
254                                 if (rex_compiler_numerictoken(co, rexdb) < 0)
255                                         return -1;
256                                 high = co->numtoken;
257                         } else {
258                                 rex_compiler_adjustescapedtoken(co);
259                                 high = co->token;
260                                 rex_compiler_gettok(co);
261                         }
262                 }
263                 rex_state_addtransition(srcstate, low, high, -1);
264                 if (co->token == ']') {
265                         rex_compiler_getnbtok(co); /* eat it */
266                         break;
267                 }
268         }
269         rex_transitions_normalize(srcstate->trans);
270         if (negative) {
271                 r_array_setlength(co->temptrans, 0);
272                 rex_transitions_negative(co->temptrans, srcstate->trans, srcstate->uid, -1);
273                 r_array_setlength(srcstate->trans, 0);
274                 for (i = 0; i < r_array_length(co->temptrans); i++) {
275                         t = (rex_transition_t *)r_array_slot(co->temptrans, i);
276                         r_array_add(srcstate->trans, t);
277                 }
278                 rex_transitions_normalize(srcstate->trans);
279         }
280         frag = rex_fragment_create(srcstate);
281         FPUSH(co, frag);
282
283         return 0;
284 }
285
286
287 static int rex_compiler_factor(rexcompiler_t *co, rexdb_t *rexdb)
288 {
289         rexstate_t *srcstate;
290         rexfragment_t *frag;
291
292         if (co->token == '[') {
293                 return rex_compiler_charclass(co, rexdb);
294         } else if (co->token == '(') {
295                 rex_compiler_getnbtok(co);              /* eat '(' */
296                 if (co->token == ')')
297                         return -1;
298                 if (rex_compiler_altexpression(co, rexdb) < 0)
299                         return -1;
300                 if (co->token != ')')
301                         return -1;
302                 rex_compiler_getnbtok(co);              /* eat ')' */
303                 return 0;
304         } else if (co->token == '.') {
305                 srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
306                 rex_compiler_adjustescapedtoken(co);
307                 rex_state_addtransition(srcstate, 0, '\n' - 1, -1);
308                 rex_state_addtransition(srcstate, '\n' + 1, REX_CHAR_MAX, -1);
309                 frag = rex_fragment_create(srcstate);
310                 FPUSH(co, frag);
311                 rex_compiler_getnbtok(co);
312                 return 0;
313         } else {
314                 srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
315                 rex_compiler_adjustescapedtoken(co);
316                 rex_state_addtransition(srcstate, co->token, co->token, -1);
317                 frag = rex_fragment_create(srcstate);
318                 FPUSH(co, frag);
319                 rex_compiler_getnbtok(co);
320                 return 0;
321         }
322
323         return -1;
324 }
325
326
327 static int rex_compiler_parsecount(rexcompiler_t *co, rexdb_t *rexdb)
328 {
329         char convbuf[REX_CONV_BUFSIZE + 1];
330         int i = 0;
331
332         rex_compiler_getnbtok(co);              /* eat '{' */
333         if (!isdigit(co->token))
334                 return -1;
335         for (i = 0; isdigit(co->token); i++) {
336                 if (i >= REX_CONV_BUFSIZE)
337                         return -1;
338                 convbuf[i] = co->token;
339                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
340                         return -1;
341         }
342         convbuf[i++] = 0;
343         co->fromcount = strtoul(convbuf, NULL, 10);
344         co->tocount = co->fromcount;
345         if (rex_compiler_isblank(co->token)) {
346                 rex_compiler_getnbtok(co);
347         }
348         if (co->token == ',') {
349                 rex_compiler_getnbtok(co);              /* eat ',' */
350                 co->tocount = (unsigned long)-1;
351                 if (isdigit(co->token)) {
352                         for (i = 0; isdigit(co->token); i++) {
353                                 if (i >= REX_CONV_BUFSIZE)
354                                         return -1;
355                                 convbuf[i] = co->token;
356                                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
357                                         return -1;
358                         }
359                         convbuf[i++] = 0;
360                         co->tocount = strtoul(convbuf, NULL, 10);
361                         if (rex_compiler_isblank(co->token)) {
362                                 rex_compiler_getnbtok(co);
363                         }
364                 }
365         }
366         if (co->token != '}')
367                 return -1;
368         rex_compiler_getnbtok(co);              /* eat '}' */
369         return 0;
370 }
371
372
373 static int rex_compiler_qfactor(rexcompiler_t *co, rexdb_t *rexdb)
374 {
375         rexfragment_t *frag, *frag1, *frag2;
376         const char *fstart, *fend;
377         rexstate_t *srcstate;
378
379         if (strchr("{}*?+]\n\r)", co->token)) {
380                 /*
381                  * Unexpected char.
382                  */
383                 return -1;
384         }
385         fstart = co->tokenptr;
386         if (rex_compiler_factor(co, rexdb) < 0) {
387                 return -1;
388         }
389         fend = co->tokenptr;
390         switch (co->token) {
391         case '*':
392                 rex_compiler_getnbtok(co); /* Eat it */
393                 frag = FPOP(co);
394                 frag = rex_fragment_mop(rexdb, frag);
395                 FPUSH(co, frag);
396                 break;
397         case '?':
398                 rex_compiler_getnbtok(co); /* Eat it */
399                 frag = FPOP(co);
400                 frag = rex_fragment_opt(rexdb, frag);
401                 FPUSH(co, frag);
402                 break;
403         case '+':
404                 rex_compiler_getnbtok(co); /* Eat it */
405                 frag = FPOP(co);
406                 frag = rex_fragment_mul(rexdb, frag);
407                 FPUSH(co, frag);
408                 break;
409         case '{':
410                 if (rex_compiler_parsecount(co, rexdb) < 0)
411                         return -1;
412                 if (co->fromcount > co->tocount)
413                         return -1;
414                 if (co->fromcount == 0 && co->tocount == 0) {
415                         /*
416                          * Special case
417                          */
418                         frag = FPOP(co);
419                         srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
420                         rex_compiler_adjustescapedtoken(co);
421                         rex_state_addtransition_e(srcstate, -1);
422                         frag = rex_fragment_create(srcstate);
423                         FPUSH(co, frag);
424                 }
425                 if (co->fromcount == 0) {
426                         frag = FPOP(co);
427                         frag = rex_fragment_opt(rexdb, frag);
428                         FPUSH(co, frag);
429                 }
430                 do {
431                         long count;
432                         const char *saved_start = co->start;
433                         const char *saved_end = co->end;
434                         const char *saved_ptr = co->ptr;
435                         const char *saved_tokenptr = co->tokenptr;
436                         int saved_token = co->token;
437                         for (count = 1; count < co->tocount; count++) {
438                                 co->start = fstart;
439                                 co->end = fend;
440                                 co->ptr = fstart;
441                                 rex_compiler_getnbtok(co);
442                                 rex_compiler_catexpression(co, rexdb);
443                                 frag2 = FPOP(co);
444                                 if (co->tocount == (unsigned long)-1 && count >= co->fromcount) {
445                                         frag2 = rex_fragment_mop(rexdb, frag2);
446                                         frag1 = FPOP(co);
447                                         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
448                                         FPUSH(co, frag1);
449                                         break;
450                                 }
451                                 if (count >= co->fromcount) {
452                                         frag2 = rex_fragment_opt(rexdb, frag2);
453                                 }
454                                 frag1 = FPOP(co);
455                                 frag1 = rex_fragment_cat(rexdb, frag1, frag2);
456                                 FPUSH(co, frag1);
457                         }
458 //                      fwrite(fstart, 1, fend - fstart, stdout);
459 //                      fprintf(stdout, "{%ld,%ld}\n", co->fromcount, co->tocount);
460                         co->start = saved_start;
461                         co->end = saved_end;
462                         co->ptr = saved_ptr;
463                         co->tokenptr = saved_tokenptr;
464                         co->token = saved_token;
465                 } while (0);
466                 break;
467         }
468         return 0;
469 }
470
471
472 static int rex_compiler_catexpression(rexcompiler_t *co, rexdb_t *rexdb)
473 {
474         int nfactors = 0;
475         rexfragment_t *frag1;
476         rexfragment_t *frag2;
477
478         if (co->token == REX_TOKEN_EOF)
479                 return -1;
480         while (1) {
481                 switch (co->token) {
482                 case REX_TOKEN_EOF:
483                 case '|':
484                 case ')':
485                         return 0;
486                 }
487                 if (rex_compiler_qfactor(co, rexdb) < 0)
488                         return -1;
489                 ++nfactors;
490                 if (nfactors >= 2) {
491                         frag2 = FPOP(co);
492                         frag1 = FPOP(co);
493                         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
494                         FPUSH(co, frag1);
495                 }
496         }
497         return 0;
498 }
499
500
501 static int rex_compiler_altexpression(rexcompiler_t *co, rexdb_t *rexdb)
502 {
503         rexfragment_t *frag1;
504         rexfragment_t *frag2;
505
506         if (rex_compiler_catexpression(co, rexdb) < 0)
507                 return -1;
508         while (co->token == '|') {
509                 rex_compiler_getnstok(co); /* Eat '|' */
510                 switch (co->token) {
511                 case REX_TOKEN_EOF:
512                 case '|':
513                 case ')':
514                 case ']':
515                         return -1;
516                 }
517                 if (rex_compiler_catexpression(co, rexdb) < 0)
518                         return -1;
519                 frag2 = FPOP(co);
520                 frag1 = FPOP(co);
521                 frag1 = rex_fragment_alt(rexdb, frag1, frag2);
522                 FPUSH(co, frag1);
523         }
524         return 0;
525 }
526
527
528 long rex_compiler_expression(rexcompiler_t *co, rexdb_t *rexdb, const char *str, unsigned int size, rexuserdata_t userdata)
529 {
530         long curlen = r_array_length(rexdb->states);
531         long i;
532         long ret = -1L;
533         rexfragment_t *frag1 = NULL, *frag2 = NULL;
534
535         co->start = str;
536         co->end = str + size;
537         co->ptr = str;
538
539         rex_compiler_getnbtok(co);
540         if (rex_compiler_altexpression(co, rexdb) < 0)
541                 goto error;
542         frag1 = FPOP(co);
543         frag2 = rex_fragment_create(rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_ACCEPT)));
544         rex_state_setuserdata(REX_FRAG_STATE(frag2), userdata);
545         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
546         ret = REX_FRAG_STATEUID(frag1);
547         REX_FRAG_STATE(frag1)->type = REX_STATETYPE_START;
548         rex_fragment_destroy(frag1);
549         return ret;
550 error:
551         if (curlen < r_array_length(rexdb->states)) {
552                 for (i = curlen; i < r_array_length(rexdb->states); i++)
553                         rex_state_destroy(rex_db_getstate(rexdb, i));
554                 r_array_setlength(rexdb->states, curlen);
555         }
556         while (FLEN(co)) {
557                 rex_fragment_destroy(FPOP(co));
558         }
559         return -1;
560 }
561
562
563 long rex_compiler_expression_s(rexcompiler_t *co, rexdb_t *rexdb, const char *str, rexuserdata_t userdata)
564 {
565         return rex_compiler_expression(co, rexdb, str, r_strlen(str), userdata);
566 }
567
568
569 long rex_compiler_addexpression(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, unsigned int size, rexuserdata_t userdata)
570 {
571         rexstate_t *sprev = NULL, *scur = NULL;
572         long cur = rex_compiler_expression(co, rexdb, str, size, userdata);
573         if (cur < 0)
574                 return -1;
575         sprev = rex_db_getstate(rexdb, prev);
576         scur = rex_db_getstate(rexdb, cur);
577         if (!sprev)
578                 return scur->uid;
579         rex_state_addtransition_e_dst(sprev, scur);
580         scur->type = REX_STATETYPE_NONE;
581         sprev->type = REX_STATETYPE_START;
582         return prev;
583 }
584
585
586 long rex_compiler_addexpression_s(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, rexuserdata_t userdata)
587 {
588         return rex_compiler_addexpression(co, rexdb, prev, str, r_strlen(str), userdata);
589 }