RPA Toolkit
0eb7fb65c5f760405f957a71cf05a6deb83bb7d0
[rpatk.git] / rex / rexcompiler.c
1 #include <ctype.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "rlib/rutf.h"
6 #include "rlib/rmem.h"
7 #include "rex/rexcompiler.h"
8 #include "rex/rextransition.h"
9 #include "rex/rexstate.h"
10 #include "rex/rexfragment.h"
11
12
13 struct rexcompiler_s {
14         rarray_t *stack;
15         rarray_t *temptrans;
16         const char *start;
17         const char *end;
18         const char *ptr;
19         const char *tokenptr;
20         int token;
21         rexchar_t numtoken;
22         unsigned long fromcount;
23         unsigned long tocount;
24         char tokenstr[REX_COMPILER_TOKENSIZE];
25 };
26
27
28 #define FPUSH(__co__, __fragment__) do { r_array_push((__co__)->stack, __fragment__, rexfragment_t*); } while (0)
29 #define FPOP(__co__) r_array_pop((__co__)->stack, rexfragment_t*)
30 #define FLEN(__co__) r_array_length((__co__)->stack)
31
32 static int rex_compiler_catexpression(rexcompiler_t *co, rexdb_t *rexdb);
33 static int rex_compiler_altexpression(rexcompiler_t *co, rexdb_t *rexdb);
34
35 enum {
36         REX_TOKEN_ERR = -1,
37         REX_TOKEN_EOF = -2,
38 };
39
40 rexcompiler_t *rex_compiler_create()
41 {
42         rexcompiler_t *co;
43
44         co = (rexcompiler_t *)r_malloc(sizeof(*co));
45         r_memset(co, 0, sizeof(*co));
46         co->stack = r_array_create(sizeof(rexfragment_t*));
47         co->temptrans = r_array_create(sizeof(rex_transition_t));
48         return co;
49 }
50
51
52 void rex_compiler_destroy(rexcompiler_t *co)
53 {
54         if (co) {
55                 r_array_destroy(co->stack);
56                 r_array_destroy(co->temptrans);
57                 r_free(co);
58         }
59
60 }
61
62
63 static int rex_compiler_isspace(int c)
64 {
65         return isspace(c);
66 }
67
68
69 static int rex_compiler_isblank(int c)
70 {
71         return isblank(c);
72 }
73
74
75 static int rex_compiler_getchar(rexcompiler_t *co)
76 {
77         ruint32 wc = REX_TOKEN_EOF;
78         int inc = 0;
79
80         inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
81         if (inc <= 0)
82                 return REX_TOKEN_EOF;
83         if (wc == '\r' || wc == '\n') {
84                 return REX_TOKEN_EOF;
85         }
86         co->ptr += inc;
87         return wc;
88 }
89
90
91 static int rex_compiler_escapedchar(int c)
92 {
93         switch (c) {
94         case 'r':
95                 return '\r';
96         case 'n':
97                 return '\n';
98         case 't':
99                 return '\t';
100         case '0':
101                 return '\0';
102         case 'f':
103                 return '\f';
104         case 'v':
105                 return '\v';
106         case '\"':
107                 return '\"';
108         case '\'':
109                 return '\'';
110         default:
111                 return c;
112         }
113         return c;
114 }
115
116
117 static int rex_compiler_gettok(rexcompiler_t *co)
118 {
119         int c;
120
121         co->tokenptr = co->ptr;
122         c = rex_compiler_getchar(co);
123         if (c == REX_TOKEN_EOF) {
124                 co->tokenstr[0] = '\0';
125                 co->token = REX_TOKEN_EOF;
126                 return co->token;
127         } else {
128                 co->tokenstr[0] = c;
129                 co->tokenstr[1] = '\0';
130                 co->token = c;
131         }
132         return co->token;
133 }
134
135
136 /*
137  * Don't do anything if this is not an escaped token
138  */
139 static void rex_compiler_adjustescapedtoken(rexcompiler_t *co)
140 {
141         if (co->token == '\\') {
142                 rex_compiler_gettok(co);                /* eat '\\' */
143                 if (co->token == REX_TOKEN_EOF) {
144                         return;
145                 }
146                 co->token = rex_compiler_escapedchar(co->token);
147         }
148 }
149
150
151 static int rex_compiler_getnbtok(rexcompiler_t *co)
152 {
153 again:
154         while (co->ptr < co->end && rex_compiler_isblank(*co->ptr))
155                 co->ptr += 1;
156         if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
157                 co->ptr += 2;
158                 goto again;
159         }
160         if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') {
161                 co->ptr += 3;
162                 goto again;
163         }
164
165         return rex_compiler_gettok(co);
166
167 }
168
169
170 static int rex_compiler_getnstok(rexcompiler_t *co)
171 {
172 again:
173         while (co->ptr < co->end && rex_compiler_isspace(*co->ptr))
174                 co->ptr += 1;
175         if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
176                 co->ptr += 2;
177                 goto again;
178         }
179         if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') {
180                 co->ptr += 3;
181                 goto again;
182         }
183
184         return rex_compiler_gettok(co);
185 }
186
187
188 #define REX_CONV_BUFSIZE 128
189 int rex_compiler_numerictoken(rexcompiler_t *co, rexdb_t *rexdb)
190 {
191         char convbuf[REX_CONV_BUFSIZE + 1];
192         int i = 0;
193         int base = 0;
194
195         rex_compiler_gettok(co); /* Eat # */
196         if (co->token == 'x' || co->token == 'X') {
197                 rex_compiler_gettok(co);
198                 base = 16;
199         } else if (co->token == '0') {
200                 rex_compiler_gettok(co);
201                 convbuf[i++] = '0';
202                 if (co->token == 'x' || co->token == 'X') {
203                         rex_compiler_gettok(co);
204                         i = 0;
205                         base = 16;
206                 } else {
207                         base = 8;
208                 }
209         } else if (!isdigit(co->token)) {
210                 return -1;
211         }
212         while ((base == 16 && isxdigit(co->token)) || (base == 8 && isdigit(co->token) && co->token != '8' && co->token != '9') || isdigit(co->token)) {
213                 if (i >= REX_CONV_BUFSIZE)
214                         return -1;
215                 convbuf[i++] = co->token;
216                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
217                         break;
218         }
219         convbuf[i++] = 0;
220         co->numtoken = strtoul(convbuf, NULL, base);
221         return 0;
222 }
223
224
225 int rex_compiler_charclass(rexcompiler_t *co, rexdb_t *rexdb)
226 {
227         int negative = 0;
228         int low;
229         int high;
230         long i;
231         rex_transition_t *t;
232         rexstate_t *srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
233         rexfragment_t *frag = NULL;
234
235         rex_compiler_gettok(co);
236         if (co->token == '^') {
237                 negative = 1;
238                 rex_compiler_gettok(co);
239         }
240
241         while (1) {
242                 if (co->token == '[' || co->token == ']' || co->token == '-' ||  co->token == REX_TOKEN_EOF)
243                         return -1;
244                 if (co->token == '#') {
245                         if (rex_compiler_numerictoken(co, rexdb) < 0)
246                                 return -1;
247                         high = low = co->numtoken;
248                 } else {
249                         rex_compiler_adjustescapedtoken(co);
250                         high = low = co->token;
251                         rex_compiler_gettok(co);
252                 }
253                 if (co->token == '-') {
254                         rex_compiler_gettok(co);
255                         switch (co->token) {
256                         case '[':
257                         case ']':
258                         case '-':
259                         case REX_TOKEN_EOF:
260                         case REX_TOKEN_ERR:
261                                 return -1;
262                                 break;
263                         }
264                         if (co->token == '#') {
265                                 if (rex_compiler_numerictoken(co, rexdb) < 0)
266                                         return -1;
267                                 high = co->numtoken;
268                         } else {
269                                 rex_compiler_adjustescapedtoken(co);
270                                 high = co->token;
271                                 rex_compiler_gettok(co);
272                         }
273                 }
274                 rex_state_addtransition(srcstate, low, high, -1);
275                 if (co->token == ']') {
276                         rex_compiler_getnbtok(co); /* eat it */
277                         break;
278                 }
279         }
280         rex_transitions_normalize(srcstate->trans);
281         if (negative) {
282                 r_array_setlength(co->temptrans, 0);
283                 rex_transitions_negative(co->temptrans, srcstate->trans, srcstate->uid, -1);
284                 r_array_setlength(srcstate->trans, 0);
285                 for (i = 0; i < r_array_length(co->temptrans); i++) {
286                         t = (rex_transition_t *)r_array_slot(co->temptrans, i);
287                         r_array_add(srcstate->trans, t);
288                 }
289                 rex_transitions_normalize(srcstate->trans);
290         }
291         frag = rex_fragment_create(srcstate);
292         FPUSH(co, frag);
293
294         return 0;
295 }
296
297
298 static int rex_compiler_factor(rexcompiler_t *co, rexdb_t *rexdb)
299 {
300         rexstate_t *srcstate;
301         rexfragment_t *frag;
302
303         if (co->token == '[') {
304                 return rex_compiler_charclass(co, rexdb);
305         } else if (co->token == '(') {
306                 rex_compiler_getnbtok(co);              /* eat '(' */
307                 if (co->token == ')')
308                         return -1;
309                 if (rex_compiler_altexpression(co, rexdb) < 0)
310                         return -1;
311                 if (co->token != ')')
312                         return -1;
313                 rex_compiler_getnbtok(co);              /* eat ')' */
314                 return 0;
315         } else if (co->token == '.') {
316                 srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
317                 rex_compiler_adjustescapedtoken(co);
318                 rex_state_addtransition(srcstate, 0, '\n' - 1, -1);
319                 rex_state_addtransition(srcstate, '\n' + 1, REX_CHAR_MAX, -1);
320                 frag = rex_fragment_create(srcstate);
321                 FPUSH(co, frag);
322                 rex_compiler_getnbtok(co);
323                 return 0;
324         } else {
325                 srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
326                 rex_compiler_adjustescapedtoken(co);
327                 rex_state_addtransition(srcstate, co->token, co->token, -1);
328                 frag = rex_fragment_create(srcstate);
329                 FPUSH(co, frag);
330                 rex_compiler_getnbtok(co);
331                 return 0;
332         }
333
334         return -1;
335 }
336
337
338 static int rex_compiler_parsecount(rexcompiler_t *co, rexdb_t *rexdb)
339 {
340         char convbuf[REX_CONV_BUFSIZE + 1];
341         int i = 0;
342
343         rex_compiler_getnbtok(co);              /* eat '{' */
344         if (!isdigit(co->token))
345                 return -1;
346         for (i = 0; isdigit(co->token); i++) {
347                 if (i >= REX_CONV_BUFSIZE)
348                         return -1;
349                 convbuf[i] = co->token;
350                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
351                         return -1;
352         }
353         convbuf[i++] = 0;
354         co->fromcount = strtoul(convbuf, NULL, 10);
355         co->tocount = co->fromcount;
356         if (rex_compiler_isblank(co->token)) {
357                 rex_compiler_getnbtok(co);
358         }
359         if (co->token == ',') {
360                 rex_compiler_getnbtok(co);              /* eat ',' */
361                 co->tocount = (unsigned long)-1;
362                 if (isdigit(co->token)) {
363                         for (i = 0; isdigit(co->token); i++) {
364                                 if (i >= REX_CONV_BUFSIZE)
365                                         return -1;
366                                 convbuf[i] = co->token;
367                                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
368                                         return -1;
369                         }
370                         convbuf[i++] = 0;
371                         co->tocount = strtoul(convbuf, NULL, 10);
372                         if (rex_compiler_isblank(co->token)) {
373                                 rex_compiler_getnbtok(co);
374                         }
375                 }
376         }
377         if (co->token != '}')
378                 return -1;
379         rex_compiler_getnbtok(co);              /* eat '}' */
380         return 0;
381 }
382
383
384 static int rex_compiler_qfactor(rexcompiler_t *co, rexdb_t *rexdb)
385 {
386         rexfragment_t *frag, *frag1, *frag2;
387         const char *fstart, *fend;
388         rexstate_t *srcstate;
389         long nstates, i;
390
391         if (strchr("{}*?+]\n\r)", co->token)) {
392                 /*
393                  * Unexpected char.
394                  */
395                 return -1;
396         }
397         nstates = r_array_length(rexdb->states);
398         fstart = co->tokenptr;
399         if (rex_compiler_factor(co, rexdb) < 0) {
400                 return -1;
401         }
402         fend = co->tokenptr;
403         switch (co->token) {
404         case '*':
405                 rex_compiler_getnbtok(co); /* Eat it */
406                 frag = FPOP(co);
407                 frag = rex_fragment_mop(rexdb, frag);
408                 FPUSH(co, frag);
409                 break;
410         case '?':
411                 rex_compiler_getnbtok(co); /* Eat it */
412                 frag = FPOP(co);
413                 frag = rex_fragment_opt(rexdb, frag);
414                 FPUSH(co, frag);
415                 break;
416         case '+':
417                 rex_compiler_getnbtok(co); /* Eat it */
418                 frag = FPOP(co);
419                 frag = rex_fragment_mul(rexdb, frag);
420                 FPUSH(co, frag);
421                 break;
422         case '{':
423                 if (rex_compiler_parsecount(co, rexdb) < 0)
424                         return -1;
425                 if (co->fromcount > co->tocount)
426                         return -1;
427                 if (co->fromcount == 0 && co->tocount == (unsigned long)-1) {
428                         frag = FPOP(co);
429                         frag = rex_fragment_mop(rexdb, frag);
430                         FPUSH(co, frag);
431                         break;
432                 }
433                 if (co->fromcount == 1 && co->tocount == (unsigned long)-1) {
434                         frag = FPOP(co);
435                         frag = rex_fragment_mul(rexdb, frag);
436                         FPUSH(co, frag);
437                         break;
438                 }
439                 if (co->fromcount == 0 && co->tocount == 0) {
440                         /*
441                          * Special case
442                          */
443                         frag = FPOP(co);
444                         for (i = nstates; i < r_array_length(rexdb->states); i++) {
445                                 srcstate = rex_db_getstate(rexdb, i);
446                                 rex_state_destroy(srcstate);
447                         }
448                         r_array_setlength(rexdb->states, nstates);
449                         srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
450                         rex_compiler_adjustescapedtoken(co);
451                         rex_state_addtransition_e(srcstate, -1);
452                         frag = rex_fragment_create(srcstate);
453                         FPUSH(co, frag);
454                         break;
455                 }
456                 if (co->fromcount == 0) {
457                         frag = FPOP(co);
458                         frag = rex_fragment_opt(rexdb, frag);
459                         FPUSH(co, frag);
460                 }
461                 do {
462                         long count;
463                         const char *saved_start = co->start;
464                         const char *saved_end = co->end;
465                         const char *saved_ptr = co->ptr;
466                         const char *saved_tokenptr = co->tokenptr;
467                         int saved_token = co->token;
468                         for (count = 1; count < co->tocount; count++) {
469                                 co->start = fstart;
470                                 co->end = fend;
471                                 co->ptr = fstart;
472                                 rex_compiler_getnbtok(co);
473                                 rex_compiler_catexpression(co, rexdb);
474                                 frag2 = FPOP(co);
475                                 if (co->tocount == (unsigned long)-1 && count >= co->fromcount) {
476                                         frag2 = rex_fragment_mop(rexdb, frag2);
477                                         frag1 = FPOP(co);
478                                         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
479                                         FPUSH(co, frag1);
480                                         break;
481                                 }
482                                 if (count >= co->fromcount) {
483                                         frag2 = rex_fragment_opt(rexdb, frag2);
484                                 }
485                                 frag1 = FPOP(co);
486                                 frag1 = rex_fragment_cat(rexdb, frag1, frag2);
487                                 FPUSH(co, frag1);
488                         }
489 //                      fwrite(fstart, 1, fend - fstart, stdout);
490 //                      fprintf(stdout, "{%ld,%ld}\n", co->fromcount, co->tocount);
491                         co->start = saved_start;
492                         co->end = saved_end;
493                         co->ptr = saved_ptr;
494                         co->tokenptr = saved_tokenptr;
495                         co->token = saved_token;
496                 } while (0);
497                 break;
498         }
499         return 0;
500 }
501
502
503 static int rex_compiler_catexpression(rexcompiler_t *co, rexdb_t *rexdb)
504 {
505         int nfactors = 0;
506         rexfragment_t *frag1;
507         rexfragment_t *frag2;
508
509         if (co->token == REX_TOKEN_EOF)
510                 return -1;
511         while (1) {
512                 switch (co->token) {
513                 case REX_TOKEN_EOF:
514                 case '|':
515                 case ')':
516                         return 0;
517                 }
518                 if (rex_compiler_qfactor(co, rexdb) < 0)
519                         return -1;
520                 ++nfactors;
521                 if (nfactors >= 2) {
522                         frag2 = FPOP(co);
523                         frag1 = FPOP(co);
524                         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
525                         FPUSH(co, frag1);
526                 }
527         }
528         return 0;
529 }
530
531
532 static int rex_compiler_altexpression(rexcompiler_t *co, rexdb_t *rexdb)
533 {
534         rexfragment_t *frag1;
535         rexfragment_t *frag2;
536
537         if (rex_compiler_catexpression(co, rexdb) < 0)
538                 return -1;
539         while (co->token == '|') {
540                 rex_compiler_getnstok(co); /* Eat '|' */
541                 switch (co->token) {
542                 case REX_TOKEN_EOF:
543                 case '|':
544                 case ')':
545                 case ']':
546                         return -1;
547                 }
548                 if (rex_compiler_catexpression(co, rexdb) < 0)
549                         return -1;
550                 frag2 = FPOP(co);
551                 frag1 = FPOP(co);
552                 frag1 = rex_fragment_alt(rexdb, frag1, frag2);
553                 FPUSH(co, frag1);
554         }
555         return 0;
556 }
557
558
559 long rex_compiler_expression(rexcompiler_t *co, rexdb_t *rexdb, const char *str, unsigned int size, rexuserdata_t userdata)
560 {
561         long curlen = r_array_length(rexdb->states);
562         long i;
563         long ret = -1L;
564         rexfragment_t *frag1 = NULL, *frag2 = NULL;
565
566         co->start = str;
567         co->end = str + size;
568         co->ptr = str;
569
570         rex_compiler_getnbtok(co);
571         if (rex_compiler_altexpression(co, rexdb) < 0)
572                 goto error;
573         frag1 = FPOP(co);
574         frag2 = rex_fragment_create(rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_ACCEPT)));
575         rex_state_setuserdata(REX_FRAG_STATE(frag2), userdata);
576         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
577         ret = REX_FRAG_STATEUID(frag1);
578         REX_FRAG_STATE(frag1)->type = REX_STATETYPE_START;
579         rex_fragment_destroy(frag1);
580         return ret;
581 error:
582         if (curlen < r_array_length(rexdb->states)) {
583                 for (i = curlen; i < r_array_length(rexdb->states); i++)
584                         rex_state_destroy(rex_db_getstate(rexdb, i));
585                 r_array_setlength(rexdb->states, curlen);
586         }
587         while (FLEN(co)) {
588                 rex_fragment_destroy(FPOP(co));
589         }
590         return -1;
591 }
592
593
594 long rex_compiler_expression_s(rexcompiler_t *co, rexdb_t *rexdb, const char *str, rexuserdata_t userdata)
595 {
596         return rex_compiler_expression(co, rexdb, str, r_strlen(str), userdata);
597 }
598
599
600 long rex_compiler_addexpression(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, unsigned int size, rexuserdata_t userdata)
601 {
602         rexstate_t *sprev = NULL, *scur = NULL;
603         long cur = rex_compiler_expression(co, rexdb, str, size, userdata);
604         if (cur < 0)
605                 return -1;
606         sprev = rex_db_getstate(rexdb, prev);
607         scur = rex_db_getstate(rexdb, cur);
608         if (!sprev)
609                 return scur->uid;
610         rex_state_addtransition_e_dst(sprev, scur);
611         scur->type = REX_STATETYPE_NONE;
612         sprev->type = REX_STATETYPE_START;
613         return prev;
614 }
615
616
617 long rex_compiler_addexpression_s(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, rexuserdata_t userdata)
618 {
619         return rex_compiler_addexpression(co, rexdb, prev, str, r_strlen(str), userdata);
620 }