RPA Toolkit
Work on support for number of occurrences - improvements.
[rpatk.git] / rex / rexcompiler.c
1 #include <ctype.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "rlib/rutf.h"
6 #include "rlib/rmem.h"
7 #include "rex/rexcompiler.h"
8 #include "rex/rextransition.h"
9 #include "rex/rexstate.h"
10 #include "rex/rexfragment.h"
11
12
13 struct rexcompiler_s {
14         rarray_t *stack;
15         rarray_t *temptrans;
16         const char *start;
17         const char *end;
18         const char *ptr;
19         const char *tokenptr;
20         int token;
21         rexchar_t numtoken;
22         unsigned long fromcount;
23         unsigned long tocount;
24         char tokenstr[REX_COMPILER_TOKENSIZE];
25 };
26
27
28 #define FPUSH(__co__, __fragment__) do { r_array_push((__co__)->stack, __fragment__, rexfragment_t*); } while (0)
29 #define FPOP(__co__) r_array_pop((__co__)->stack, rexfragment_t*)
30 #define FLEN(__co__) r_array_length((__co__)->stack)
31
32 static int rex_compiler_catexpression(rexcompiler_t *co, rexdb_t *rexdb);
33 static int rex_compiler_altexpression(rexcompiler_t *co, rexdb_t *rexdb);
34
35 enum {
36         REX_TOKEN_ERR = -1,
37         REX_TOKEN_EOF = -2,
38 };
39
40 rexcompiler_t *rex_compiler_create()
41 {
42         rexcompiler_t *co;
43
44         co = (rexcompiler_t *)r_malloc(sizeof(*co));
45         r_memset(co, 0, sizeof(*co));
46         co->stack = r_array_create(sizeof(rexfragment_t*));
47         co->temptrans = r_array_create(sizeof(rex_transition_t));
48         return co;
49 }
50
51
52 void rex_compiler_destroy(rexcompiler_t *co)
53 {
54         if (co) {
55                 r_array_destroy(co->stack);
56                 r_array_destroy(co->temptrans);
57                 r_free(co);
58         }
59
60 }
61
62
63 static int rex_compiler_isspace(int c)
64 {
65         return isspace(c);
66 }
67
68
69 static int rex_compiler_isblank(int c)
70 {
71         return isblank(c);
72 }
73
74
75 static int rex_compiler_getchar(rexcompiler_t *co)
76 {
77         ruint32 wc = REX_TOKEN_EOF;
78         int inc = 0;
79
80 again:
81         if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
82                 co->ptr += 2;
83                 goto again;
84         }
85         if (co->ptr + 2 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\r' && *(co->ptr + 2) == '\n') {
86                 co->ptr += 3;
87                 goto again;
88         }
89         inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
90         if (inc <= 0)
91                 return REX_TOKEN_EOF;
92         if (wc == '\r' || wc == '\n') {
93                 return REX_TOKEN_EOF;
94         }
95         co->ptr += inc;
96         return wc;
97 }
98
99
100 static int rex_compiler_escapedchar(int c)
101 {
102         switch (c) {
103         case 'r':
104                 return '\r';
105         case 'n':
106                 return '\n';
107         case 't':
108                 return '\t';
109         case '0':
110                 return '\0';
111         case 'f':
112                 return '\f';
113         case 'v':
114                 return '\v';
115         case '\"':
116                 return '\"';
117         case '\'':
118                 return '\'';
119         default:
120                 return c;
121         }
122         return c;
123 }
124
125
126 static int rex_compiler_gettok(rexcompiler_t *co)
127 {
128         int c;
129
130         co->tokenptr = co->ptr;
131         c = rex_compiler_getchar(co);
132         if (c == REX_TOKEN_EOF) {
133                 co->tokenstr[0] = '\0';
134                 co->token = REX_TOKEN_EOF;
135                 return co->token;
136         } else {
137                 co->tokenstr[0] = c;
138                 co->tokenstr[1] = '\0';
139                 co->token = c;
140         }
141         return co->token;
142 }
143
144
145 /*
146  * Don't do anything if this is not an escaped token
147  */
148 static void rex_compiler_adjustescapedtoken(rexcompiler_t *co)
149 {
150         if (co->token == '\\') {
151                 rex_compiler_gettok(co);                /* eat '\\' */
152                 if (co->token == REX_TOKEN_EOF) {
153                         return;
154                 }
155                 co->token = rex_compiler_escapedchar(co->token);
156         }
157 }
158
159
160 static int rex_compiler_getnbtok(rexcompiler_t *co)
161 {
162         while (co->ptr < co->end && rex_compiler_isblank(*co->ptr))
163                 co->ptr += 1;
164         return rex_compiler_gettok(co);
165
166 }
167
168
169 static int rex_compiler_getnstok(rexcompiler_t *co)
170 {
171         while (co->ptr < co->end && rex_compiler_isspace(*co->ptr))
172                 co->ptr += 1;
173         return rex_compiler_gettok(co);
174 }
175
176
177 #define REX_CONV_BUFSIZE 128
178 int rex_compiler_numerictoken(rexcompiler_t *co, rexdb_t *rexdb)
179 {
180         char convbuf[REX_CONV_BUFSIZE + 1];
181         int i = 0;
182         int base = 0;
183
184         rex_compiler_gettok(co); /* Eat # */
185         if (co->token == 'x' || co->token == 'X') {
186                 rex_compiler_gettok(co);
187                 base = 16;
188         } else if (co->token == '0') {
189                 rex_compiler_gettok(co);
190                 convbuf[i++] = '0';
191                 if (co->token == 'x' || co->token == 'X') {
192                         rex_compiler_gettok(co);
193                         i = 0;
194                         base = 16;
195                 } else {
196                         base = 8;
197                 }
198         } else if (!isdigit(co->token)) {
199                 return -1;
200         }
201         while ((base == 16 && isxdigit(co->token)) || (base == 8 && isdigit(co->token) && co->token != '8' && co->token != '9') || isdigit(co->token)) {
202                 if (i >= REX_CONV_BUFSIZE)
203                         return -1;
204                 convbuf[i++] = co->token;
205                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
206                         break;
207         }
208         convbuf[i++] = 0;
209         co->numtoken = strtoul(convbuf, NULL, base);
210         return 0;
211 }
212
213
214 int rex_compiler_charclass(rexcompiler_t *co, rexdb_t *rexdb)
215 {
216         int negative = 0;
217         int low;
218         int high;
219         long i;
220         rex_transition_t *t;
221         rexstate_t *srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
222         rexfragment_t *frag = NULL;
223
224         rex_compiler_gettok(co);
225         if (co->token == '^') {
226                 negative = 1;
227                 rex_compiler_gettok(co);
228         }
229
230         while (1) {
231                 if (co->token == '[' || co->token == ']' || co->token == '-' ||  co->token == REX_TOKEN_EOF)
232                         return -1;
233                 if (co->token == '#') {
234                         if (rex_compiler_numerictoken(co, rexdb) < 0)
235                                 return -1;
236                         high = low = co->numtoken;
237                 } else {
238                         rex_compiler_adjustescapedtoken(co);
239                         high = low = co->token;
240                         rex_compiler_gettok(co);
241                 }
242                 if (co->token == '-') {
243                         rex_compiler_gettok(co);
244                         switch (co->token) {
245                         case '[':
246                         case ']':
247                         case '-':
248                         case REX_TOKEN_EOF:
249                         case REX_TOKEN_ERR:
250                                 return -1;
251                                 break;
252                         }
253                         if (co->token == '#') {
254                                 if (rex_compiler_numerictoken(co, rexdb) < 0)
255                                         return -1;
256                                 high = co->numtoken;
257                         } else {
258                                 rex_compiler_adjustescapedtoken(co);
259                                 high = co->token;
260                                 rex_compiler_gettok(co);
261                         }
262                 }
263                 rex_state_addtransition(srcstate, low, high, -1);
264                 if (co->token == ']') {
265                         rex_compiler_getnbtok(co); /* eat it */
266                         break;
267                 }
268         }
269         rex_transitions_normalize(srcstate->trans);
270         if (negative) {
271                 r_array_setlength(co->temptrans, 0);
272                 rex_transitions_negative(co->temptrans, srcstate->trans, srcstate->uid, -1);
273                 r_array_setlength(srcstate->trans, 0);
274                 for (i = 0; i < r_array_length(co->temptrans); i++) {
275                         t = (rex_transition_t *)r_array_slot(co->temptrans, i);
276                         r_array_add(srcstate->trans, t);
277                 }
278                 rex_transitions_normalize(srcstate->trans);
279         }
280         frag = rex_fragment_create(srcstate);
281         FPUSH(co, frag);
282
283         return 0;
284 }
285
286
287 static int rex_compiler_factor(rexcompiler_t *co, rexdb_t *rexdb)
288 {
289         rexstate_t *srcstate;
290         rexfragment_t *frag;
291
292         if (co->token == '[') {
293                 return rex_compiler_charclass(co, rexdb);
294         } else if (co->token == '(') {
295                 rex_compiler_getnbtok(co);              /* eat '(' */
296                 if (co->token == ')')
297                         return -1;
298                 if (rex_compiler_altexpression(co, rexdb) < 0)
299                         return -1;
300                 if (co->token != ')')
301                         return -1;
302                 rex_compiler_getnbtok(co);              /* eat ')' */
303                 return 0;
304         } else if (co->token == '.') {
305                 srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
306                 rex_compiler_adjustescapedtoken(co);
307                 rex_state_addtransition(srcstate, 0, '\n' - 1, -1);
308                 rex_state_addtransition(srcstate, '\n' + 1, REX_CHAR_MAX, -1);
309                 frag = rex_fragment_create(srcstate);
310                 FPUSH(co, frag);
311                 rex_compiler_getnbtok(co);
312                 return 0;
313         } else {
314                 srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
315                 rex_compiler_adjustescapedtoken(co);
316                 rex_state_addtransition(srcstate, co->token, co->token, -1);
317                 frag = rex_fragment_create(srcstate);
318                 FPUSH(co, frag);
319                 rex_compiler_getnbtok(co);
320                 return 0;
321         }
322
323         return -1;
324 }
325
326
327 static int rex_compiler_parsecount(rexcompiler_t *co, rexdb_t *rexdb)
328 {
329         char convbuf[REX_CONV_BUFSIZE + 1];
330         int i = 0;
331
332         rex_compiler_getnbtok(co);              /* eat '{' */
333         if (!isdigit(co->token))
334                 return -1;
335         for (i = 0; isdigit(co->token); i++) {
336                 if (i >= REX_CONV_BUFSIZE)
337                         return -1;
338                 convbuf[i] = co->token;
339                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
340                         return -1;
341         }
342         convbuf[i++] = 0;
343         co->fromcount = strtoul(convbuf, NULL, 10);
344         co->tocount = co->fromcount;
345         if (rex_compiler_isblank(co->token)) {
346                 rex_compiler_getnbtok(co);
347         }
348         if (co->token == ',') {
349                 rex_compiler_getnbtok(co);              /* eat ',' */
350                 co->tocount = (unsigned long)-1;
351                 if (isdigit(co->token)) {
352                         for (i = 0; isdigit(co->token); i++) {
353                                 if (i >= REX_CONV_BUFSIZE)
354                                         return -1;
355                                 convbuf[i] = co->token;
356                                 if (rex_compiler_gettok(co) == REX_TOKEN_EOF)
357                                         return -1;
358                         }
359                         convbuf[i++] = 0;
360                         co->tocount = strtoul(convbuf, NULL, 10);
361                         if (rex_compiler_isblank(co->token)) {
362                                 rex_compiler_getnbtok(co);
363                         }
364                 }
365         }
366         if (co->token != '}')
367                 return -1;
368         rex_compiler_getnbtok(co);              /* eat '}' */
369         return 0;
370 }
371
372
373 static int rex_compiler_qfactor(rexcompiler_t *co, rexdb_t *rexdb)
374 {
375         rexfragment_t *frag, *frag1, *frag2;
376         const char *fstart, *fend;
377         rexstate_t *srcstate;
378         long nstates, i;
379
380         if (strchr("{}*?+]\n\r)", co->token)) {
381                 /*
382                  * Unexpected char.
383                  */
384                 return -1;
385         }
386         nstates = r_array_length(rexdb->states);
387         fstart = co->tokenptr;
388         if (rex_compiler_factor(co, rexdb) < 0) {
389                 return -1;
390         }
391         fend = co->tokenptr;
392         switch (co->token) {
393         case '*':
394                 rex_compiler_getnbtok(co); /* Eat it */
395                 frag = FPOP(co);
396                 frag = rex_fragment_mop(rexdb, frag);
397                 FPUSH(co, frag);
398                 break;
399         case '?':
400                 rex_compiler_getnbtok(co); /* Eat it */
401                 frag = FPOP(co);
402                 frag = rex_fragment_opt(rexdb, frag);
403                 FPUSH(co, frag);
404                 break;
405         case '+':
406                 rex_compiler_getnbtok(co); /* Eat it */
407                 frag = FPOP(co);
408                 frag = rex_fragment_mul(rexdb, frag);
409                 FPUSH(co, frag);
410                 break;
411         case '{':
412                 if (rex_compiler_parsecount(co, rexdb) < 0)
413                         return -1;
414                 if (co->fromcount > co->tocount)
415                         return -1;
416                 if (co->fromcount == 0 && co->tocount == (unsigned long)-1) {
417                         frag = FPOP(co);
418                         frag = rex_fragment_mop(rexdb, frag);
419                         FPUSH(co, frag);
420                         break;
421                 }
422                 if (co->fromcount == 1 && co->tocount == (unsigned long)-1) {
423                         frag = FPOP(co);
424                         frag = rex_fragment_mul(rexdb, frag);
425                         FPUSH(co, frag);
426                         break;
427                 }
428                 if (co->fromcount == 0 && co->tocount == 0) {
429                         /*
430                          * Special case
431                          */
432                         frag = FPOP(co);
433                         for (i = nstates; i < r_array_length(rexdb->states); i++) {
434                                 srcstate = rex_db_getstate(rexdb, i);
435                                 rex_state_destroy(srcstate);
436                         }
437                         r_array_setlength(rexdb->states, nstates);
438                         srcstate = rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_NONE));
439                         rex_compiler_adjustescapedtoken(co);
440                         rex_state_addtransition_e(srcstate, -1);
441                         frag = rex_fragment_create(srcstate);
442                         FPUSH(co, frag);
443                         break;
444                 }
445                 if (co->fromcount == 0) {
446                         frag = FPOP(co);
447                         frag = rex_fragment_opt(rexdb, frag);
448                         FPUSH(co, frag);
449                 }
450                 do {
451                         long count;
452                         const char *saved_start = co->start;
453                         const char *saved_end = co->end;
454                         const char *saved_ptr = co->ptr;
455                         const char *saved_tokenptr = co->tokenptr;
456                         int saved_token = co->token;
457                         for (count = 1; count < co->tocount; count++) {
458                                 co->start = fstart;
459                                 co->end = fend;
460                                 co->ptr = fstart;
461                                 rex_compiler_getnbtok(co);
462                                 rex_compiler_catexpression(co, rexdb);
463                                 frag2 = FPOP(co);
464                                 if (co->tocount == (unsigned long)-1 && count >= co->fromcount) {
465                                         frag2 = rex_fragment_mop(rexdb, frag2);
466                                         frag1 = FPOP(co);
467                                         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
468                                         FPUSH(co, frag1);
469                                         break;
470                                 }
471                                 if (count >= co->fromcount) {
472                                         frag2 = rex_fragment_opt(rexdb, frag2);
473                                 }
474                                 frag1 = FPOP(co);
475                                 frag1 = rex_fragment_cat(rexdb, frag1, frag2);
476                                 FPUSH(co, frag1);
477                         }
478 //                      fwrite(fstart, 1, fend - fstart, stdout);
479 //                      fprintf(stdout, "{%ld,%ld}\n", co->fromcount, co->tocount);
480                         co->start = saved_start;
481                         co->end = saved_end;
482                         co->ptr = saved_ptr;
483                         co->tokenptr = saved_tokenptr;
484                         co->token = saved_token;
485                 } while (0);
486                 break;
487         }
488         return 0;
489 }
490
491
492 static int rex_compiler_catexpression(rexcompiler_t *co, rexdb_t *rexdb)
493 {
494         int nfactors = 0;
495         rexfragment_t *frag1;
496         rexfragment_t *frag2;
497
498         if (co->token == REX_TOKEN_EOF)
499                 return -1;
500         while (1) {
501                 switch (co->token) {
502                 case REX_TOKEN_EOF:
503                 case '|':
504                 case ')':
505                         return 0;
506                 }
507                 if (rex_compiler_qfactor(co, rexdb) < 0)
508                         return -1;
509                 ++nfactors;
510                 if (nfactors >= 2) {
511                         frag2 = FPOP(co);
512                         frag1 = FPOP(co);
513                         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
514                         FPUSH(co, frag1);
515                 }
516         }
517         return 0;
518 }
519
520
521 static int rex_compiler_altexpression(rexcompiler_t *co, rexdb_t *rexdb)
522 {
523         rexfragment_t *frag1;
524         rexfragment_t *frag2;
525
526         if (rex_compiler_catexpression(co, rexdb) < 0)
527                 return -1;
528         while (co->token == '|') {
529                 rex_compiler_getnstok(co); /* Eat '|' */
530                 switch (co->token) {
531                 case REX_TOKEN_EOF:
532                 case '|':
533                 case ')':
534                 case ']':
535                         return -1;
536                 }
537                 if (rex_compiler_catexpression(co, rexdb) < 0)
538                         return -1;
539                 frag2 = FPOP(co);
540                 frag1 = FPOP(co);
541                 frag1 = rex_fragment_alt(rexdb, frag1, frag2);
542                 FPUSH(co, frag1);
543         }
544         return 0;
545 }
546
547
548 long rex_compiler_expression(rexcompiler_t *co, rexdb_t *rexdb, const char *str, unsigned int size, rexuserdata_t userdata)
549 {
550         long curlen = r_array_length(rexdb->states);
551         long i;
552         long ret = -1L;
553         rexfragment_t *frag1 = NULL, *frag2 = NULL;
554
555         co->start = str;
556         co->end = str + size;
557         co->ptr = str;
558
559         rex_compiler_getnbtok(co);
560         if (rex_compiler_altexpression(co, rexdb) < 0)
561                 goto error;
562         frag1 = FPOP(co);
563         frag2 = rex_fragment_create(rex_db_getstate(rexdb, rex_db_createstate(rexdb, REX_STATETYPE_ACCEPT)));
564         rex_state_setuserdata(REX_FRAG_STATE(frag2), userdata);
565         frag1 = rex_fragment_cat(rexdb, frag1, frag2);
566         ret = REX_FRAG_STATEUID(frag1);
567         REX_FRAG_STATE(frag1)->type = REX_STATETYPE_START;
568         rex_fragment_destroy(frag1);
569         return ret;
570 error:
571         if (curlen < r_array_length(rexdb->states)) {
572                 for (i = curlen; i < r_array_length(rexdb->states); i++)
573                         rex_state_destroy(rex_db_getstate(rexdb, i));
574                 r_array_setlength(rexdb->states, curlen);
575         }
576         while (FLEN(co)) {
577                 rex_fragment_destroy(FPOP(co));
578         }
579         return -1;
580 }
581
582
583 long rex_compiler_expression_s(rexcompiler_t *co, rexdb_t *rexdb, const char *str, rexuserdata_t userdata)
584 {
585         return rex_compiler_expression(co, rexdb, str, r_strlen(str), userdata);
586 }
587
588
589 long rex_compiler_addexpression(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, unsigned int size, rexuserdata_t userdata)
590 {
591         rexstate_t *sprev = NULL, *scur = NULL;
592         long cur = rex_compiler_expression(co, rexdb, str, size, userdata);
593         if (cur < 0)
594                 return -1;
595         sprev = rex_db_getstate(rexdb, prev);
596         scur = rex_db_getstate(rexdb, cur);
597         if (!sprev)
598                 return scur->uid;
599         rex_state_addtransition_e_dst(sprev, scur);
600         scur->type = REX_STATETYPE_NONE;
601         sprev->type = REX_STATETYPE_START;
602         return prev;
603 }
604
605
606 long rex_compiler_addexpression_s(rexcompiler_t *co, rexdb_t *rexdb, unsigned long prev, const char *str, rexuserdata_t userdata)
607 {
608         return rex_compiler_addexpression(co, rexdb, prev, str, r_strlen(str), userdata);
609 }