RPA Toolkit
Added support for binary serialization of DFA
authorMartin Stoilov <martin@rpasearch.com>
Sat, 4 Feb 2012 05:05:18 +0000 (21:05 -0800)
committerMartin Stoilov <martin@rpasearch.com>
Sat, 4 Feb 2012 05:05:18 +0000 (21:05 -0800)
rex/rexdb.c
rex/rexdfa.c
rex/rexdfa.h
rex/rexdfaconv.c
rexgrep/build/unix/regexgen.sh
rexgrep/rexgrep.c
rexgrep/unix/main.c

index f3218af..950b357 100644 (file)
@@ -388,7 +388,7 @@ rexdfa_t *rex_db_todfa(rexdb_t *db)
        unsigned long ntrans = rex_db_numtransitions(db);
        unsigned long nsubstates = rex_db_numsubstates(db);
        unsigned long naccsubstates = rex_db_numaccsubstates(db);
-       dfa = rex_dfa_create(nstates, ntrans, nsubstates, naccsubstates);
+       dfa = rex_dfa_create(nstates, ntrans, naccsubstates, nsubstates);
        r_memset(&ctx, 0, sizeof(ctx));
 
        for (i = 0; i < r_array_length(db->states); i++) {
index 63efe5f..cd7f2d2 100644 (file)
 #include "rex/rexdfa.h"
 
 
-rexdfa_t *rex_dfa_create(rexuint_t nstates, rexuint_t ntrans, rexuint_t nsubstates, rexuint_t naccsubstates)
+rexdfa_t *rex_dfa_create(rexuint_t nstates, rexuint_t ntrans, rexuint_t naccsubstates, rexuint_t nsubstates)
 {
        rexdfa_t *dfa = (rexdfa_t *)r_zmalloc(sizeof(*dfa));
        dfa->states = r_zmalloc(sizeof(rexdfs_t) * nstates);
        dfa->trans = r_zmalloc(sizeof(rexdft_t) * ntrans);
-       dfa->substates = r_zmalloc(sizeof(rexdfss_t) * nsubstates);
-       dfa->accsubstates = r_zmalloc(sizeof(rexdfss_t) * naccsubstates);
+       if (naccsubstates)
+               dfa->accsubstates = r_zmalloc(sizeof(rexdfss_t) * naccsubstates);
+       if (nsubstates)
+               dfa->substates = r_zmalloc(sizeof(rexdfss_t) * nsubstates);
        dfa->nstates = nstates;
        dfa->ntrans = ntrans;
        dfa->nsubstates = nsubstates;
@@ -128,6 +130,7 @@ void rex_dfa_dumpstate(rexdfa_t *dfa, rexuint_t nstate)
        int bufsize = sizeof(buf) - 1;
        int n = 0;
        rexdfs_t *s = rex_dfa_state(dfa, nstate);
+       rexdfss_t *ss = NULL;
        rexdft_t *t = NULL;
 
        if (!s)
@@ -135,11 +138,14 @@ void rex_dfa_dumpstate(rexdfa_t *dfa, rexuint_t nstate)
        fprintf(stdout, "State %ld", (unsigned long)nstate);
        fprintf(stdout, " (");
        for (index = 0; index < s->nsubstates; index++) {
-               fprintf(stdout, "%ld", (unsigned long)dfa->substates[s->substates + index].uid);
-               if (dfa->substates[s->substates + index].type == REX_STATETYPE_ACCEPT)
-                       fprintf(stdout, "*");
-               if (index + 1 < s->nsubstates)
-                       fprintf(stdout, ",");
+               ss = REX_DFA_SUBSTATE(dfa, nstate, index);
+               if (ss) {
+                       fprintf(stdout, "%ld", (unsigned long)ss->uid);
+                       if (ss->type == REX_STATETYPE_ACCEPT)
+                               fprintf(stdout, "*");
+                       if (index + 1 < s->nsubstates)
+                               fprintf(stdout, ",");
+               }
        }
        fprintf(stdout, ")");
 
@@ -149,9 +155,12 @@ void rex_dfa_dumpstate(rexdfa_t *dfa, rexuint_t nstate)
                fprintf(stdout, " REX_STATETYPE_ACCEPT ");
                fprintf(stdout, " (");
                for (index = 0; index < s->naccsubstates; index++) {
-                       fprintf(stdout, "%ld*", (unsigned long)dfa->accsubstates[s->accsubstates + index].uid);
-                       if (index + 1 < s->naccsubstates)
-                               fprintf(stdout, ",");
+                       ss = REX_DFA_ACCSUBSTATE(dfa, nstate, index);
+                       if (ss) {
+                               fprintf(stdout, "%ld*", (unsigned long)ss->uid);
+                               if (index + 1 < s->naccsubstates)
+                                       fprintf(stdout, ",");
+                       }
                }
                fprintf(stdout, ")");
        } else if (s->type == REX_STATETYPE_DEAD) {
@@ -165,14 +174,14 @@ void rex_dfa_dumpstate(rexdfa_t *dfa, rexuint_t nstate)
                n = 0;
                if (t->lowin != t->highin) {
                        if (isprint(t->lowin) && !isspace(t->lowin) && isprint(t->highin) && !isspace(t->highin))
-                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        [%c - %c] ", t->lowin, t->highin);
+                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "    [%c - %c] ", t->lowin, t->highin);
                        else
-                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        [0x%X - 0x%X] ", t->lowin, t->highin);
+                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "    [0x%X - 0x%X] ", t->lowin, t->highin);
                } else {
                        if (isprint(t->lowin) && !isspace(t->lowin))
-                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        '%c' ", t->lowin);
+                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "    '%c' ", t->lowin);
                        else
-                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        0x%X ", t->lowin);
+                               n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "    0x%X ", t->lowin);
                }
                r_memset(buf + n, ' ', bufsize - n);
                n = 40;
index 82b6df7..458f717 100644 (file)
@@ -33,7 +33,7 @@ typedef REX_USERDATA_TYPE rexuserdata_t;
 
 
 #ifndef REX_UINT_TYPE
-typedef unsigned long rexuint_t;
+typedef unsigned int rexuint_t;
 #else
 typedef REX_UINT_TYPE rexuint_t;
 #endif
@@ -59,8 +59,8 @@ typedef enum {
 
 #define REX_DFA_STATE(__dfa__, __nstate__)                                                     (&(__dfa__)->states[__nstate__])
 #define REX_DFA_TRANSITION(__dfa__, __nstate__, __ntrans__)                    (&(__dfa__)->trans[(REX_DFA_STATE(__dfa__, __nstate__)->trans) + (__ntrans__)])
-#define REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__)           (&(__dfa__)->substates[REX_DFA_STATE(__dfa__, __nstate__)->substates + (__nsubstate__)])
-#define REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__)     (&(__dfa__)->accsubstates[REX_DFA_STATE(__dfa__, __nstate__)->accsubstates + (__naccsubstate__)])
+#define REX_DFA_SUBSTATE(__dfa__, __nstate__, __nsubstate__)           ((__dfa__)->substates ? &(__dfa__)->substates[REX_DFA_STATE(__dfa__, __nstate__)->substates + (__nsubstate__)] : ((void*)0))
+#define REX_DFA_ACCSUBSTATE(__dfa__, __nstate__, __naccsubstate__)     ((__dfa__)->accsubstates ? &(__dfa__)->accsubstates[REX_DFA_STATE(__dfa__, __nstate__)->accsubstates + (__naccsubstate__)] : ((void*)0))
 #define REX_DFA_NEXT(__dfa__, __nstate__, __input__) \
                ({ \
                        rexdft_t *t; \
@@ -76,7 +76,7 @@ typedef enum {
                        } \
                        min -= (min > 0) ? 1 : 0; \
                        t = REX_DFA_TRANSITION(__dfa__, __nstate__, min); \
-                       (((__input__) >= t->lowin && (__input__) <= t->highin) ? t->state : 0); \
+                       (t->state); \
                })
 
 
@@ -107,10 +107,10 @@ typedef struct rexdfs_s {
        rexuint_t type;
        rexuint_t trans;
        rexuint_t ntrans;
-       rexuint_t substates;
-       rexuint_t nsubstates;
        rexuint_t accsubstates;
        rexuint_t naccsubstates;
+       rexuint_t substates;
+       rexuint_t nsubstates;
 } rexdfs_t;
 
 
@@ -122,14 +122,14 @@ typedef struct rexdfa_s {
        rexdfs_t *states;
        rexuint_t ntrans;
        rexdft_t *trans;
-       rexuint_t nsubstates;
-       rexdfss_t *substates;
        rexuint_t naccsubstates;
        rexdfss_t *accsubstates;
+       rexuint_t nsubstates;
+       rexdfss_t *substates;
 } rexdfa_t;
 
 
-rexdfa_t *rex_dfa_create(rexuint_t nstates, rexuint_t ntrans, rexuint_t nsubsets, rexuint_t naccsubsets);
+rexdfa_t *rex_dfa_create(rexuint_t nstates, rexuint_t ntrans, rexuint_t naccsubstates, rexuint_t nsubstates);
 void rex_dfa_destroy(rexdfa_t *dfa);
 void rex_dfa_dumpstate(rexdfa_t *dfa, rexuint_t nstate);
 rexdfs_t *rex_dfa_state(rexdfa_t *dfa, rexuint_t nstate);
index 2e346f3..f9586b2 100644 (file)
@@ -63,7 +63,7 @@ rexdfaconv_t *rex_dfaconv_create()
        co->trans = r_array_create(sizeof(rex_transition_t));
        co->temptrans = r_array_create(sizeof(rex_transition_t));
        co->marked = r_array_create(sizeof(unsigned char));
-       co->hash = r_hash_create(12, rex_dfaconv_subsetequal, rex_dfaconv_subsethash);
+       co->hash = r_hash_create(16, rex_dfaconv_subsetequal, rex_dfaconv_subsethash);
        return co;
 }
 
index de30f9e..ed8f492 100755 (executable)
@@ -1,10 +1,12 @@
 #!/bin/bash
 
-for i in {a..z} ; do 
-    for j in {a..z} {A..Z}; do 
-       echo -n "${i}*${j}h*e?l${i}othere|"; 
+for i in {a..z} {A..Z} {0..9}; do 
+    for j in {a..z} {A..Z} {0..9}; do 
+       for k in {z..z}; do 
+           echo -n "${i}*${j}h*e?l([${i}-${j}]|ot${j}h)${k}*ewre|"; 
+       done
     done
 done
 
-echo -n "hesi..|martin|lesssthan.*there|hello.*|mello.*|fori|morei|elsi|whilei|doio|l*(i(n.*)*.)+ux|ponty|montyie|smuzi|lussi.*|"
+echo -n "hesi..z|ma?rti*n|"
 echo "theregexend"; 
index 91c47df..150e50f 100644 (file)
@@ -85,35 +85,30 @@ int rex_grep_load_pattern(rexgrep_t *pGrep, rbuffer_t *buf)
 }
 
 
-int rex_grep_dfamatch(rexgrep_t *pGrep, const char* input, const char *end)
-{
-       ruint32 wc = 0;
-       int inc = 0, ret = 0;
-       long nstate = REX_DFA_STARTSTATE;
-       const char *start = input;
-       rexdfa_t *dfa = pGrep->dfa;
-       rexdfs_t *s;
-
-       while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end)) > 0) {
-               if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) <= 0)
-                       break;
-               input += inc;
-               s = REX_DFA_STATE(dfa, nstate);
-               if (s->type == REX_STATETYPE_ACCEPT)
-                       ret = input - start;
-       }
-       return ret;
-}
-
-
 int rex_grep_match(rexgrep_t *pGrep, const char* input, const char *end)
 {
-       int inc;
+       int inc = 1;
        ruint32 wc;
        rexdb_t *db;
 
-       if (pGrep->usedfa)
-               return rex_grep_dfamatch(pGrep, input, end);
+       if (pGrep->usedfa) {
+               ruint32 wc = 0;
+               int ret = 0;
+               long nstate = REX_DFA_STARTSTATE;
+               const char *start = input;
+               rexdfa_t *dfa = pGrep->dfa;
+               rexdfs_t *s;
+
+               while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end)) > 0) {
+                       if ((nstate = REX_DFA_NEXT(dfa, nstate, wc)) <= 0)
+                               break;
+                       input += inc;
+                       s = REX_DFA_STATE(dfa, nstate);
+                       if (s->type == REX_STATETYPE_ACCEPT)
+                               ret = input - start;
+               }
+               return ret;
+       }
 
        if (pGrep->startuid < 0) {
                return -1;
index 4ebfa91..330b440 100644 (file)
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 #include <wchar.h>
 #include <time.h>
+#include <errno.h>
 #include "rlib/rmem.h"
 #include "rlib/rarray.h"
 #include "rex/rexdfaconv.h"
@@ -46,6 +47,8 @@ int usage(int argc, const char *argv[])
                fprintf(stderr, " OPTIONS:\n");
                fprintf(stderr, "\t-e patterns              Regular Expression.\n");
                fprintf(stderr, "\t-f patternfile           Read Regular Expressions from a file.\n");
+               fprintf(stderr, "\t-c binfile               Compile DFA and save to binfile.\n");
+               fprintf(stderr, "\t-b binfile               Load DFA from binfile.\n");
                fprintf(stderr, "\t-o, --only-matching      Show only the part of a line matching PATTERN\n");
                fprintf(stderr, "\t-l                       Line mode.\n");
                fprintf(stderr, "\t-N                       Use NFA.\n");
@@ -111,6 +114,8 @@ int main(int argc, const char *argv[])
        int ret, scanned = 0, i;
        rexgrep_t *pGrep;
        rarray_t *buffers;
+       const char *loadbinfile = NULL;
+       const char *savebinfile = NULL;
        FILE *devnull = NULL;
 
        buffers = r_array_create(sizeof(rbuffer_t *));
@@ -149,32 +154,54 @@ int main(int argc, const char *argv[])
        }
 
        for (i = 1; i < argc; i++) {
-               if (strcmp(argv[i], "-f") == 0) {
+               if (strcmp(argv[i], "-c") == 0) {
                        if (++i < argc) {
-                               rbuffer_t *pattern = rex_buffer_map_file(argv[i]);
-                               if (pattern) {
-                                       ret = rex_grep_load_pattern(pGrep, pattern);
-                                       r_array_add(buffers, &pattern);
-                               } else {
-                                       ret = -1;
-                               }
-                               if (ret < 0)
-                                       goto error;
+                               savebinfile = argv[i];
                        }
                }
        }
 
        for (i = 1; i < argc; i++) {
-               if (strcmp(argv[i], "-e") == 0) {
+               if (strcmp(argv[i], "-b") == 0) {
                        if (++i < argc) {
-                               rbuffer_t pattern;
-                               pattern.s = (char*)argv[i];
-                               pattern.size = strlen(argv[i]);
-                               ret = rex_grep_load_string_pattern(pGrep, &pattern);
-                               if (ret < 0)
-                                       goto error;
+                               loadbinfile = argv[i];
+                       }
+               }
+       }
+
+       if (!loadbinfile) {
+               for (i = 1; i < argc; i++) {
+                       if (strcmp(argv[i], "-f") == 0) {
+                               if (++i < argc) {
+                                       rbuffer_t *pattern = rex_buffer_map_file(argv[i]);
+                                       if (pattern) {
+                                               ret = rex_grep_load_pattern(pGrep, pattern);
+                                               r_array_add(buffers, &pattern);
+                                       } else {
+                                               ret = -1;
+                                       }
+                                       if (ret < 0)
+                                               goto error;
+                               }
+                       }
+               }
+               for (i = 1; i < argc; i++) {
+                       if (strcmp(argv[i], "-e") == 0) {
+                               if (++i < argc) {
+                                       rbuffer_t pattern;
+                                       pattern.s = (char*)argv[i];
+                                       pattern.size = strlen(argv[i]);
+                                       ret = rex_grep_load_string_pattern(pGrep, &pattern);
+                                       if (ret < 0)
+                                               goto error;
+                               }
+
+                       }
+               }
+               for (i = 1; i < argc; i++) {
+                       if (strcmp(argv[i], "-N") == 0) {
+                               pGrep->usedfa = 0;
                        }
-                       
                }
        }
 
@@ -201,13 +228,25 @@ int main(int argc, const char *argv[])
                }
        }
 
-       for (i = 1; i < argc; i++) {
-               if (strcmp(argv[i], "-N") == 0) {
-                       pGrep->usedfa = 0;
+       if (!pGrep->dfa && loadbinfile) {
+               FILE *pfile = NULL;
+               rexdfa_t dfa;
+               r_memset(&dfa, 0, sizeof(dfa));
+               pfile = fopen(loadbinfile, "rb");
+               if (!pfile) {
+                       fprintf(stderr, "Failed to open file: %s, %s\n", loadbinfile, strerror(errno));
+                       goto error;
                }
+               fread(&dfa, sizeof(dfa), 1, pfile);
+               pGrep->dfa = rex_dfa_create(dfa.nstates, dfa.ntrans, dfa.naccsubstates, dfa.nsubstates);
+               fread(pGrep->dfa->states, sizeof(*dfa.states), dfa.nstates, pfile);
+               fread(pGrep->dfa->trans, sizeof(*dfa.trans), dfa.ntrans, pfile);
+               fread(pGrep->dfa->accsubstates, sizeof(*dfa.accsubstates), dfa.naccsubstates, pfile);
+               fread(pGrep->dfa->substates, sizeof(*dfa.substates), dfa.nsubstates, pfile);
+               fclose(pfile);
        }
 
-       if (pGrep->usedfa) {
+       if (!pGrep->dfa && pGrep->usedfa) {
                rexdb_t *dfadb = rex_db_createdfa(pGrep->nfa, pGrep->startuid);
                pGrep->dfa = rex_db_todfa(dfadb);
                rex_db_destroy(dfadb);
@@ -230,12 +269,35 @@ int main(int argc, const char *argv[])
                }
        }
 
+       if (pGrep->dfa && savebinfile) {
+               rexdfa_t dfa = *pGrep->dfa;
+               dfa.nsubstates = 0;
+               dfa.substates = NULL;
+               dfa.states = NULL;
+               dfa.trans = NULL;
+               dfa.accsubstates = NULL;
+               FILE *pfile = fopen(savebinfile, "wb");
+               if (!pfile) {
+                       fprintf(stderr, "Failed to create file: %s, %s\n", savebinfile, strerror(errno));
+                       goto error;
+               }
+               fwrite(&dfa, sizeof(dfa), 1, pfile);
+               dfa.states = pGrep->dfa->states;
+               dfa.trans = pGrep->dfa->trans;
+               dfa.accsubstates = pGrep->dfa->accsubstates;
+               fwrite(dfa.states, sizeof(*dfa.states), dfa.nstates, pfile);
+               fwrite(dfa.trans, sizeof(*dfa.trans), dfa.ntrans, pfile);
+               fwrite(dfa.accsubstates, sizeof(*dfa.accsubstates), dfa.naccsubstates, pfile);
+               fclose(pfile);
+               goto end;
+       }
+
        /* scan files */
        for (i = 1; i < argc; i++) {
                if (argv[i][0] != '-') {
                        ++scanned;
                        rex_grep_scan_path(pGrep, argv[i]);
-               } else if (argv[i][1] == 'e' || argv[i][1] == 'f' || argv[i][1] == 'c' || argv[i][1] == 'C'){
+               } else if (argv[i][1] == 'e' || argv[i][1] == 'f' || argv[i][1] == 'c' || argv[i][1] == 'b'){
                        ++i;
                }
                
@@ -255,9 +317,21 @@ end:
        }
        r_object_destroy((robject_t*)buffers);
        ret = pGrep->ret;
+       if (pGrep->showtime) {
+               rexdfa_t *dfa = pGrep->dfa;
+               unsigned long sizestates = dfa->nstates * sizeof(rexdfs_t);
+               unsigned long sizetrans = dfa->ntrans * sizeof(rexdft_t);
+               unsigned long sizeaccsubs = dfa->naccsubstates * sizeof(rexdfss_t);
+               unsigned long sizesubs = dfa->nsubstates * sizeof(rexdfss_t);
+               unsigned long sizetotal = sizestates + sizetrans + sizeaccsubs + sizesubs + sizeof(rexdfa_t);
+               fprintf(stdout, "\n\n");
+               fprintf(stdout, "\tDFA Memory: %ld KB, States: %ld KB (%.2f), Transitions: %ld KB (%.2f), Accecpting Substates: %ld KB(%.2f), Substates: %ld KB (%.2f)\n",
+                               sizetotal/1024, sizestates/1024, (100.0*sizestates/sizetotal), sizetrans/1024, (100.0*sizetrans/sizetotal),
+                               sizeaccsubs/1024, (100.0*sizeaccsubs/sizetotal), sizesubs/1024, (100.0*sizesubs/sizetotal));
+       }
        rex_grep_destroy(pGrep);
        if (pGrep->showtime) {
-               fprintf(stdout, "memory: %ld KB (leaked %ld Bytes)\n", (long)r_debug_get_maxmem()/1024, (long)r_debug_get_allocmem());
+               fprintf(stdout, "\tmemory: %ld KB (leaked %ld Bytes)\n", (long)r_debug_get_maxmem()/1024, (long)r_debug_get_allocmem());
        }
 
        if (devnull)