RPA Toolkit
Work on rexdfa_t matching. Replaced rexdb_t DFA with rexdfa_t in rexgrep.
authorMartin Stoilov <martin@rpasearch.com>
Thu, 2 Feb 2012 04:58:07 +0000 (20:58 -0800)
committerMartin Stoilov <martin@rpasearch.com>
Thu, 2 Feb 2012 04:58:07 +0000 (20:58 -0800)
rex/rexdb.c
rex/rexdb.h
rex/rexdef.h
rex/rexdfa.c
rex/rexdfa.h
rexgrep/build/unix/i386/Makefile
rexgrep/rexgrep.c
rexgrep/rexgrep.h
rexgrep/unix/main.c

index 09e693d..cac618d 100644 (file)
@@ -308,6 +308,23 @@ long rex_db_numsubstates(rexdb_t *rexdb)
 }
 
 
+long rex_db_numaccsubstates(rexdb_t *rexdb)
+{
+       long i, j;
+       long ret = 0;
+
+       for (i = 0; i < r_array_length(rexdb->states); i++) {
+               rexstate_t *state = rex_db_getstate(rexdb, i);
+               for (j = 0; j < rex_subset_length(state->subset); j++) {
+                       rexsubstate_t *substate = rex_db_getsubstate(rexdb, rex_subset_index(state->subset, j));
+                       if (substate->ss_type == REX_STATETYPE_ACCEPT)
+                               ret += 1;
+               }
+       }
+       return ret;
+}
+
+
 const char *rex_db_version()
 {
        return "1.0";
index ccccb50..bb1143d 100644 (file)
@@ -66,6 +66,7 @@ void rex_db_dumpstate(rexdb_t *rexdb, unsigned long uid);
 long rex_db_numtransitions(rexdb_t *rexdb);
 long rex_db_numstates(rexdb_t *rexdb);
 long rex_db_numsubstates(rexdb_t *rexdb);
+long rex_db_numaccsubstates(rexdb_t *rexdb);
 const char *rex_db_version();
 
 
index 5816975..478867b 100644 (file)
@@ -1,8 +1,21 @@
 /*
- * rexdef.h
+ *  Regular Pattern Analyzer (RPA)
+ *  Copyright (c) 2009-2010 Martin Stoilov
  *
- *  Created on: Jan 31, 2012
- *      Author: mstoilov
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Martin Stoilov <martin@rpasearch.com>
  */
 
 #ifndef _REXDEF_H_
index 9ac6dac..7858aee 100644 (file)
@@ -1,9 +1,23 @@
 /*
- * rexdfa.c
+ *  Regular Pattern Analyzer (RPA)
+ *  Copyright (c) 2009-2010 Martin Stoilov
  *
- *  Created on: Jan 31, 2012
- *      Author: mstoilov
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Martin Stoilov <martin@rpasearch.com>
  */
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <ctype.h>
@@ -24,8 +38,8 @@ rexdfa_t *rex_dfa_create(unsigned long nstates, unsigned long ntrans, unsigned l
        rexdfa_t *dfa = (rexdfa_t *)r_zmalloc(sizeof(*dfa));
        dfa->states = r_zmalloc(sizeof(rexdfs_t) * nstates);
        dfa->trans = r_zmalloc(sizeof(rexdft_t) * ntrans);
-       dfa->substates = r_zmalloc(sizeof(rexsi_t) * nsubstates);
-       dfa->accsubstates = r_zmalloc(sizeof(rexsi_t) * naccsubstates);
+       dfa->substates = r_zmalloc(sizeof(rexdfss_t) * nsubstates);
+       dfa->accsubstates = r_zmalloc(sizeof(rexdfss_t) * naccsubstates);
        dfa->nstates = nstates;
        dfa->ntrans = ntrans;
        dfa->nsubstates = nsubstates;
@@ -41,6 +55,7 @@ void rex_dfa_destroy(rexdfa_t *dfa)
                r_free(dfa->accsubstates);
                r_free(dfa->states);
                r_free(dfa->trans);
+               r_free(dfa);
        }
 }
 
@@ -51,25 +66,38 @@ static void rex_dfa_fillstate(rexdb_t *db, rexdfa_t *dfa, struct rexdfa_ctx *ctx
        rex_transition_t *t = NULL;
        rexdfs_t *s = &dfa->states[ctx->nstates++];
        s->type = state->type;
-       s->trans = &dfa->trans[ctx->ntrnas];
+       s->trans = ctx->ntrnas;
        s->ntrans = r_array_length(state->trans);
        for (i = 0; i < s->ntrans; i++) {
                t = (rex_transition_t *)r_array_slot(state->trans, i);
-               s->trans[i].lowin = t->lowin;
-               s->trans[i].highin = t->highin;
-               s->trans[i].state = t->dstuid;
+               dfa->trans[s->trans + i].lowin = t->lowin;
+               dfa->trans[s->trans + i].highin = t->highin;
+               dfa->trans[s->trans + i].state = t->dstuid;
        }
        ctx->ntrnas += s->ntrans;
-       s->substates = &dfa->substates[ctx->nsubstates];
+       s->substates = ctx->nsubstates;
        s->nsubstates = rex_subset_length(state->subset);
        for (i = 0; i < s->nsubstates; i++) {
                unsigned long uid = rex_subset_index(state->subset, i);
                rexsubstate_t *substate = rex_db_getsubstate(db, uid);
-               s->substates[i].uid = uid;
-               s->substates[i].type = substate->ss_type;
-               s->substates[i].userdata = substate->ss_userdata;
+               dfa->substates[s->substates + i].uid = uid;
+               dfa->substates[s->substates + i].type = substate->ss_type;
+               dfa->substates[s->substates + i].userdata = substate->ss_userdata;
        }
        ctx->nsubstates += s->nsubstates;
+       s->accsubstates = ctx->naccsubstates;
+       s->naccsubstates = 0L;
+       for (i = 0; i < s->nsubstates; i++) {
+               unsigned long uid = rex_subset_index(state->subset, i);
+               rexsubstate_t *substate = rex_db_getsubstate(db, uid);
+               if (substate->ss_type == REX_STATETYPE_ACCEPT) {
+                       dfa->accsubstates[s->accsubstates + s->naccsubstates].uid = uid;
+                       dfa->accsubstates[s->accsubstates + s->naccsubstates].type = substate->ss_type;
+                       dfa->accsubstates[s->accsubstates + s->naccsubstates].userdata = substate->ss_userdata;
+                       s->naccsubstates++;
+               }
+       }
+       ctx->naccsubstates += s->naccsubstates;
 }
 
 
@@ -81,34 +109,118 @@ rexdfa_t *rex_dfa_create_from_db(rexdb_t *db)
        unsigned long nstates = rex_db_numstates(db);
        unsigned long ntrans = rex_db_numtransitions(db);
        unsigned long nsubstates = rex_db_numsubstates(db);
-       dfa = rex_dfa_create(nstates, ntrans, nsubstates, nsubstates);
+       unsigned long naccsubstates = rex_db_numaccsubstates(db);
+       dfa = rex_dfa_create(nstates, ntrans, nsubstates, naccsubstates);
        r_memset(&ctx, 0, sizeof(ctx));
 
        for (i = 0; i < r_array_length(db->states); i++) {
                rexstate_t *state = rex_db_getstate(db, i);
                rex_dfa_fillstate(db, dfa, &ctx, state);
        }
+       R_ASSERT(ctx.nstates == nstates);
+       R_ASSERT(ctx.ntrnas == ntrans);
+       R_ASSERT(ctx.nsubstates == nsubstates);
+       R_ASSERT(ctx.naccsubstates == naccsubstates);
        return dfa;
 }
 
 
-void rex_dfa_dumpstate(rexdfa_t *dfa, long off)
+rexdfs_t *rex_dfa_state(rexdfa_t *dfa, unsigned long nstate)
+{
+       rexdfs_t *s;
+       if (nstate >= dfa->nstates)
+               return NULL;
+       s = &dfa->states[nstate];
+       return s;
+}
+
+
+rexdft_t *rex_dfa_transition(rexdfa_t *dfa, unsigned long nstate, unsigned long ntrans)
+{
+       rexdft_t *t;
+       rexdfs_t *s = rex_dfa_state(dfa, nstate);
+       if (!s)
+               return NULL;
+       if (ntrans >= s->ntrans)
+               return NULL;
+       t = &dfa->trans[s->trans + ntrans];
+       return t;
+}
+
+
+rexdfss_t *rex_dfa_substate(rexdfa_t *dfa, unsigned long nstate, unsigned long nsubstate)
+{
+       rexdfss_t *ss;
+       rexdfs_t *s = rex_dfa_state(dfa, nstate);
+       if (!s)
+               return NULL;
+       if (nsubstate >= s->nsubstates)
+               return NULL;
+       ss = &dfa->substates[s->substates + nsubstate];
+       return ss;
+}
+
+
+rexdfss_t *rex_dfa_accsubstate(rexdfa_t *dfa, unsigned long nstate, unsigned long naccsubstate)
+{
+       rexdfss_t *ss;
+       rexdfs_t *s = rex_dfa_state(dfa, nstate);
+       if (!s)
+               return NULL;
+       if (naccsubstate >= s->naccsubstates)
+               return NULL;
+       ss = &dfa->substates[s->accsubstates + naccsubstate];
+       return ss;
+}
+
+
+long rex_dfa_next(rexdfa_t *dfa, unsigned long nstate, rexchar_t input)
+{
+       rexdft_t *t;
+       rexdfs_t *s = rex_dfa_state(dfa, nstate);
+       long min, max, mid;
+
+       if (!s || !s->ntrans)
+               return 0L;
+       min = 0;
+       max = min + s->ntrans;
+       while (max > min) {
+               mid = (min + max)/2;
+               t = rex_dfa_transition(dfa, nstate, mid);
+               if (input >= t->lowin) {
+                       min = mid + 1;
+               } else {
+                       max = mid;
+               }
+       }
+       if (min > 0)
+               --min;
+       t = rex_dfa_transition(dfa, nstate, min);
+       if (input >= t->lowin && input <= t->highin)
+               return t->state;
+       return 0;
+}
+
+
+void rex_dfa_dumpstate(rexdfa_t *dfa, unsigned long nstate)
 {
        long index;
        char buf[240];
        int bufsize = sizeof(buf) - 1;
        int n = 0;
-       rexdfs_t *s = &dfa->states[off];
+       rexdfs_t *s = rex_dfa_state(dfa, nstate);
        rexdft_t *t = NULL;
 
-       fprintf(stdout, "State %ld", off);
-
+       if (!s)
+               return;
+       fprintf(stdout, "State %ld", nstate);
        fprintf(stdout, " (");
        for (index = 0; index < s->nsubstates; index++) {
-               fprintf(stdout, "%ld", s->substates[index].uid);
-               if (s->substates[index].type == REX_STATETYPE_ACCEPT)
+               fprintf(stdout, "%ld", dfa->substates[s->substates + index].uid);
+               if (dfa->substates[s->substates + index].type == REX_STATETYPE_ACCEPT)
                        fprintf(stdout, "*");
-               fprintf(stdout, ",");
+               if (index + 1 < s->nsubstates)
+                       fprintf(stdout, ",");
        }
        fprintf(stdout, ")");
 
@@ -116,6 +228,13 @@ void rex_dfa_dumpstate(rexdfa_t *dfa, long off)
        fprintf(stdout, ": ");
        if (s->type == REX_STATETYPE_ACCEPT) {
                fprintf(stdout, " REX_STATETYPE_ACCEPT ");
+               fprintf(stdout, " (");
+               for (index = 0; index < s->naccsubstates; index++) {
+                       fprintf(stdout, "%ld*", dfa->accsubstates[s->accsubstates + index].uid);
+                       if (index + 1 < s->naccsubstates)
+                               fprintf(stdout, ",");
+               }
+               fprintf(stdout, ")");
        } else if (s->type == REX_STATETYPE_DEAD) {
                fprintf(stdout, " REX_STATETYPE_DEAD ");
        } else if (s->type == REX_STATETYPE_START) {
@@ -123,7 +242,7 @@ void rex_dfa_dumpstate(rexdfa_t *dfa, long off)
        }
        fprintf(stdout, "\n");
        for (index = 0; index < s->ntrans; index++) {
-               t = &s->trans[index];
+               t = &dfa->trans[s->trans + index];
                n = 0;
                if (t->lowin != t->highin) {
                        if (isprint(t->lowin) && !isspace(t->lowin) && isprint(t->highin) && !isspace(t->highin))
index 82f1576..8f6a94a 100644 (file)
@@ -1,3 +1,23 @@
+/*
+ *  Regular Pattern Analyzer (RPA)
+ *  Copyright (c) 2009-2010 Martin Stoilov
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Martin Stoilov <martin@rpasearch.com>
+ */
+
 #ifndef _REXDFA_H_
 #define _REXDFA_H_
 
@@ -9,14 +29,17 @@ extern "C" {
 #endif
 
 
+#define REX_DFA_DEADSTATE (0)
+#define REX_DFA_STARTSTATE (1)
+
 /*
- * Subset info definition
+ * Sub-state info definition
  */
-typedef struct rexsi_s {
+typedef struct rexdfss_s {
        unsigned int type;
        unsigned long uid;
        rexuserdata_t userdata;
-} rexsi_t;
+} rexdfss_t;
 
 
 /*
@@ -34,10 +57,12 @@ typedef struct rexdft_s {
  */
 typedef struct rexdfs_s {
        unsigned int type;
-       unsigned int ntrans;
-       rexdft_t *trans;
-       unsigned int nsubstates;
-       rexsi_t *substates;
+       unsigned long trans;
+       unsigned long ntrans;
+       unsigned long substates;
+       unsigned long nsubstates;
+       unsigned long accsubstates;
+       unsigned long naccsubstates;
 } rexdfs_t;
 
 
@@ -50,17 +75,21 @@ typedef struct rexdfa_s {
        unsigned long ntrans;
        rexdft_t *trans;
        unsigned long nsubstates;
-       rexsi_t *substates;
+       rexdfss_t *substates;
        unsigned long naccsubstates;
-       rexsi_t *accsubstates;
+       rexdfss_t *accsubstates;
 } rexdfa_t;
 
 
 rexdfa_t *rex_dfa_create(unsigned long nstates, unsigned long ntrans, unsigned long nsubsets, unsigned long naccsubsets);
 rexdfa_t *rex_dfa_create_from_db(rexdb_t *db);
 void rex_dfa_destroy(rexdfa_t *dfa);
-void rex_dfa_dumpstate(rexdfa_t *dfa, long off);
-
+void rex_dfa_dumpstate(rexdfa_t *dfa, unsigned long nstate);
+rexdfs_t *rex_dfa_state(rexdfa_t *dfa, unsigned long nstate);
+rexdft_t *rex_dfa_transition(rexdfa_t *dfa, unsigned long nstate, unsigned long ntransition);
+rexdfss_t *rex_dfa_substate(rexdfa_t *dfa, unsigned long nstate, unsigned long nsubstate);
+rexdfss_t *rex_dfa_accsubstate(rexdfa_t *dfa, unsigned long nstate, unsigned long naccsubstate);
+long rex_dfa_next(rexdfa_t *dfa, unsigned long nstate, rexchar_t input);
 
 #ifdef __cplusplus
 }
index b401d5a..6ecd49a 100644 (file)
@@ -25,4 +25,4 @@ CFLAGS += $(INCLUDE)
 LDFLAGS = $(MACH)
 
 
-include ../rgrep.mk
+include ../rexgrep.mk
index 4bdf0e5..3b2dad2 100644 (file)
@@ -62,7 +62,7 @@ void rex_grep_destroy(rexgrep_t *pGrep)
        if (!pGrep)
                return;
        rex_db_destroy(pGrep->nfa);
-       rex_db_destroy(pGrep->dfa);
+       rex_dfa_destroy(pGrep->dfa);
        rex_nfasimulator_destroy(pGrep->si);
        rex_dfasimulator_destroy(pGrep->dfasi);
        r_free(pGrep);
@@ -85,6 +85,27 @@ int rex_grep_load_pattern(rexgrep_t *pGrep, rbuffer_t *buf)
 }
 
 
+int rex_grep_dfamatch(rexgrep_t *pGrep, const char* input, const char *end)
+{
+       ruint32 wc = 0;
+       int inc = 0, ret = 0;
+       long nstate = REX_DFA_STARTSTATE;
+       const char *start = input;
+       rexdfa_t *dfa = pGrep->dfa;
+       rexdfs_t *s;
+
+       while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end)) > 0) {
+               if ((nstate = rex_dfa_next(dfa, nstate, wc)) <= 0)
+                       break;
+               input += inc;
+               s = rex_dfa_state(dfa, nstate);
+               if (s->type == REX_STATETYPE_ACCEPT)
+                       ret = input - start;
+       }
+       return ret;
+}
+
+
 int rex_grep_match(rexgrep_t *pGrep, const char* input, const char *end)
 {
        int inc;
@@ -92,7 +113,7 @@ int rex_grep_match(rexgrep_t *pGrep, const char* input, const char *end)
        rexdb_t *db;
 
        if (pGrep->usedfa)
-               return rex_dfasimulator_run(pGrep->dfasi, pGrep->dfa, 1, input, end - input);
+               return rex_grep_dfamatch(pGrep, input, end);
 
        if (pGrep->startuid < 0) {
                return -1;
index 7b3bec1..bfc3cd3 100644 (file)
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include "rlib/rbuffer.h"
 #include "rex/rexdb.h"
+#include "rex/rexdfa.h"
 #include "rex/rexfragment.h"
 #include "rex/rexnfasimulator.h"
 #include "rex/rexdfasimulator.h"
@@ -44,7 +45,7 @@ extern "C" {
 
 typedef struct rexgrep_s {
        rexdb_t *nfa;
-       rexdb_t *dfa;
+       rexdfa_t *dfa;
        rex_nfasimulator_t *si;
        rex_dfasimulator_t *dfasi;
        long startuid;
index 99800e1..2cd5280 100644 (file)
@@ -208,36 +208,28 @@ int main(int argc, const char *argv[])
        }
 
        if (pGrep->usedfa) {
-               pGrep->dfa = rex_db_createdfa(pGrep->nfa, pGrep->startuid);
+               rexdb_t *dfadb = rex_db_createdfa(pGrep->nfa, pGrep->startuid);
+               pGrep->dfa = rex_dfa_create_from_db(dfadb);
+               rex_db_destroy(dfadb);
        }
 
        for (i = 1; i < argc; i++) {
                if (strcmp(argv[i], "-D") == 0) {
                        int j;
-                       rexdb_t *db = pGrep->usedfa ? pGrep->dfa : pGrep->nfa;
-                       for (j = 0; j < r_array_length(db->states); j++) {
-                               rex_db_dumpstate(db, j);
-                       }
-                       goto end;
-               }
-       }
-
-
-       for (i = 1; i < argc; i++) {
-               if (strcmp(argv[i], "-F") == 0) {
-                       int j;
-                       rexdfa_t *dfa = NULL;
-                       if (pGrep->dfa == NULL)
-                               goto end;
-                       dfa = rex_dfa_create_from_db(pGrep->dfa);
-                       for (j = 0; j < dfa->nstates; j++) {
-                               rex_dfa_dumpstate(dfa, j);
+                       if (pGrep->dfa) {
+                               for (j = 0; j < pGrep->dfa->nstates; j++) {
+                                       rex_dfa_dumpstate(pGrep->dfa, j);
+                               }
+                       } else if (pGrep->nfa) {
+                               rexdb_t *db = pGrep->nfa;
+                               for (j = 0; j < r_array_length(db->states); j++) {
+                                       rex_db_dumpstate(db, j);
+                               }
                        }
                        goto end;
                }
        }
 
-
        /* scan files */
        for (i = 1; i < argc; i++) {
                if (argv[i][0] != '-') {