RPA Toolkit
Work on rexcc. Better support for rexdfa_t generation (without including NFA substates).
author Martin Stoilov <martin@rpasearch.com>
Fri, 10 Feb 2012 05:34:41 +0000 (21:34 -0800)
committer Martin Stoilov <martin@rpasearch.com>
Fri, 10 Feb 2012 05:34:41 +0000 (21:34 -0800)
rex/rexcompiler.c
rex/rexdb.c
rex/rexdb.h
rexcc/rexcc.c
rexcc/rexcc.h
rexcc/unix/main.c
rexgrep/rexgrep.h
rexgrep/unix/main.c

index 206830f..1358bd0 100644 (file)
@@ -66,6 +66,7 @@ static int rex_compiler_isspace(int c)
 static int rex_compiler_getchar(rexcompiler_t *co)
 {
        ruint32 wc = REX_TOKEN_EOF;
+       int inc = 0;
 
 again:
        if (co->ptr + 1 < co->end && *co->ptr == '\\' && *(co->ptr + 1) == '\n') {
@@ -76,9 +77,7 @@ again:
                co->ptr += 3;
                goto again;
        }
-
-       int inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
-
+       inc = r_utf8_mbtowc(&wc, (const unsigned char*)co->ptr, (const unsigned char*)co->end);
        if (inc <= 0)
                return REX_TOKEN_EOF;
        co->ptr += inc;
index 8a5536d..006cfc2 100644 (file)
@@ -337,7 +337,7 @@ const char *rex_db_version()
 }
 
 
-static void rex_db_filldfastate(rexdb_t *db, rexdfa_t *dfa, struct rexdfa_ctx *ctx, rexstate_t *state)
+static void rex_db_filldfastate(rexdb_t *db, rexdfa_t *dfa, struct rexdfa_ctx *ctx, rexstate_t *state, int withsubstates)
 {
        long i;
        rex_transition_t *t = NULL;
@@ -352,19 +352,9 @@ static void rex_db_filldfastate(rexdb_t *db, rexdfa_t *dfa, struct rexdfa_ctx *c
                dfa->trans[s->trans + i].state = t->dstuid;
        }
        ctx->ntrnas += s->ntrans;
-       s->substates = ctx->nsubstates;
-       s->nsubstates = rex_subset_length(state->subset);
-       for (i = 0; i < s->nsubstates; i++) {
-               unsigned long uid = rex_subset_index(state->subset, i);
-               rexsubstate_t *substate = rex_db_getsubstate(db, uid);
-               dfa->substates[s->substates + i].uid = uid;
-               dfa->substates[s->substates + i].type = substate->ss_type;
-               dfa->substates[s->substates + i].userdata = substate->ss_userdata;
-       }
-       ctx->nsubstates += s->nsubstates;
        s->accsubstates = ctx->naccsubstates;
        s->naccsubstates = 0L;
-       for (i = 0; i < s->nsubstates; i++) {
+       for (i = 0; i < rex_subset_length(state->subset); i++) {
                unsigned long uid = rex_subset_index(state->subset, i);
                rexsubstate_t *substate = rex_db_getsubstate(db, uid);
                if (substate->ss_type == REX_STATETYPE_ACCEPT) {
@@ -375,24 +365,40 @@ static void rex_db_filldfastate(rexdb_t *db, rexdfa_t *dfa, struct rexdfa_ctx *c
                }
        }
        ctx->naccsubstates += s->naccsubstates;
+       if (withsubstates) {
+               s->substates = ctx->nsubstates;
+               s->nsubstates = rex_subset_length(state->subset);
+               for (i = 0; i < rex_subset_length(state->subset); i++) {
+                       unsigned long uid = rex_subset_index(state->subset, i);
+                       rexsubstate_t *substate = rex_db_getsubstate(db, uid);
+                       dfa->substates[s->substates + i].uid = uid;
+                       dfa->substates[s->substates + i].type = substate->ss_type;
+                       dfa->substates[s->substates + i].userdata = substate->ss_userdata;
+               }
+               ctx->nsubstates += s->nsubstates;
+       } else {
+               s->substates = 0;
+               s->nsubstates = 0;
+       }
+
 }
 
 
-rexdfa_t *rex_db_todfa(rexdb_t *db)
+rexdfa_t *rex_db_todfa(rexdb_t *db, int withsubstates)
 {
        long i;
        rexdfa_t *dfa;
        struct rexdfa_ctx ctx;
        unsigned long nstates = rex_db_numstates(db);
        unsigned long ntrans = rex_db_numtransitions(db);
-       unsigned long nsubstates = rex_db_numsubstates(db);
        unsigned long naccsubstates = rex_db_numaccsubstates(db);
+       unsigned long nsubstates = withsubstates ? rex_db_numsubstates(db) : 0UL;
        dfa = rex_dfa_create(nstates, ntrans, naccsubstates, nsubstates);
        r_memset(&ctx, 0, sizeof(ctx));
 
        for (i = 0; i < r_array_length(db->states); i++) {
                rexstate_t *state = rex_db_getstate(db, i);
-               rex_db_filldfastate(db, dfa, &ctx, state);
+               rex_db_filldfastate(db, dfa, &ctx, state, withsubstates);
        }
        R_ASSERT(ctx.nstates == nstates);
        R_ASSERT(ctx.ntrnas == ntrans);
index 58c6882..b8bca7e 100644 (file)
@@ -69,7 +69,7 @@ long rex_db_numstates(rexdb_t *rexdb);
 long rex_db_numsubstates(rexdb_t *rexdb);
 long rex_db_numaccsubstates(rexdb_t *rexdb);
 const char *rex_db_version();
-rexdfa_t *rex_db_todfa(rexdb_t *db);
+rexdfa_t *rex_db_todfa(rexdb_t *db, int withsubstates);
 
 
 /*
index 4885f8a..5141680 100644 (file)
@@ -210,15 +210,16 @@ static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
 {
        rexdfa_t *dfa = pCC->dfa;
 
-       rex_cc_fprintf(out, 0, "rexdfa_t ccdfa = {\n");
+       rex_cc_fprintf(out, 0, "static rexdfa_t ccdfa = {\n");
        rex_cc_fprintf(out, 1, "%lu,\n", dfa->nstates);
        rex_cc_fprintf(out, 1, "%s,\n", "states");
        rex_cc_fprintf(out, 1, "%lu,\n", dfa->ntrans);
        rex_cc_fprintf(out, 1, "%s,\n", "transitions");
+       rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
+       rex_cc_fprintf(out, 1, "%s,\n", "accsubstates");
        rex_cc_fprintf(out, 1, "%lu,\n", dfa->nsubstates);
        rex_cc_fprintf(out, 1, "%s,\n", "substates");
-       rex_cc_fprintf(out, 1, "%lu,\n", dfa->naccsubstates);
-       rex_cc_fprintf(out, 1, "%s\n", "accsubstates");
+       rex_cc_fprintf(out, 1, "{0, },\n");
        rex_cc_fprintf(out, 0, "};\n");
 
        return 0;
@@ -226,21 +227,21 @@ static int rex_cc_output_dfa(rexcc_t *pCC, FILE *out)
 
 
 
-int rex_cc_output(rexcc_t *pCC, FILE *out)
+int rex_cc_output(rexcc_t *pCC, FILE *outc, FILE *outh)
 {
 
-       rex_cc_fprintf(out, 0, "#include \"rexdfa.h\"\n\n");
-
-       rex_cc_output_accsubstates(pCC, out);
-       rex_cc_fprintf(out, 0, "\n\n");
-       rex_cc_output_substates(pCC, out);
-       rex_cc_fprintf(out, 0, "\n\n");
-       rex_cc_output_transitions(pCC, out);
-       rex_cc_fprintf(out, 0, "\n\n");
-       rex_cc_output_states(pCC, out);
-       rex_cc_fprintf(out, 0, "\n\n");
-       rex_cc_output_dfa(pCC, out);
-
+       if (outc) {
+               rex_cc_fprintf(outc, 0, "#include \"rexdfa.h\"\n\n");
+               rex_cc_output_accsubstates(pCC, outc);
+               rex_cc_fprintf(outc, 0, "\n\n");
+               rex_cc_output_substates(pCC, outc);
+               rex_cc_fprintf(outc, 0, "\n\n");
+               rex_cc_output_transitions(pCC, outc);
+               rex_cc_fprintf(outc, 0, "\n\n");
+               rex_cc_output_states(pCC, outc);
+               rex_cc_fprintf(outc, 0, "\n\n");
+               rex_cc_output_dfa(pCC, outc);
+       }
 
        return 0;
 }
index 7990e45..1c71f92 100644 (file)
@@ -45,7 +45,7 @@ rexcc_t *rex_cc_create();
 void rex_cc_destroy(rexcc_t *pCC);
 int rex_cc_load_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata);
 int rex_cc_load_string_pattern(rexcc_t *pCC, rbuffer_t *buf, rexuserdata_t userdata);
-int rex_cc_output(rexcc_t *pCC, FILE *out);
+int rex_cc_output(rexcc_t *pCC, FILE *outc,  FILE *outh);
 
 #ifdef __cplusplus
 }
index 36d27f7..74b2855 100644 (file)
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 #include <wchar.h>
 #include <time.h>
+#include <errno.h>
 #include "rlib/rmem.h"
 #include "rlib/rarray.h"
 #include "rex/rexdfaconv.h"
@@ -89,12 +90,15 @@ int usage(int argc, const char *argv[])
 
                fprintf(stderr, "Usage: \n %s [OPTIONS] <filename>\n", argv[0]);
                fprintf(stderr, " OPTIONS:\n");
-               fprintf(stderr, "\t-e patterns              Regular Expression.\n");
-               fprintf(stderr, "\t-f patternfile           Read Regular Expressions from a file.\n");
+               fprintf(stderr, "\t-e <expression>          Regular Expression.\n");
+               fprintf(stderr, "\t-f <patternfile>         Read Regular Expressions from a file.\n");
+               fprintf(stderr, "\t-c <cfile>               Output .c file.\n");
+               fprintf(stderr, "\t-h <hfile>               Output .h file.\n");
                fprintf(stderr, "\t-D                       Dump states.\n");
-               fprintf(stderr, "\t-t                       Display time elapsed.\n");
+               fprintf(stderr, "\t-S                       Include substates.\n");
+               fprintf(stderr, "\t-t                       Display statistics.\n");
                fprintf(stderr, "\t-v                       Display version information.\n");
-               fprintf(stderr, "\t-h, --help               Display this help.\n");
+               fprintf(stderr, "\t--help                   Display this help.\n");
                
                return 0;
 }
@@ -150,8 +154,11 @@ int main(int argc, const char *argv[])
        int i, ret = 0;
        rexcc_t *pCC;
        rarray_t *buffers;
+       int withsubstates = 0;
        FILE *devnull = NULL;
        rexdb_t *tempdb = NULL;
+       FILE *cfile = NULL;
+       FILE *hfile = NULL;
 
        buffers = r_array_create(sizeof(rbuffer_t *));
        pCC = rex_cc_create();
@@ -167,7 +174,7 @@ int main(int argc, const char *argv[])
        }
 
        for (i = 1; i < argc; i++) {
-               if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "/?") == 0 || strcmp(argv[i], "-h") == 0) {
+               if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0) {
                        usage(argc, argv);
                        goto end;
                }
@@ -181,6 +188,33 @@ int main(int argc, const char *argv[])
        }
 
        for (i = 1; i < argc; i++) {
+               if (strcmp(argv[i], "-c") == 0) {
+                       if (++i < argc) {
+                               cfile = fopen(argv[i], "wb");
+                               if (!cfile) {
+                                       fprintf(stderr, "Failed to create file: %s, %s\n", argv[i], strerror(errno));
+                                       goto error;
+                               }
+
+                       }
+               }
+       }
+
+       for (i = 1; i < argc; i++) {
+               if (strcmp(argv[i], "-h") == 0) {
+                       if (++i < argc) {
+                               hfile = fopen(argv[i], "wb");
+                               if (!hfile) {
+                                       fprintf(stderr, "Failed to create file: %s, %s\n", argv[i], strerror(errno));
+                                       goto error;
+                               }
+
+                       }
+               }
+       }
+
+
+       for (i = 1; i < argc; i++) {
                if (strcmp(argv[i], "-f") == 0) {
                        if (++i < argc) {
 #if 0
@@ -209,10 +243,16 @@ int main(int argc, const char *argv[])
                }
        }
 
+       for (i = 1; i < argc; i++) {
+               if (strcmp(argv[i], "-S") == 0) {
+                       withsubstates = 1;
+               }
+       }
+
        if (pCC->startuid < 0)
                goto error;
        tempdb = rex_db_createdfa(pCC->nfa, pCC->startuid);
-       pCC->dfa = rex_db_todfa(tempdb);
+       pCC->dfa = rex_db_todfa(tempdb, withsubstates);
        rex_db_destroy(tempdb);
 
        for (i = 1; i < argc; i++) {
@@ -232,7 +272,7 @@ int main(int argc, const char *argv[])
                }
        }
 
-       rex_cc_output(pCC, stdout);
+       rex_cc_output(pCC, cfile, hfile);
 
 end:
        rex_cc_destroy(pCC);
@@ -242,11 +282,19 @@ end:
 
        if (devnull)
                fclose(devnull);
+       if (cfile)
+               fclose(cfile);
+       if (hfile)
+               fclose(hfile);
        return ret;
 
 error:
        if (devnull)
                fclose(devnull);
+       if (cfile)
+               fclose(cfile);
+       if (hfile)
+               fclose(hfile);
        rex_cc_destroy(pCC);
        return 2;
 }
index bfc3cd3..4b39ae7 100644 (file)
@@ -57,6 +57,7 @@ typedef struct rexgrep_s {
        unsigned int showtime;
        unsigned int showfilename;
        unsigned int usedfa;
+       unsigned int withsubstates;
        unsigned int forceEncoding;
        int ret;
        void *filename;
index 247ca88..86739bf 100644 (file)
@@ -53,7 +53,7 @@ int usage(int argc, const char *argv[])
                fprintf(stderr, "\t-l                       Line mode.\n");
                fprintf(stderr, "\t-N                       Use NFA.\n");
                fprintf(stderr, "\t-D                       Dump states.\n");
-               fprintf(stderr, "\t-F                       Dump rexdfa_t states.\n");
+               fprintf(stderr, "\t-S                       Include DFA substates.\n");
                fprintf(stderr, "\t-q                       Quiet mode.\n");
                fprintf(stderr, "\t-t                       Display time elapsed.\n");
                fprintf(stderr, "\t-s string                Scan string.\n");
@@ -139,6 +139,12 @@ int main(int argc, const char *argv[])
        }
 
        for (i = 1; i < argc; i++) {
+               if (strcmp(argv[i], "-S") == 0) {
+                       pGrep->withsubstates = 1;
+               }
+       }
+
+       for (i = 1; i < argc; i++) {
                if (strcmp(argv[i], "-H") == 0 || strcmp(argv[i], "--with-filename") == 0) {
                        pGrep->showfilename = 1;
                }
@@ -242,7 +248,7 @@ int main(int argc, const char *argv[])
 
        if (!pGrep->dfa && pGrep->usedfa) {
                rexdb_t *dfadb = rex_db_createdfa(pGrep->nfa, pGrep->startuid);
-               pGrep->dfa = rex_db_todfa(dfadb);
+               pGrep->dfa = rex_db_todfa(dfadb, pGrep->withsubstates);
                rex_db_destroy(dfadb);
        }
 
@@ -265,7 +271,7 @@ int main(int argc, const char *argv[])
 
        if (pGrep->dfa && binop == REXGREP_BINOP_WRITE) {
                rexdfa_t dfa = *pGrep->dfa;
-               dfa.nsubstates = 0;
+//             dfa.nsubstates = 0;
                dfa.substates = NULL;
                dfa.states = NULL;
                dfa.trans = NULL;
@@ -279,9 +285,11 @@ int main(int argc, const char *argv[])
                dfa.states = pGrep->dfa->states;
                dfa.trans = pGrep->dfa->trans;
                dfa.accsubstates = pGrep->dfa->accsubstates;
+               dfa.substates = pGrep->dfa->substates;
                fwrite(dfa.states, sizeof(*dfa.states), dfa.nstates, pfile);
                fwrite(dfa.trans, sizeof(*dfa.trans), dfa.ntrans, pfile);
                fwrite(dfa.accsubstates, sizeof(*dfa.accsubstates), dfa.naccsubstates, pfile);
+               fwrite(dfa.substates, sizeof(*dfa.substates), dfa.nsubstates, pfile);
                fclose(pfile);
                goto end;
        }
@@ -329,7 +337,7 @@ end:
                unsigned long sizetrans = dfa->ntrans * sizeof(rexdft_t);
                unsigned long sizeaccsubs = dfa->naccsubstates * sizeof(rexdfss_t);
                unsigned long sizesubs = dfa->nsubstates * sizeof(rexdfss_t);
-               unsigned long sizetotal = sizestates + sizetrans + sizeaccsubs + sizesubs + sizeof(rexdfa_t);
+               unsigned long sizetotal = sizestates + sizetrans + sizeaccsubs + sizesubs;
                fprintf(stdout, "\n\n");
                fprintf(stdout, "\tDFA Memory: %ld KB, States: %ld KB (%.2f), Transitions: %ld KB (%.2f), Accecpting Substates: %ld KB(%.2f), Substates: %ld KB (%.2f)\n",
                                sizetotal/1024, sizestates/1024, (100.0*sizestates/sizetotal), sizetrans/1024, (100.0*sizetrans/sizetotal),