RPA Toolkit
added rexregex and test program
authorMartin Stoilov <martin@rpasearch.com>
Fri, 2 Mar 2012 05:41:35 +0000 (21:41 -0800)
committerMartin Stoilov <martin@rpasearch.com>
Fri, 2 Mar 2012 05:41:35 +0000 (21:41 -0800)
rex/rexregex.c [new file with mode: 0644]
rex/rexregex.h [new file with mode: 0644]
tests/testrex/build/unix/i386/Makefile [new file with mode: 0644]
tests/testrex/build/unix/testrex.mk [new file with mode: 0644]
tests/testrex/build/unix/x86_64/Makefile [new file with mode: 0644]
tests/testrex/rexregex.c [new file with mode: 0644]

diff --git a/rex/rexregex.c b/rex/rexregex.c
new file mode 100644 (file)
index 0000000..727a12c
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ *  Regular Pattern Analyzer Toolkit (RPA/Tk)
+ *  Copyright (c) 2009-2012 Martin Stoilov
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Martin Stoilov <martin@rpasearch.com>
+ */
+
+
+#include "rlib/rmem.h"
+#include "rlib/rstring.h"
+#include "rlib/rutf.h"
+#include "rex/rexdb.h"
+#include "rex/rexregex.h"
+#include "rex/rexdfa.h"
+
+#define REX_REGEX_HASHBYTES 3
+#define REX_REGEX_HASHBITS 5
+
+struct rexregex_s {
+       rexdfa_t *dfa;
+};
+
+
+rexregex_t *rex_regex_create(const char *str, unsigned int size)
+{
+       long start;
+       rexdb_t *nfadb = NULL, *dfadb = NULL;
+       rexregex_t *regex = NULL;
+
+       nfadb = rex_db_create(REXDB_TYPE_NFA);
+       start = rex_db_addexpression(nfadb, -1, str, size, 0);
+       if (start < 0)
+               goto error;
+       dfadb = rex_db_createdfa(nfadb, start);
+       if (!dfadb)
+               goto error;
+       regex = (rexregex_t *)r_zmalloc(sizeof(*regex));
+       regex->dfa = rex_db_todfa(dfadb, 0);
+       if (!regex->dfa)
+               goto error;
+       rex_dfa_hash(regex->dfa, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS);
+       rex_db_destroy(dfadb);
+       rex_db_destroy(nfadb);
+       return regex;
+
+error:
+       rex_db_destroy(dfadb);
+       rex_db_destroy(nfadb);
+       rex_regex_destroy(regex);
+       return NULL;
+}
+
+
+void rex_regex_destroy(rexregex_t *regex)
+{
+       if (regex) {
+               rex_dfa_destroy(regex->dfa);
+               r_free(regex);
+       }
+}
+
+
+rexregex_t *rex_regex_create_s(const char *str)
+{
+       return rex_regex_create(str, r_strlen(str));
+}
+
+
+#define REX_GREP_SHIFT(__shift__, __count__, __bytes__, __bitsperbyte__, __shiftstart__, __end__) \
+do { \
+       int inc, i; \
+       unsigned int mask = (1 << __bitsperbyte__) - 1; \
+       ruint32 wc = 0; \
+       for (i = 0; i < __count__; i++) { \
+               wc = 0; \
+               if (__shiftstart__ < __end__) { \
+                       wc = *(__shiftstart__); \
+                       inc = 1; \
+                       if (wc >= 0x80) { \
+                               inc = r_utf8_mbtowc(&wc, (const unsigned char*)(__shiftstart__), (const unsigned char*)__end__); \
+                       } \
+                       __shiftstart__ += inc; \
+               } \
+               __shift__ <<= __bitsperbyte__; \
+               __shift__ |= (wc & mask); \
+       } \
+       __shift__ = (__shift__ & REX_DFA_HASHMASK(__bytes__, __bitsperbyte__)); \
+} while (0)
+
+
+static int rex_regex_match_utf8(rexregex_t *regex, const char *start, const char *end)
+{
+       int inc;
+       ruint32 wc = 0;
+       int ret = 0;
+       long nstate = REX_DFA_STARTSTATE;
+       const char *input = start;
+       rexdfa_t *dfa = regex->dfa;
+       rexdfs_t *s;
+
+       while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end)) > 0) {
+               REX_DFA_NEXT(dfa, nstate, wc, &nstate);
+               if (nstate == 0)
+                       break;
+               input += inc;
+               s = REX_DFA_STATE(dfa, nstate);
+               if (s->type == REX_STATETYPE_ACCEPT)
+                       ret = (int)(input - start);
+       }
+       return ret;
+}
+
+
+int rex_regex_scan_utf8(rexregex_t *regex, const char *start, const char *end, const char **where)
+{
+       int ret = 0;
+       unsigned int shifter = 0;
+       const char *nextshift = start;
+       const char *input = start;
+       rexdfa_t *dfa = regex->dfa;
+
+       nextshift = start;
+       REX_GREP_SHIFT(shifter, REX_REGEX_HASHBYTES, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
+
+       while (input < end) {
+               while ((ret = REX_BITARRAY_GET(dfa->bits, shifter)) == 0 && nextshift < end && ((unsigned char)*nextshift) < 0x80 && ((unsigned char)*input) < 0x80) {
+                       shifter <<= REX_REGEX_HASHBITS;
+                       shifter |= (((unsigned char)*nextshift) & ((1 << REX_REGEX_HASHBITS) - 1));
+                       shifter = (shifter & REX_DFA_HASHMASK(REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS));
+                       nextshift += 1;
+                       input += 1;
+               }
+               if (ret)
+                       ret = rex_regex_match_utf8(regex, input, end);
+               if (ret == 0) {
+                       ruint32 wc = *input;
+                       if (wc >= 0x80) {
+                               ret = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end);
+                               if (ret <= 0)
+                                       ret = 1;
+                               input += ret;
+                       } else {
+                               input += 1;
+                       }
+                       if (nextshift < end && ((unsigned char)*nextshift) < 0x80) {
+                               shifter <<= REX_REGEX_HASHBITS;
+                               shifter |= (((unsigned char)*nextshift) & ((1 << REX_REGEX_HASHBITS) - 1));
+                               shifter = (shifter & REX_DFA_HASHMASK(REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS));
+                               nextshift += 1;
+                       } else {
+                               REX_GREP_SHIFT(shifter, 1, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
+                       }
+               } else if (ret > 0) {
+                       if (where)
+                               *where = input;
+                       return ret;
+               } else {
+                       /*
+                        * Error
+                        */
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+
+int rex_regex_match(rexregex_t *regex, unsigned int encoding, const char *start, const char *end)
+{
+       switch (encoding) {
+       case REX_ENCODING_BYTE:
+       case REX_ENCODING_UTF8:
+       default:
+               return rex_regex_match_utf8(regex, start, end);
+       }
+       return -1;
+}
+
+
+int rex_regex_scan(rexregex_t *regex, unsigned int encoding, const char *start, const char *end, const char **where)
+{
+       switch (encoding) {
+       case REX_ENCODING_BYTE:
+       case REX_ENCODING_UTF8:
+       default:
+               return rex_regex_scan_utf8(regex, start, end, where);
+       }
+       return -1;
+}
diff --git a/rex/rexregex.h b/rex/rexregex.h
new file mode 100644 (file)
index 0000000..b5c2d04
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ *  Regular Pattern Analyzer Toolkit (RPA/Tk)
+ *  Copyright (c) 2009-2012 Martin Stoilov
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Martin Stoilov <martin@rpasearch.com>
+ */
+
+/**
+ * @file rex/rexregex.h
+ * @brief Definition of RegEx interface
+ *
+ *
+ * <h2>Synopsis</h2>
+ * This file defines the data structures and API for the RegEx implementation.
+ */
+
+#ifndef _REXREGEX_H_
+#define _REXREGEX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct rexregex_s rexregex_t;
+
+#define REX_ENCODING_UTF8 0
+#define REX_ENCODING_BYTE 1
+
+rexregex_t *rex_regex_create(const char *str, unsigned int size);
+rexregex_t *rex_regex_create_s(const char *str);
+int rex_regex_match(rexregex_t *regex, unsigned int encoding, const char *start, const char *end);
+int rex_regex_scan(rexregex_t *regex, unsigned int encoding, const char *start, const char *end, const char **where);
+void rex_regex_destroy(rexregex_t *regex);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tests/testrex/build/unix/i386/Makefile b/tests/testrex/build/unix/i386/Makefile
new file mode 100644 (file)
index 0000000..66e21bf
--- /dev/null
@@ -0,0 +1,29 @@
+ARCHDIR = $(shell basename $(shell pwd))
+OUTDIR = out
+OS = $(shell uname | tr "[:upper:]" "[:lower:]")
+ROOT_DIR = ../../../../..
+MACH = -m32
+
+include $(ROOT_DIR)/build/unix/config.mk
+
+ifeq ($(BLDCFG), release)
+CFLAGS = -O2
+else
+ifeq ($(BLDCFG), profile)
+CFLAGS = -O0 -pg
+else
+CFLAGS = -g -O0 -Wall 
+endif
+endif
+
+ifeq ($(CCBLD), yes)
+CFLAGS += -fprofile-arcs -ftest-coverage
+endif
+
+CFLAGS += $(MACH) $(INCLUDE)
+
+CFLAGS := $(CFLAGS)
+LDFLAGS = $(MACH)
+
+
+include ../testrpa2.mk
diff --git a/tests/testrex/build/unix/testrex.mk b/tests/testrex/build/unix/testrex.mk
new file mode 100644 (file)
index 0000000..7b52a4d
--- /dev/null
@@ -0,0 +1,39 @@
+RLIB_SRCDIR = $(ROOT_DIR)/rlib
+REX_SRCDIR = $(ROOT_DIR)/rex
+TESTS_SRCDIR = $(ROOT_DIR)/tests/testrex
+INCLUDE = -I$(ROOT_DIR) -I$(ROOT_DIR)/arch/unix/$(ARCHDIR)
+ifeq ($(OS), linux)
+LDFLAGS += --static
+endif
+
+LIBS = -L$(RLIB_SRCDIR)/build/unix/$(ARCHDIR)/out 
+LIBS += -L$(REX_SRCDIR)/build/unix/$(ARCHDIR)/out 
+LIBS += -lrex -lrlib -lm
+
+
+TESTS  += $(OUTDIR)/rexregex
+
+
+all : $(OUTDIR) $(TESTS)
+
+
+$(OUTDIR)/%: $(TESTS_SRCDIR)/%.c
+       + $(CC) $(CFLAGS) -o $(OUTDIR)/$* $(TESTS_SRCDIR)/$*.c  -lrpa $(LIBS) $(LDFLAGS) $(INCLUDE)
+
+
+$(OUTDIR)/%.o: $(TESTS_SRCDIR)/%.rpa
+       $(LD) -r -b binary -o $(OUTDIR)/$*.o $(TESTS_SRCDIR)/$*.rpa
+
+
+$(OUTDIR):
+       @mkdir $(OUTDIR)
+
+distclean: clean
+       -rm -f .depend
+       -rm -rf $(OUTDIR)
+
+clean:
+       -rm -f $(TESTS)
+       -rm -f *~
+       -rm -f $(ROOT_DIR)/*~
+
diff --git a/tests/testrex/build/unix/x86_64/Makefile b/tests/testrex/build/unix/x86_64/Makefile
new file mode 100644 (file)
index 0000000..bce2ff1
--- /dev/null
@@ -0,0 +1,29 @@
+ARCHDIR = $(shell basename $(shell pwd))
+OUTDIR = out
+OS = $(shell uname | tr "[:upper:]" "[:lower:]")
+ROOT_DIR = ../../../../..
+MACH = -m64
+
+include $(ROOT_DIR)/build/unix/config.mk
+
+ifeq ($(BLDCFG), release)
+CFLAGS = -O2
+else
+ifeq ($(BLDCFG), profile)
+CFLAGS = -O0 -pg
+else
+CFLAGS = -g -O0 -Wall 
+endif
+endif
+
+ifeq ($(CCBLD), yes)
+CFLAGS += -fprofile-arcs -ftest-coverage
+endif
+
+CFLAGS += $(MACH) $(INCLUDE)
+
+CFLAGS := $(CFLAGS)
+LDFLAGS = $(MACH)
+
+
+include ../testrex.mk
diff --git a/tests/testrex/rexregex.c b/tests/testrex/rexregex.c
new file mode 100644 (file)
index 0000000..2141bbb
--- /dev/null
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "rex/rexregex.h"
+
+#define BUF_SIZE 0x10000
+
+int main(int argc, char *argv[])
+{
+       rexregex_t *r;
+       char *buf, *q;
+       const char *where;
+       int l = 0;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: cat in.file | %s <regexp>\n", argv[0]);
+               return 0;
+       }
+
+       r = rex_regex_create_s(argv[1]);
+       buf = calloc(BUF_SIZE, 1);
+       while (fgets(buf, BUF_SIZE - 1, stdin)) {
+               ++l;
+               for (q = buf; *q; ++q);
+               if (q > buf)
+                       *(q-1) = 0;
+               if (rex_regex_scan(r, REX_ENCODING_UTF8, buf, q, &where) > 0)
+                       printf("%d:%s\n", l, buf);
+       }
+       free(buf);
+       rex_regex_destroy(r);
+       return 0;
+}