../../rex/doc/rex_main.txt \
../../rex/doc/rexdb.txt \
../../rex/doc/rexdfa.txt \
+ ../../rex/doc/rexregex.txt \
../../rex/doc/rexcc.txt \
../../rex/doc/rexgrep.txt \
- ../../rex/rexdb.h \
- ../../rex/rexdfa.h \
+ ../../rex/rexdb.h \
+ ../../rex/rexdfa.h \
+ ../../rex/rexregex.h \
INPUT_ENCODING = UTF-8
"$(OUTDIR)\rexcompiler.obj" \\r
"$(OUTDIR)\rexdb.obj" \\r
"$(OUTDIR)\rexdfa.obj" \\r
+ "$(OUTDIR)\rexregex.obj" \\r
"$(OUTDIR)\rexdfaconv.obj" \\r
"$(OUTDIR)\rexdfasimulator.obj" \\r
"$(OUTDIR)\rexfragment.obj" \\r
"$(OUTDIR)\rexdfa.obj" : $(SOURCE) $(OUTDIR)\r
$(CPP) $(CPP_FLAGS) $(SOURCE)\r
\r
+SOURCE="$(SRC_DIR)\rexregex.c"\r
+"$(OUTDIR)\rexregex.obj" : $(SOURCE) $(OUTDIR)\r
+ $(CPP) $(CPP_FLAGS) $(SOURCE)\r
+\r
SOURCE="$(SRC_DIR)\rexdfaconv.c"\r
"$(OUTDIR)\rexdfaconv.obj" : $(SOURCE) $(OUTDIR)\r
$(CPP) $(CPP_FLAGS) $(SOURCE)\r
SOURCE="$(SRC_DIR)\rextransition.c"\r
"$(OUTDIR)\rextransition.obj" : $(SOURCE) $(OUTDIR)\r
$(CPP) $(CPP_FLAGS) $(SOURCE)\r
-
\ No newline at end of file
+ \r
\section usage How to use.
To implement matching using REX, the user has to run the input through the DFA states until the automaton
-arrives at accepting state. For example, if the input is a string:
+arrives at an accepting state. For example, if the input is a NUL-terminated string:
@code
nstate = REX_DFA_STARTSTATE;
while (*str) {
}
@endcode
-REX doesn't provide API for matching or searching directly, it is up to the user to decide how to
-implement whatever functionality they need using the automaton.
+REX also provides a higher-level API for matching and searching, declared in rex/rexregex.h. Example:
+
+@code
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "rex/rexregex.h"
+
+/*
+ * To build:
+ * gcc -o rexregex rexregex.c -I/usr/include/rpatk -lrex -lrlib
+ */
+
+#define BUF_SIZE 0x10000
+
+int main(int argc, char *argv[])
+{
+	rexregex_t *regex;
+	char *buf, *end;
+	const char *where;
+	int line = 0;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage:\n cat <file> | %s <regexp>\n", argv[0]);
+		return 0;
+	}
+	regex = rex_regex_create_s(argv[1]);
+	buf = calloc(BUF_SIZE, 1);
+	if (!regex || !buf) {
+		/*
+		 * Do not hand a NULL buffer to fgets() or a NULL regex to the scanner.
+		 * NOTE(review): assumes rex_regex_create_s() reports failure with NULL - confirm.
+		 */
+		fprintf(stderr, "%s: initialization failed\n", argv[0]);
+		free(buf);
+		if (regex)
+			rex_regex_destroy(regex);
+		return 1;
+	}
+	while (fgets(buf, BUF_SIZE - 1, stdin)) {
+		++line;
+		end = buf + strlen(buf);
+		/*
+		 * Strip the trailing newline only if there is one, and move 'end' back
+		 * so the scanned range [buf, end) does not include the terminator.
+		 */
+		if (end > buf && *(end - 1) == '\n')
+			*--end = '\0';
+		if (rex_regex_scan(regex, REX_ENCODING_UTF8, buf, end, &where) > 0)
+			printf("%d: %s\n", line, buf);
+	}
+	free(buf);
+	rex_regex_destroy(regex);
+	return 0;
+}
+@endcode
The JavaScript tokenizer example @ref js-tokenizer.c is a simple demonstration how to use the REX library for lexical analysis of UTF8 encoded text.
--- /dev/null
+/** \page rexregex rexregex_t - Using RegEx for matching.
+
+- 1. Create a @ref rexregex_t object using @ref rex_regex_create or @ref rex_regex_create_s.
+- 2. Use the @ref rexregex_t object to match or search with @ref rex_regex_match or @ref rex_regex_scan.
+- 3. Destroy the @ref rexregex_t object with @ref rex_regex_destroy.
+
+*/
\ No newline at end of file
}
-#define REX_GREP_SHIFT(__shift__, __count__, __bytes__, __bitsperbyte__, __shiftstart__, __end__) \
+#define REX_GREP_SHIFT_UTF8(__shift__, __count__, __bytes__, __bitsperbyte__, __shiftstart__, __end__) \
do { \
int inc, i; \
unsigned int mask = (1 << __bitsperbyte__) - 1; \
} while (0)
+/*
+ * Shift __count__ input bytes into __shift__, one byte per character.
+ *
+ * For each of the __count__ steps the low __bitsperbyte__ bits of the byte at
+ * __shiftstart__ are shifted into __shift__ and __shiftstart__ is advanced by
+ * one; once __shiftstart__ has reached __end__, zero bits are shifted in and
+ * the pointer is left alone. Finally __shift__ is reduced with
+ * REX_DFA_HASHMASK(__bytes__, __bitsperbyte__).
+ *
+ * __shift__ and __shiftstart__ must be modifiable lvalues. The arguments are
+ * evaluated more than once, so they must be free of side effects.
+ */
+#define REX_GREP_SHIFT_BYTE(__shift__, __count__, __bytes__, __bitsperbyte__, __shiftstart__, __end__) \
+do { \
+	int i; \
+	unsigned int mask = (1 << (__bitsperbyte__)) - 1; \
+	ruint32 wc = 0; \
+	for (i = 0; i < (__count__); i++) { \
+		wc = 0; \
+		if ((__shiftstart__) < (__end__)) { \
+			wc = (unsigned char)*(__shiftstart__); \
+			(__shiftstart__) += 1; \
+		} \
+		(__shift__) <<= (__bitsperbyte__); \
+		(__shift__) |= (wc & mask); \
+	} \
+	(__shift__) = ((__shift__) & REX_DFA_HASHMASK((__bytes__), (__bitsperbyte__))); \
+} while (0)
+
+
static int rex_regex_match_utf8(rexregex_t *regex, const char *start, const char *end)
{
int inc;
}
+/*
+ * Match the regex against the beginning of [start, end), treating every byte
+ * as one character.
+ *
+ * The DFA is stepped one byte at a time until the input is exhausted or the
+ * next state is 0. Each time an accepting state is reached the consumed length
+ * is recorded, so the value returned is the length in bytes of the longest
+ * accepted prefix, or 0 if no prefix is accepted.
+ */
+static int rex_regex_match_byte(rexregex_t *regex, const char *start, const char *end)
+{
+	ruint32 wc = 0;
+	int ret = 0;
+	long nstate = REX_DFA_STARTSTATE;
+	const char *input = start;
+	rexdfa_t *dfa = regex->dfa;
+	rexdfs_t *s;
+
+	while (input < end) {
+		/*
+		 * Plain char may be signed: without the cast, bytes >= 0x80 would be
+		 * sign-extended to 0xFFFFFFxx instead of 0x80..0xFF.
+		 */
+		wc = (unsigned char)*input;
+		REX_DFA_NEXT(dfa, nstate, wc, &nstate);
+		if (nstate == 0)
+			break;
+		input += 1;
+		s = REX_DFA_STATE(dfa, nstate);
+		if (s->type == REX_STATETYPE_ACCEPT)
+			ret = (int)(input - start);
+	}
+	return ret;
+}
+
+
+/*
+ * Search [start, end) for a position where the regex matches, treating every
+ * byte as one character.
+ *
+ * 'shifter' holds bits taken from the bytes between 'input' and 'nextshift'.
+ * Positions for which REX_BITARRAY_GET(dfa->bits, shifter) is 0 are skipped
+ * without running the DFA; at the remaining positions rex_regex_match_byte()
+ * is tried.
+ *
+ * Returns the value of the first positive rex_regex_match_byte() result (the
+ * match length in bytes) and, if 'where' is not NULL, stores the match start
+ * in *where. Returns 0 if the input is exhausted without a match and -1 if the
+ * match routine returns a negative value.
+ */
+int rex_regex_scan_byte(rexregex_t *regex, const char *start, const char *end, const char **where)
+{
+ int ret = 0;
+ unsigned int shifter = 0;
+ const char *nextshift = start;
+ const char *input = start;
+ rexdfa_t *dfa = regex->dfa;
+
+ nextshift = start;
+ /* Prime the shifter with up to REX_REGEX_HASHBYTES bytes; the macro advances nextshift. */
+ REX_GREP_SHIFT_BYTE(shifter, REX_REGEX_HASHBYTES, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
+
+ while (input < end) {
+ /* Skip positions whose shifter value has no bit set, shifting in one more byte per step. */
+ while ((ret = REX_BITARRAY_GET(dfa->bits, shifter)) == 0 && nextshift < end) {
+ shifter <<= REX_REGEX_HASHBITS;
+ shifter |= (((unsigned char)*nextshift) & ((1 << REX_REGEX_HASHBITS) - 1));
+ shifter = (shifter & REX_DFA_HASHMASK(REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS));
+ nextshift += 1;
+ input += 1;
+ }
+ /* The bit array flagged this position: run the DFA from 'input'. */
+ if (ret)
+ ret = rex_regex_match_byte(regex, input, end);
+ if (ret == 0) {
+ /* No match here: move one byte forward and update the shifter accordingly. */
+ input += 1;
+ if (nextshift < end) {
+ shifter <<= REX_REGEX_HASHBITS;
+ shifter |= (((unsigned char)*nextshift) & ((1 << REX_REGEX_HASHBITS) - 1));
+ shifter = (shifter & REX_DFA_HASHMASK(REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS));
+ nextshift += 1;
+ } else {
+ /* nextshift has reached end, so the macro shifts in zero bits. */
+ REX_GREP_SHIFT_BYTE(shifter, 1, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
+ }
+ } else if (ret > 0) {
+ if (where)
+ *where = input;
+ return ret;
+ } else {
+ /*
+ * Error
+ */
+ return -1;
+ }
+ }
+ return 0;
+}
+
+
int rex_regex_scan_utf8(rexregex_t *regex, const char *start, const char *end, const char **where)
{
int ret = 0;
rexdfa_t *dfa = regex->dfa;
nextshift = start;
- REX_GREP_SHIFT(shifter, REX_REGEX_HASHBYTES, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
+ REX_GREP_SHIFT_UTF8(shifter, REX_REGEX_HASHBYTES, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
while (input < end) {
while ((ret = REX_BITARRAY_GET(dfa->bits, shifter)) == 0 && nextshift < end && ((unsigned char)*nextshift) < 0x80 && ((unsigned char)*input) < 0x80) {
shifter = (shifter & REX_DFA_HASHMASK(REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS));
nextshift += 1;
} else {
- REX_GREP_SHIFT(shifter, 1, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
+ REX_GREP_SHIFT_UTF8(shifter, 1, REX_REGEX_HASHBYTES, REX_REGEX_HASHBITS, nextshift, end);
}
} else if (ret > 0) {
if (where)
{
switch (encoding) {
case REX_ENCODING_BYTE:
+ return rex_regex_match_byte(regex, start, end);
case REX_ENCODING_UTF8:
default:
return rex_regex_match_utf8(regex, start, end);
{
switch (encoding) {
case REX_ENCODING_BYTE:
+ return rex_regex_scan_byte(regex, start, end, where);
case REX_ENCODING_UTF8:
default:
return rex_regex_scan_utf8(regex, start, end, where);
extern "C" {
#endif
+/**
+ * Define RegEx object
+ */
typedef struct rexregex_s rexregex_t;
-#define REX_ENCODING_UTF8 0
-#define REX_ENCODING_BYTE 1
+#define REX_ENCODING_UTF8 0 /**< Input is encoded in UTF8. */
+#define REX_ENCODING_BYTE 1 /**< The encoding is one byte per character */
+/**
+ * Create regular expression object.
+ * @param str Regular Expression
+ * @param size The size of str in bytes.
+ * @return Regular Expression object.
+ */
rexregex_t *rex_regex_create(const char *str, unsigned int size);
+
+/**
+ * Create regular expression object.
+ * @param str Regular Expression. The string must be NUL-terminated.
+ * @return Regular Expression object.
+ */
rexregex_t *rex_regex_create_s(const char *str);
+
+/**
+ * Match regular expression.
+ * @param regex regular expression object created with @ref rex_regex_create or @ref rex_regex_create_s
+ * @param encoding Input encoding. Supported encodings:
+ * - @ref REX_ENCODING_UTF8
+ * - @ref REX_ENCODING_BYTE
+ * @param start Input start pointer
+ * @param end Input end pointer
+ * @return Returns the size in bytes of the matched substring or 0 if no match, -1 in case of error.
+ */
int rex_regex_match(rexregex_t *regex, unsigned int encoding, const char *start, const char *end);
+
+/**
+ * Search for regular expression match within the input specified with start and end.
+ * @param regex regular expression object created with @ref rex_regex_create or @ref rex_regex_create_s
+ * @param encoding Input encoding. Supported encodings:
+ * - @ref REX_ENCODING_UTF8
+ * - @ref REX_ENCODING_BYTE
+ * @param start Input start pointer
+ * @param end Input end pointer
+ * @param where If not NULL and a match is found, it is set to the beginning of the matched substring.
+ * @return Returns the size in bytes of the matched substring or 0 if no match, -1 in case of error.
+ * If 'where' is not NULL, it is set to the beginning of the matched substring.
+ */
int rex_regex_scan(rexregex_t *regex, unsigned int encoding, const char *start, const char *end, const char **where);
+
+/**
+ * Destroy regex object created with @ref rex_regex_create or @ref rex_regex_create_s
+ * @param regex regular expression object to be destroyed.
+ */
void rex_regex_destroy(rexregex_t *regex);
int rex_cc_gettoken(rexcc_t *pCC)
{
struct tokeninfo_s *ti = NULL;
- struct tokeninfo_s *tk = tokens;
ruint32 wc = 0;
int inc, ret = 0;
long nstate = REX_DFA_STARTSTATE;
str = (rbuffer_t *)r_malloc(sizeof(rbuffer_t));
if (!str)
goto error;
- memset(str, 0, sizeof(*str));
+ r_memset(str, 0, sizeof(*str));
str->s = buffer;
str->size = st.st_size;
str->userdata = (void*)((unsigned long)fd);
- @subpage rex_main "Regular Expressions (REX library)"
- @subpage rexdb
- @subpage rexdfa
+ - @subpage rexregex
- @subpage rexcc
- @subpage rexgrep
#include <string.h>
#include "rex/rexregex.h"
+/*
+ * To build:
+ * gcc -o rexregex rexregex.c -I/usr/include/rpatk -lrex -lrlib
+ */
+
#define BUF_SIZE 0x10000
int main(int argc, char *argv[])
{
- rexregex_t *r;
- char *buf, *q;
+ rexregex_t *regex;
+ char *buf, *end;
const char *where;
- int l = 0;
- if (argc == 1) {
- fprintf(stderr, "Usage: cat in.file | %s <regexp>\n", argv[0]);
+ int line = 0;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage:\n cat <file> | %s <regexp>\n", argv[0]);
return 0;
}
-
- r = rex_regex_create_s(argv[1]);
+ regex = rex_regex_create_s(argv[1]);
buf = calloc(BUF_SIZE, 1);
+	if (!regex || !buf) {
+		/*
+		 * Do not hand a NULL buffer to fgets() or a NULL regex to the scanner.
+		 * NOTE(review): assumes rex_regex_create_s() reports failure with NULL - confirm.
+		 */
+		fprintf(stderr, "%s: initialization failed\n", argv[0]);
+		free(buf);
+		if (regex)
+			rex_regex_destroy(regex);
+		return 1;
+	}
while (fgets(buf, BUF_SIZE - 1, stdin)) {
- ++l;
- for (q = buf; *q; ++q);
- if (q > buf)
- *(q-1) = 0;
- if (rex_regex_scan(r, REX_ENCODING_UTF8, buf, q, &where) > 0)
- printf("%d:%s\n", l, buf);
+ ++line;
+		end = buf + strlen(buf);
+		/*
+		 * Strip the trailing newline only if there is one, and move 'end' back
+		 * so the scanned range [buf, end) does not include the terminator.
+		 */
+		if (end > buf && *(end - 1) == '\n')
+			*--end = '\0';
+ if (rex_regex_scan(regex, REX_ENCODING_BYTE, buf, end, &where) > 0)
+ printf("%d: %s\n", line, buf);
}
free(buf);
- rex_regex_destroy(r);
+ rex_regex_destroy(regex);
return 0;
}