2 * Regular Pattern Analyzer Toolkit (RPA/Tk)
3 * Copyright (c) 2009-2012 Martin Stoilov
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 * Martin Stoilov <martin@rpasearch.com>
28 * Temporarily here: the encoding definitions still need to be fixed.
30 #include "rpa/rpastat.h"
32 #include "rlib/rutf.h"
33 #include "rlib/rmem.h"
34 #include "rex/rextransition.h"
35 #include "rex/rexdfasimulator.h"
37 #include "rexgrepdep.h"
/*
 * NOTE(review): MAX_STACK is not referenced by any line visible in this
 * excerpt; its unit and intended consumer need to be confirmed against
 * the rest of the project before it is changed or removed.
 */
39 #define MAX_STACK 256000
/*
 * Allocate and initialise a grep context.
 *
 * The structure is obtained from r_malloc(), zero-filled with r_memset(),
 * and then populated with a freshly created NFA database
 * (REXDB_TYPE_NFA), an NFA simulator and a DFA simulator. startuid is
 * reset to 0.
 *
 * NOTE(review): the allocation-failure check and the return statement are
 * not visible in this excerpt. No visible line assigns pGrep->dfa here, so
 * after the memset it holds all-zero bytes unless an unseen line sets it.
 */
43 rexgrep_t *rex_grep_create()
47 pGrep = (rexgrep_t *)r_malloc(sizeof(*pGrep));
/* Clear every field before the individual members are set up. */
50 r_memset(pGrep, 0, sizeof(*pGrep));
51 pGrep->nfa = rex_db_create(REXDB_TYPE_NFA);
52 pGrep->si = rex_nfasimulator_create();
53 pGrep->dfasi = rex_dfasimulator_create();
55 pGrep->startuid = 0UL;
/*
 * Release the resources held by a grep context: the NFA database, the
 * DFA, the NFA simulator and the DFA simulator, in that order.
 *
 * NOTE(review): a NULL guard for pGrep and the release of the pGrep
 * structure itself are not visible in this excerpt -- confirm both exist.
 * pGrep->dfa is passed to rex_dfa_destroy() unconditionally in the visible
 * code; confirm that function tolerates a NULL pointer.
 */
60 void rex_grep_destroy(rexgrep_t *pGrep)
64 rex_db_destroy(pGrep->nfa);
65 rex_dfa_destroy(pGrep->dfa);
66 rex_nfasimulator_destroy(pGrep->si);
67 rex_dfasimulator_destroy(pGrep->dfasi);
/*
 * Load a pattern supplied as a string buffer.
 *
 * This forwards both arguments unchanged to rex_grep_load_pattern() and
 * returns that function's result.
 */
72 int rex_grep_load_string_pattern(rexgrep_t *pGrep, rbuffer_t *buf)
74 return rex_grep_load_pattern(pGrep, buf);
/*
 * Add the expression held in buf (buf->s, buf->size bytes) to the NFA
 * database of pGrep.
 *
 * The current startuid is passed to rex_db_addexpression() and the value
 * it returns is stored back into startuid. A negative result is treated
 * specially by the branch below; the body of that branch and the return
 * statements are not visible in this excerpt.
 *
 * NOTE(review): rex_grep_create() initialises startuid with the unsigned
 * literal 0UL. If the field is declared with an unsigned type, the
 * `< 0` test can never be true -- confirm the declared type in the header.
 */
78 int rex_grep_load_pattern(rexgrep_t *pGrep, rbuffer_t *buf)
80 pGrep->startuid = rex_db_addexpression(pGrep->nfa, pGrep->startuid, buf->s, buf->size, 0);
81 if (pGrep->startuid < 0) {
/*
 * Try to match the loaded pattern against the text in [input, end),
 * starting at input.
 *
 * The visible code contains two sections: one that walks the DFA stored
 * in pGrep->dfa, and one that drives the NFA simulator pGrep->si.
 *
 * NOTE(review): the condition that selects between the two sections, the
 * local declarations (inc, wc, s, db) and most return statements are not
 * visible in this excerpt. The only visible return yields the inputsize
 * of the last accept recorded by the NFA simulator.
 */
88 int rex_grep_match(rexgrep_t *pGrep, const char* input, const char *end)
/* DFA section: begin at the DFA start state and remember where input began. */
97 long nstate = REX_DFA_STARTSTATE;
98 const char *start = input;
99 rexdfa_t *dfa = pGrep->dfa;
/*
 * Decode one UTF-8 sequence per iteration; the loop ends as soon as
 * r_utf8_mbtowc() returns zero or a negative value.
 */
102 while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end)) > 0) {
/* Take the transition for wc; the macro receives &nstate as its output slot. */
103 REX_DFA_NEXT(dfa, nstate, wc, &nstate);
/* Fetch the record of the state just reached and test for acceptance. */
107 s = REX_DFA_STATE(dfa, nstate);
108 if (s->type == REX_STATETYPE_ACCEPT)
/* NFA section: the body of this startuid guard is not visible here. */
114 if (pGrep->startuid < 0) {
119 rex_nfasimulator_start(pGrep->si, db, pGrep->startuid);
120 while ((inc = r_utf8_mbtowc(&wc, (const unsigned char*)input, (const unsigned char*)end)) > 0) {
/*
 * A zero result from rex_nfasimulator_next() triggers the statement on the
 * following (unseen) line -- presumably leaving the loop; TODO confirm.
 */
121 if (rex_nfasimulator_next(pGrep->si, db, wc, inc) == 0)
/* If any accept was recorded, report the input size of the last one. */
125 if (r_array_length(pGrep->si->accepts) > 0) {
126 rex_accept_t *acc = (rex_accept_t *)r_array_lastslot(pGrep->si->accepts);
127 return acc->inputsize;
/*
 * Scan [start, end) for matches and print each one to stdout.
 *
 * At every position still inside the range, rex_grep_match() is called.
 * For a positive result, the function writes an optional "<filename>:"
 * prefix (when pGrep->showfilename is set), then ret bytes starting at
 * start, then a newline.
 *
 * NOTE(review): the handling of the other result cases, the statements
 * that advance start, and the return value are not visible in this
 * excerpt.
 */
133 int rex_grep_scan(rexgrep_t *pGrep, const char* start, const char* end)
137 while (start < end) {
138 ret = rex_grep_match(pGrep, start, end);
144 } else if (ret > 0) {
145 if (pGrep->showfilename) {
146 fprintf(stdout, "%s:", (const char*)pGrep->filename);
/* Emit exactly the matched bytes, followed by a line break. */
148 fwrite(start, 1, ret, stdout);
149 fprintf(stdout, "\n");
/*
 * Decode one UTF-8 sequence at start; a zero or negative result takes the
 * branch on the following (unseen) line.
 */
153 if ((ret = r_utf8_mbtowc(&wc, (const unsigned char*)start, (const unsigned char*)end)) <= 0)
/*
 * File-local scanning helper: repeatedly calls rex_grep_match() at start
 * while start is inside [start, end).
 *
 * Unlike rex_grep_scan(), no output call is visible in this function.
 *
 * NOTE(review): what happens for a positive match result, how start is
 * advanced, and the return value are not visible in this excerpt.
 */
162 static int rex_grep_scan_do(rexgrep_t *pGrep, const char* start, const char* end)
166 while (start < end) {
167 ret = rex_grep_match(pGrep, start, end);
173 } else if (ret > 0) {
/*
 * Decode one UTF-8 sequence at start; a zero or negative result takes the
 * branch on the following (unseen) line.
 */
177 if ((ret = r_utf8_mbtowc(&wc, (const unsigned char*)start, (const unsigned char*)end)) <= 0)
/*
 * Process [start, end) one line at a time.
 *
 * The buffer is walked byte by byte. A line ends at a '\n' byte or at the
 * last byte of the buffer. Each line, including its terminating byte, is
 * handed to rex_grep_scan_do(). The visible output code writes an optional
 * "<filename>:" prefix (when pGrep->showfilename is set) followed by the
 * whole line.
 *
 * NOTE(review): the test on ret that guards the output, the update of
 * start for the next line, the condition guarding the final "\n" output,
 * and the return value are not visible in this excerpt.
 */
186 int rex_grep_scan_lines(rexgrep_t *pGrep, const char* start, const char* end)
191 for (eol = start; eol < end; eol++) {
/* End of line: a newline byte, or the final byte of the buffer. */
192 if (*eol == '\n' || (eol + 1) == end) {
193 ret = rex_grep_scan_do(pGrep, start, eol + 1);
195 if (pGrep->showfilename) {
196 fprintf(stdout, "%s:", (const char*)pGrep->filename);
/* Write the line from start up to and including the byte at eol. */
198 fwrite(start, 1, eol + 1 - start, stdout);
203 rex_grep_output_utf8_string(pGrep, "\n");
/*
 * Run the scan mode selected by pGrep->greptype over the whole buffer
 * (buf->s, buf->size bytes).
 *
 * REX_GREPTYPE_SCANLINES uses rex_grep_scan_lines(). The
 * REX_GREPTYPE_MATCH and REX_GREPTYPE_SCAN labels both lead to
 * rex_grep_scan().
 *
 * NOTE(review): the break statements and any default label are not
 * visible in this excerpt.
 */
208 void rex_grep_scan_buffer(rexgrep_t *pGrep, rbuffer_t *buf)
210 switch (pGrep->greptype) {
211 case REX_GREPTYPE_SCANLINES:
212 rex_grep_scan_lines(pGrep, buf->s, buf->s + buf->size);
214 case REX_GREPTYPE_MATCH:
215 case REX_GREPTYPE_SCAN:
217 rex_grep_scan(pGrep, buf->s, buf->s + buf->size);
/*
 * Decode size bytes starting at s and pass every decoded code point to
 * rex_grep_output_char().
 *
 * When encoding is RPA_ENCODING_UTF16LE or RPA_ENCODING_ICASE_UTF16LE the
 * bytes are decoded with r_utf16_mbtowc(). The second loop decodes with
 * r_utf8_mbtowc() -- presumably the else branch; TODO confirm.
 *
 * NOTE(review): both loops continue while the decoder result is non-zero,
 * so a negative result does not end them (the loops in rex_grep_match()
 * test `> 0` instead). The statement that applies ret to input is not
 * visible in this excerpt -- confirm a negative ret cannot move input
 * backwards. pGrep is not used by any visible line.
 */
223 void rex_grep_output(rexgrep_t *pGrep, const char *s, unsigned long size, unsigned int encoding)
225 const unsigned char *input = (const unsigned char*)s;
226 const unsigned char *end = input + size;
230 if (encoding == RPA_ENCODING_UTF16LE || encoding == RPA_ENCODING_ICASE_UTF16LE) {
231 while ((ret = (int)r_utf16_mbtowc(&wc, input, end)) != 0) {
232 rex_grep_output_char(wc);
236 while ((ret = (int)r_utf8_mbtowc(&wc, input, end)) != 0) {
237 rex_grep_output_char(wc);
/*
 * Output a string as UTF-8.
 *
 * The length is taken with r_strlen(s) and the text is forwarded to
 * rex_grep_output() with RPA_ENCODING_UTF8.
 */
244 void rex_grep_output_utf8_string(rexgrep_t *pGrep, const char *s)
246 rex_grep_output(pGrep, s, r_strlen(s), RPA_ENCODING_UTF8);
/*
 * Output a string of 16-bit units as UTF-16LE.
 *
 * size is accumulated in bytes, sizeof(unsigned short) at a time, and the
 * text is then forwarded to rex_grep_output() with RPA_ENCODING_UTF16LE.
 *
 * NOTE(review): the loop that walks pstr is not visible in this excerpt;
 * presumably it stops at a zero unit -- TODO confirm.
 */
250 void rex_grep_output_utf16_string(rexgrep_t *pGrep, const unsigned short *s)
252 unsigned long size = 0;
253 const unsigned short *pstr = s;
/* Count the length in bytes, not in 16-bit units. */
256 size += sizeof(unsigned short);
259 rex_grep_output(pGrep, (const char*)s, size, RPA_ENCODING_UTF16LE);