2 * Regular Pattern Analyzer (RPA)
3 * Copyright (c) 2009-2010 Martin Stoilov
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 * Martin Stoilov <martin@rpasearch.com>
28 #include "rpagreputf.h"
29 #include "rpagrepdep.h"
/* NOTE(review): judging by the name this is a stack-size limit; it is not
 * referenced in the visible part of this file -- confirm where it is used. */
31 #define MAX_STACK 256000
/*
 * rpa_buffer_init: takes a caller-supplied buffer object `str`, a character
 * array `s` and its `size`, and returns a rpa_buffer_t pointer.
 * NOTE(review): the body is not visible here; presumably it attaches `s` to
 * `str` without copying -- confirm, including who owns `s` afterwards.
 */
33 rpa_buffer_t * rpa_buffer_init(rpa_buffer_t *str, char *s, unsigned int size)
/*
 * rpa_buffer_free: release routine for a rpa_buffer_t. rpa_buffer_alloc()
 * installs this function as the buffer's `destroy` callback.
 * NOTE(review): the body is not visible here; presumably it frees both the
 * character storage and the buffer object -- confirm.
 */
41 void rpa_buffer_free(rpa_buffer_t *str)
/*
 * rpa_buffer_alloc: heap-allocate a rpa_buffer_t together with character
 * storage for `size` bytes plus one extra byte.
 * Both the buffer object and its storage are zero-filled, and
 * rpa_buffer_free is registered as the `destroy` callback.
 * NOTE(review): the failure branches and the return statement are not
 * visible here; presumably NULL is returned when either malloc fails.
 */
50 rpa_buffer_t * rpa_buffer_alloc(unsigned int size)
54 str = (rpa_buffer_t *)malloc(sizeof(rpa_buffer_t));
/* Clear every field so that unset members start out as zero. */
57 memset(str, 0, sizeof(*str));
/* One byte more than requested (presumably room for a terminating NUL). */
58 if (!(str->s = (char *)malloc((size + 1) * sizeof(char)))) {
62 memset(str->s, 0, size + 1);
/* rpa_buffer_destroy() dispatches through this pointer. */
64 str->destroy = rpa_buffer_free;
/*
 * rpa_buffer_realloc: resize the character storage of `str` to `size` bytes.
 * Returns an int; rpa_buffer_loadfile() treats a negative value as failure.
 * NOTE(review): exactly `size` bytes are requested here, whereas
 * rpa_buffer_alloc() allocates `size + 1` -- confirm that callers never
 * write a terminator at index `size` after a realloc.
 */
69 int rpa_buffer_realloc(rpa_buffer_t *str, unsigned int size)
/* The result goes into a temporary, so str->s is not overwritten by this
 * statement itself if realloc fails. */
73 s = (char *)realloc(str->s, size);
/*
 * rpa_buffer_destroy: dispose of a buffer through its `destroy` callback.
 * A NULL buffer or a buffer without a callback is skipped by the guard below.
 * NOTE(review): the guarded statement is not visible here; presumably it is
 * the call str->destroy(str).
 */
82 void rpa_buffer_destroy(rpa_buffer_t *str)
84 if (str && str->destroy)
/*
 * rpa_grep_create: allocate a zero-initialised rpa_grep_t and create the
 * rules database handle stored in its hDbex member.
 * NOTE(review): the malloc failure check and the return statement are not
 * visible here.
 */
89 rpa_grep_t *rpa_grep_create()
93 pGrep = (rpa_grep_t *)malloc(sizeof(*pGrep));
/* All other members start out as zero. */
96 memset(pGrep, 0, sizeof(*pGrep));
97 pGrep->hDbex = rpa_dbex_create();
/*
 * rpa_grep_close: destroy the rules database handle held in pGrep->hDbex.
 * NOTE(review): other statements of the body (for example resetting the
 * handle) are not visible here.
 */
101 void rpa_grep_close(rpa_grep_t *pGrep)
104 rpa_dbex_destroy(pGrep->hDbex);
/*
 * rpa_grep_optimizations: set the RPA_DBEXCFG_OPTIMIZATIONS configuration
 * value of the rules database to `allow`.
 */
109 void rpa_grep_optimizations(rpa_grep_t *pGrep, rulong allow)
111 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_OPTIMIZATIONS, allow);
/*
 * rpa_grep_destroy: tear down a grep context; it calls rpa_grep_close().
 * NOTE(review): the rest of the body is not visible here; presumably it
 * also checks for NULL and frees pGrep itself -- confirm.
 */
115 void rpa_grep_destroy(rpa_grep_t *pGrep)
119 rpa_grep_close(pGrep);
/*
 * rpa_grep_load_string_pattern: thin wrapper that forwards `buf` unchanged
 * to rpa_grep_load_pattern() and returns its result.
 */
124 int rpa_grep_load_string_pattern(rpa_grep_t *pGrep, rpa_buffer_t *buf)
126 return rpa_grep_load_pattern(pGrep, buf);
/*
 * rpa_grep_load_pattern: load the rule text held in `buf` into the rules
 * database and remember the last loaded rule in pGrep->hPattern.
 * The database is opened, rpa_dbex_load() is called repeatedly while it
 * returns a positive value, and the database is closed again.
 * Failures are reported as text on stdout.
 * NOTE(review): return statements and several branches are not visible
 * here, so the exact return values cannot be documented.
 */
130 int rpa_grep_load_pattern(rpa_grep_t *pGrep, rpa_buffer_t *buf)
133 int inputsize = buf->size;
134 const char *pattern = buf->s;
136 if (rpa_dbex_open(pGrep->hDbex) < 0) {
/* NOTE(review): diagnostics go to stdout, not stderr. */
137 fprintf(stdout, "Failed to open rules database.\n");
/* Keep loading while the loader reports a positive result (presumably the
 * number of bytes consumed; the loop body is not visible here). */
141 while ((ret = rpa_dbex_load(pGrep->hDbex, pattern, inputsize)) > 0) {
146 rpa_errinfo_t errinfo;
147 rpa_dbex_lasterrorinfo(pGrep->hDbex, &errinfo);
148 if (errinfo.code == RPA_E_SYNTAX_ERROR) {
/* Move to the reported error offset, then walk backwards to the start of
 * the buffer counting newlines to derive a 1-based line number. */
149 pattern += errinfo.offset;
/* NOTE(review): the loop only ends after `pattern` has been decremented to
 * one element before buf->s; forming that pointer is undefined behaviour
 * in ISO C. Also confirm that errinfo.offset is relative to the current
 * value of `pattern` rather than to buf->s. */
150 for (line = 1; pattern >= buf->s; --pattern) {
151 if (*pattern == '\n')
154 fprintf(stdout, "Line: %d, ERROR: Syntax Error.\n", line);
156 fprintf(stdout, "ERROR: Pattern Loading failed.\n");
/* The database is closed on more than one path; which close belongs to the
 * error path and which to the success path cannot be told from here. */
161 rpa_dbex_close(pGrep->hDbex);
/* The most recently loaded rule becomes the pattern used for matching. */
162 pGrep->hPattern = rpa_dbex_last(pGrep->hDbex);
166 rpa_dbex_close(pGrep->hDbex);
/*
 * rpa_grep_list_patterns: ask the rules database to dump its productions
 * via rpa_dbex_dumpproductions().
 */
171 void rpa_grep_list_patterns(rpa_grep_t *pGrep)
173 rpa_dbex_dumpproductions(pGrep->hDbex);
/*
 * rpa_grep_dump_pattern_records: ask the rules database to dump its records
 * via rpa_dbex_dumprecords().
 */
177 void rpa_grep_dump_pattern_records(rpa_grep_t *pGrep)
179 rpa_dbex_dumprecords(pGrep->hDbex);
/*
 * rpa_grep_debug_compile: compile the rules database with the
 * RPA_DBEXCFG_DEBUG setting switched on for the duration of the compile.
 */
183 void rpa_grep_debug_compile(rpa_grep_t *pGrep)
185 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_DEBUG, 1);
186 rpa_dbex_compile(pGrep->hDbex);
/* Set the debug setting to 0 afterwards (the previous value is not saved). */
187 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_DEBUG, 0);
/*
 * rpa_grep_dump_pattern_info: compile the rules database, then dump its
 * information via rpa_dbex_dumpinfo().
 * NOTE(review): the result of rpa_dbex_compile() is not checked in the
 * visible code.
 */
191 void rpa_grep_dump_pattern_info(rpa_grep_t *pGrep)
193 rpa_dbex_compile(pGrep->hDbex);
194 rpa_dbex_dumpinfo(pGrep->hDbex);
/*
 * rpa_grep_dump_alias_info: compile the rules database, then dump its UIDs
 * via rpa_dbex_dumpuids().
 * NOTE(review): the result of rpa_dbex_compile() is not checked in the
 * visible code.
 */
198 void rpa_grep_dump_alias_info(rpa_grep_t *pGrep)
200 rpa_dbex_compile(pGrep->hDbex);
201 rpa_dbex_dumpuids(pGrep->hDbex);
/*
 * rpa_grep_match: run pGrep->hPattern against `buffer` (`size` bytes) with
 * rpa_stat_match(), anchored at the start of the buffer, and print the
 * matched text followed by a newline.
 * A temporary execution context is created for the call and destroyed
 * before returning; its cache hit counter is copied to pGrep->cachehit.
 * NOTE(review): the NULL check on the context, the condition guarding the
 * output, and the return statement are not visible here.
 */
205 int rpa_grep_match(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
/* input and start both point at the first byte; end is one past the last. */
209 const char *input = buffer, *start = buffer, *end = buffer + size;
211 hStat = rpa_stat_create(pGrep->hDbex, 0);
/* Copy the per-run options from the grep context into the new context. */
214 rpa_stat_cachedisable(hStat, pGrep->disablecache);
215 rpa_stat_setencoding(hStat, pGrep->encoding);
216 hStat->debug = pGrep->execdebug;
217 ret = rpa_stat_match(hStat, pGrep->hPattern, input, start, end);
219 rpa_grep_print_filename(pGrep);
/* `ret` is used as the byte length of the match starting at `input`. */
220 rpa_grep_output(pGrep, input, ret, pGrep->encoding);
221 rpa_grep_output_utf8_string(pGrep, "\n");
223 pGrep->cachehit = hStat->cache->hit;
224 rpa_stat_destroy(hStat);
/*
 * rpa_grep_parse: parse `buffer` (`size` bytes) with pGrep->hPattern using
 * rpa_stat_parse(), collecting parse records, then print either an error
 * description or the records.
 * For RPA_GREPTYPE_PARSE each record flagged RPA_RECORD_END is printed as
 * "rule (offset, size): text". For RPA_GREPTYPE_PARSEAST every record is
 * dumped with rpa_record_dump().
 * NOTE(review): the conditions that select the error branch and its
 * individual fields, and the return statement, are not visible here.
 */
229 int rpa_grep_parse(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
/* NOTE(review): the record array is created before the execution context;
 * if the function returns early when context creation fails, confirm that
 * `records` is released on that path. */
235 rarray_t *records = rpa_records_create();
237 const char *input = buffer, *start = buffer, *end = buffer + size;
239 hStat = rpa_stat_create(pGrep->hDbex, 0);
/* Copy the per-run options from the grep context into the new context. */
242 rpa_stat_cachedisable(hStat, pGrep->disablecache);
243 rpa_stat_setencoding(hStat, pGrep->encoding);
244 hStat->debug = pGrep->execdebug;
245 ret = rpa_stat_parse(hStat, pGrep->hPattern, input, start, end, records);
/* Error report: each field is formatted into `location` and written out. */
248 rpa_stat_lasterrorinfo(hStat, &err);
/* NOTE(review): %ld assumes the err fields are long-sized -- confirm. */
250 r_snprintf(location, sizeof(location), "Parse Error: Code: %ld", err.code);
251 rpa_grep_output_utf8_string(pGrep, location);
254 r_snprintf(location, sizeof(location), ", Rule UID: %ld", err.ruleid);
255 rpa_grep_output_utf8_string(pGrep, location);
258 r_snprintf(location, sizeof(location), ", Name: %s", err.name);
259 rpa_grep_output_utf8_string(pGrep, location);
262 r_snprintf(location, sizeof(location), " at Offset: %ld", err.offset);
263 rpa_grep_output_utf8_string(pGrep, location);
265 rpa_grep_output_utf8_string(pGrep, "\n");
268 if (pGrep->greptype == RPA_GREPTYPE_PARSE) {
269 for (i = 0; i < rpa_records_length(records); i++) {
270 prec = (rparecord_t *)rpa_records_slot(records, i);
/* Only records carrying the END flag are printed. */
271 if (prec->type & RPA_RECORD_END) {
272 rpa_grep_output_utf8_string(pGrep, prec->rule);
/* Offset is relative to the start of the input buffer. */
273 r_snprintf(location, sizeof(location), " (%ld, %ld)", (rlong)(prec->input - input), (rlong)prec->inputsiz);
274 rpa_grep_output_utf8_string(pGrep, location);
275 rpa_grep_output_utf8_string(pGrep, ": ");
276 rpa_grep_output(pGrep, prec->input, prec->inputsiz, pGrep->encoding);
277 rpa_grep_output_utf8_string(pGrep, "\n");
280 } else if (pGrep->greptype == RPA_GREPTYPE_PARSEAST) {
281 for (i = 0; i < rpa_records_length(records); i++) {
282 rpa_record_dump(records, i);
/* Release the records, save the cache statistics, drop the context. */
287 rpa_records_destroy(records);
288 pGrep->cachehit = hStat->cache->hit;
289 rpa_stat_destroy(hStat);
/*
 * rpa_grep_scan: search `buffer` (`size` bytes) for occurrences of
 * pGrep->hPattern with rpa_stat_scan() and print each match on its own line.
 * After a match the search position moves to the byte following the match.
 * Cache hit counts are accumulated into pGrep->cachehit.
 * NOTE(review): the loop construct, the condition guarding the output, and
 * the return statement are not visible here.
 */
294 int rpa_grep_scan(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
300 const char *input = buffer, *start = buffer, *end = buffer + size;
302 hStat = rpa_stat_create(pGrep->hDbex, 0);
/* Copy the per-run options from the grep context into the new context. */
305 rpa_stat_setencoding(hStat, pGrep->encoding);
306 rpa_stat_cachedisable(hStat, pGrep->disablecache);
307 hStat->debug = pGrep->execdebug;
/* Seed the counter from the freshly created context. */
308 pGrep->cachehit = hStat->cache->hit;
/* `matched` receives the start of the match; `ret` is used as its length. */
311 ret = rpa_stat_scan(hStat, pGrep->hPattern, input, start, end, &matched);
312 pGrep->cachehit += hStat->cache->hit;
317 rpa_grep_print_filename(pGrep);
319 rpa_grep_output(pGrep, matched, ret, pGrep->encoding);
320 rpa_grep_output_utf8_string(pGrep, "\n");
/* Continue only after a non-empty match that ends before the buffer end. */
322 if (ret && matched + ret < end) {
323 input = matched + ret;
326 rpa_stat_destroy(hStat);
/*
 * rpa_grep_scan_lines: split `buffer` (`size` bytes) into lines and scan
 * each line separately with rpa_stat_scan(); for a line that qualifies the
 * whole line (lstart..lend) is printed, not just the matched part.
 * Line ends are found by looking for '\n' in 16-bit steps for the UTF16LE
 * encodings and in single bytes otherwise; lend is then moved past the
 * newline so the newline belongs to the line.
 * NOTE(review): the outer loop, the condition guarding the output, the use
 * of `lines`, and the return statement are not visible here.
 */
331 int rpa_grep_scan_lines(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
337 unsigned long lines = 0;
338 const char *end = buffer + size, *lstart = buffer, *lend;
340 hStat = rpa_stat_create(pGrep->hDbex, 0);
343 rpa_stat_setencoding(hStat, pGrep->encoding);
344 hStat->debug = pGrep->execdebug;
347 if (pGrep->encoding == RPA_ENCODING_UTF16LE || pGrep->encoding == RPA_ENCODING_ICASE_UTF16LE) {
/* NOTE(review): if the remaining byte count is odd, the final 16-bit read
 * below touches one byte past `end`. */
348 for (lend = lstart; lend < end; lend += sizeof(unsigned short)) {
/* NOTE(review): the cast assumes lend is suitably aligned for unsigned
 * short and drops const; the value is compared in host byte order. */
349 if (*((unsigned short*)lend) == L'\n') {
351 lend += sizeof(unsigned short);
356 for (lend = lstart; lend < end; lend += sizeof(unsigned char)) {
357 if (*((unsigned char*)lend) == '\n') {
359 lend += sizeof(unsigned char);
/* The scan is confined to the current line. */
366 ret = rpa_stat_scan(hStat, pGrep->hPattern, lstart, lstart, lend, &matched);
370 rpa_grep_print_filename(pGrep);
/* Output the complete line rather than only the matched span. */
372 rpa_grep_output(pGrep, lstart, lend - lstart, pGrep->encoding);
378 rpa_stat_destroy(hStat);
/*
 * rpa_grep_scan_buffer: choose the input encoding for `buf`, run the
 * operation selected by pGrep->greptype on it, and add the elapsed CPU time
 * in milliseconds to pGrep->scanmilisec.
 * Encoding selection: a forced byte or UTF-16 mode wins; otherwise a leading
 * two-byte marker selects UTF-16LE and anything else is treated as UTF-8.
 * The case-insensitive variant of each encoding is used when pGrep->icase
 * is set. When the marker is present the size passed on is reduced by two.
 * NOTE(review): the assignments to `input` and `btime` are not visible here.
 */
383 void rpa_grep_scan_buffer(rpa_grep_t *pGrep, rpa_buffer_t *buf)
387 clock_t btime, scanclocks;
389 if (pGrep->forceEncoding == RPA_GREP_FORCE_BYTE) {
392 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_BYTE : RPA_ENCODING_BYTE;
393 } else if (pGrep->forceEncoding == RPA_GREP_FORCE_UTF16) {
/* NOTE(review): -1 and -2 stand for the bytes 0xFF 0xFE (UTF-16LE BOM).
 * The comparison only works where plain char is signed; if buf->s is a
 * plain char pointer, platforms with unsigned char never detect the BOM. */
394 if (buf->size >= 2 && buf->s[0] == -1 && buf->s[1] == -2) {
396 size = buf->size - 2;
401 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF16LE : RPA_ENCODING_UTF16LE;
/* Same BOM test for auto-detection; the signedness note applies here too. */
402 } else if (buf->size >= 2 && buf->s[0] == -1 && buf->s[1] == -2) {
404 size = buf->size - 2;
405 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF16LE : RPA_ENCODING_UTF16LE;
407 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF8 : RPA_ENCODING_UTF8;
/* Dispatch to the worker for the requested operation. The return values of
 * the workers are not used in the visible code. */
414 switch (pGrep->greptype) {
415 case RPA_GREPTYPE_SCANLINES:
416 rpa_grep_scan_lines(pGrep, input, size);
418 case RPA_GREPTYPE_MATCH:
419 rpa_grep_match(pGrep, input, size);
/* Both parse modes share one worker, which checks greptype itself. */
421 case RPA_GREPTYPE_PARSEAST:
422 case RPA_GREPTYPE_PARSE:
423 rpa_grep_parse(pGrep, input, size);
425 case RPA_GREPTYPE_SCAN:
426 rpa_grep_scan(pGrep, input, size);
/* Presumably the default case: fall back to a plain scan. */
429 rpa_grep_scan(pGrep, input, size);
/* clock() measures processor time, so this is CPU time, not wall time. */
433 scanclocks = clock() - btime;
/* Widen to 64 bits before multiplying to avoid overflow in the conversion. */
434 pGrep->scanmilisec += (unsigned long)(((unsigned long long)1000)*scanclocks/CLOCKS_PER_SEC);
/*
 * rpa_buffer_loadfile: read the remaining contents of `pFile` into a newly
 * allocated rpa_buffer_t, growing the storage in steps of `memchunk` bytes.
 * After every read the data is NUL-terminated and buf->size is set to the
 * number of bytes held so far. Reading stops once feof() reports end of file.
 * On a read error the buffer is destroyed.
 * NOTE(review): the return statements and the update of `inputsize` after
 * each read are not visible here; presumably NULL is returned on failure.
 */
438 rpa_buffer_t *rpa_buffer_loadfile(FILE *pFile)
440 unsigned int memchunk = 256;
441 int ret = 0, inputsize = 0;
444 buf = rpa_buffer_alloc(2 * memchunk);
/* Grow when fewer than memchunk bytes are free. Because buf->size is reset
 * to the data length below, this triggers on every pass after the first. */
449 if ((buf->size - inputsize) < memchunk) {
450 if (rpa_buffer_realloc(buf, buf->size + memchunk) < 0) {
/* NOTE(review): confirm that `buf` is released on this path. */
451 fprintf(stderr, "Out of memory!\n");
/* Read one byte less than a chunk, leaving room for the terminator.
 * NOTE(review): fread returns size_t but the result is stored in an int. */
455 ret = fread(&buf->s[inputsize], 1, memchunk - 1, pFile);
456 if ((ret <= 0) && ferror(pFile)) {
457 rpa_buffer_destroy(buf);
461 buf->s[inputsize] = '\0';
/* From here on buf->size is the data length, not the allocated capacity. */
462 buf->size = inputsize;
463 } while (!feof(pFile));
/*
 * rpa_callback_output: callback taking the execution context, a rule name,
 * user data, the input span (`input`, `size`) and a reason code.
 * NOTE(review): the body is not visible here, so its behaviour and return
 * value cannot be documented from this excerpt.
 */
469 int rpa_callback_output(rpastat_t * stat, const char *name, void *userdata, const char *input, unsigned int size, unsigned int reason)
/*
 * rpa_callback_matched_output: callback that prints "<name>: <text>"
 * followed by a newline, where <text> is the `size` bytes at `input`,
 * decoded using the encoding stored in the grep context.
 * `userdata` must point to the rpa_grep_t in use.
 * NOTE(review): the return statement is not visible here; `stat` and
 * `reason` are not used in the visible lines.
 */
476 int rpa_callback_matched_output(rpastat_t * stat, const char *name, void *userdata, const char *input, unsigned int size, unsigned int reason)
/* The opaque user pointer carries the grep context. */
478 rpa_grep_t *pGrep = (rpa_grep_t *)userdata;
480 rpa_grep_output_utf8_string(pGrep, name);
481 rpa_grep_output_utf8_string(pGrep, ": ");
482 rpa_grep_output(pGrep, input, size, pGrep->encoding);
483 rpa_grep_output_utf8_string(pGrep, "\n");
/*
 * rpa_grep_setup_callback: takes the grep context and a buffer holding a
 * pattern.
 * NOTE(review): the body is not visible here; presumably it registers
 * rpa_callback_output for the named pattern -- confirm.
 */
489 void rpa_grep_setup_callback(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
/*
 * rpa_grep_setup_matched_callback: takes the grep context and a buffer
 * holding a pattern.
 * NOTE(review): the body is not visible here; presumably it registers
 * rpa_callback_matched_output for the named pattern -- confirm.
 */
495 void rpa_grep_setup_matched_callback(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
/*
 * rpa_grep_dump_pattern_tree: look up the rule whose name is the string in
 * pattern->s and dump its tree via rpa_dbex_dumptree().
 * NOTE(review): the lookup result is passed on unchecked; confirm how
 * rpa_dbex_dumptree() behaves for an unknown rule name.
 */
501 void rpa_grep_dump_pattern_tree(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
503 rpa_dbex_dumptree(pGrep->hDbex, rpa_dbex_lookup_s(pGrep->hDbex, pattern->s));
/*
 * rpa_grep_output: decode `size` bytes at `s` character by character and
 * write every decoded character with rpa_grep_output_char().
 * The UTF-16 decoder is used for the two UTF16LE encodings; every other
 * encoding value goes through the UTF-8 decoder.
 * Decoding stops when the decoder returns 0.
 * NOTE(review): the statements that advance `input` are not visible here;
 * presumably it moves forward by the decoder's return value. pGrep is not
 * used in the visible lines.
 */
507 void rpa_grep_output(rpa_grep_t *pGrep, const char *s, unsigned long size, unsigned int encoding)
/* Work on unsigned bytes so values above 0x7F are not sign-extended. */
509 const unsigned char *input = (const unsigned char*)s;
510 const unsigned char *end = input + size;
514 if (encoding == RPA_ENCODING_UTF16LE || encoding == RPA_ENCODING_ICASE_UTF16LE) {
515 while ((ret = (int)rpa_grep_utf16_mbtowc(&wc, input, end)) != 0) {
516 rpa_grep_output_char(wc);
520 while ((ret = (int)rpa_grep_utf8_mbtowc(&wc, input, end)) != 0) {
521 rpa_grep_output_char(wc);
/*
 * rpa_grep_output_utf8_string: write the NUL-terminated string `s` through
 * rpa_grep_output() as UTF-8.
 * `s` must not be NULL, because its length is taken with strlen().
 */
528 void rpa_grep_output_utf8_string(rpa_grep_t *pGrep, const char *s)
530 rpa_grep_output(pGrep, s, strlen(s), RPA_ENCODING_UTF8);
/*
 * rpa_grep_output_utf16_string: measure a string of 16-bit units in bytes
 * and write it through rpa_grep_output() as UTF-16LE.
 * NOTE(review): the loop that walks `pstr` is not visible here; presumably
 * it runs until a zero unit is found -- confirm.
 */
534 void rpa_grep_output_utf16_string(rpa_grep_t *pGrep, const unsigned short *s)
536 unsigned long size = 0;
537 const unsigned short *pstr = s;
/* The length is counted in bytes, one 16-bit unit at a time. */
540 size += sizeof(unsigned short);
543 rpa_grep_output(pGrep, (const char*)s, size, RPA_ENCODING_UTF16LE);