2 * Regular Pattern Analyzer (RPA)
3 * Copyright (c) 2009-2010 Martin Stoilov
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 * Martin Stoilov <martin@rpasearch.com>
28 #include "rpagreputf.h"
29 #include "rpagrepdep.h"
31 #define MAX_STACK 256000
/*
 * Initialise a caller-provided rpa_buffer_t from a character pointer and a
 * size, returning an rpa_buffer_t pointer.
 *
 * NOTE(review): the body is not visible in this view; presumably it stores
 * `s` and `size` in `*str` and returns `str` -- confirm.
 */
33 rpa_buffer_t * rpa_buffer_init(rpa_buffer_t *str, char *s, unsigned int size)
/*
 * Release routine for an rpa_buffer_t. rpa_buffer_alloc() installs this
 * function as the buffer's `destroy` callback.
 *
 * NOTE(review): the body is not visible in this view; presumably it frees
 * str->s and then str itself -- confirm.
 */
41 void rpa_buffer_free(rpa_buffer_t *str)
/*
 * Allocate a new rpa_buffer_t on the heap together with its data area.
 *
 * The descriptor is zero-filled, a data area of `size + 1` bytes is
 * allocated and zero-filled, and rpa_buffer_free is installed as the
 * `destroy` callback.
 *
 * NOTE(review): the failure branches and the return statement are not
 * visible in this view; presumably NULL is returned when either malloc()
 * fails -- confirm.
 */
50 rpa_buffer_t * rpa_buffer_alloc(unsigned int size)
54 str = (rpa_buffer_t *)malloc(sizeof(rpa_buffer_t));
57 memset(str, 0, sizeof(*str));
/* One byte more than requested is allocated; it is zeroed below with the rest. */
58 if (!(str->s = (char *)malloc((size + 1) * sizeof(char)))) {
62 memset(str->s, 0, size + 1);
/* rpa_buffer_destroy() only acts on buffers that carry a destroy callback. */
64 str->destroy = rpa_buffer_free;
/*
 * Resize the data area of `str` with realloc().
 *
 * The result of realloc() is first stored in the temporary `s`, not written
 * straight into str->s.
 *
 * NOTE(review): exactly `size` bytes are requested here, whereas
 * rpa_buffer_alloc() allocates `size + 1`; callers that write a terminator
 * at index `size` must account for that.
 * NOTE(review): the failure check, the update of str->s / str->size and the
 * return value are not visible in this view. rpa_buffer_loadfile() treats a
 * negative return as out-of-memory.
 */
69 int rpa_buffer_realloc(rpa_buffer_t *str, unsigned int size)
73 s = (char *)realloc(str->s, size);
/*
 * Destroy a buffer through its own `destroy` callback.
 *
 * A NULL `str` or a buffer without a callback is ignored.
 *
 * NOTE(review): the guarded statement is not visible in this view;
 * presumably it is `str->destroy(str)` -- confirm.
 */
82 void rpa_buffer_destroy(rpa_buffer_t *str)
84 if (str && str->destroy)
/*
 * Allocate a zero-initialised rpa_grep_t and create its rules database
 * handle (hDbex) with rpa_dbex_create().
 *
 * NOTE(review): the malloc() failure check and the return statement are not
 * visible in this view -- confirm that NULL is handled before the memset().
 */
89 rpa_grep_t *rpa_grep_create()
93 pGrep = (rpa_grep_t *)malloc(sizeof(*pGrep));
96 memset(pGrep, 0, sizeof(*pGrep));
97 pGrep->hDbex = rpa_dbex_create();
/*
 * Release the rules database handle owned by `pGrep` by passing
 * pGrep->hDbex to rpa_dbex_destroy().
 *
 * NOTE(review): no other visible line touches pGrep; whether hDbex is reset
 * afterwards cannot be seen from here.
 */
101 void rpa_grep_close(rpa_grep_t *pGrep)
104 rpa_dbex_destroy(pGrep->hDbex);
/*
 * Set the RPA_DBEXCFG_OPTIMIZATIONS configuration value of the rules
 * database to `allow`.
 */
109 void rpa_grep_optimizations(rpa_grep_t *pGrep, rulong allow)
111 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_OPTIMIZATIONS, allow);
/*
 * Tear down a grep context. The visible step is rpa_grep_close(), which
 * releases the rules database handle.
 *
 * NOTE(review): any NULL guard and the release of `pGrep` itself are not
 * visible in this view -- confirm.
 */
115 void rpa_grep_destroy(rpa_grep_t *pGrep)
119 rpa_grep_close(pGrep);
/*
 * Load a pattern held in `buf`. This is a thin wrapper: it forwards to
 * rpa_grep_load_pattern() and returns that function's result unchanged.
 */
124 int rpa_grep_load_string_pattern(rpa_grep_t *pGrep, rpa_buffer_t *buf)
126 return rpa_grep_load_pattern(pGrep, buf);
/*
 * Load the rules text in `buf` into the rules database of `pGrep`.
 *
 * Visible steps:
 *   - open the database with rpa_dbex_open();
 *   - call rpa_dbex_load() repeatedly for as long as it returns > 0;
 *   - on failure, fetch the error details and report them; for a syntax
 *     error the line number is derived from the error offset;
 *   - close the database, and remember the last loaded rule in
 *     pGrep->hPattern.
 *
 * NOTE(review): all diagnostics here are written to stdout, not stderr.
 * NOTE(review): the loop body, the branch structure and the return values
 * are not visible in this view; the two rpa_dbex_close() calls presumably
 * belong to the success and the error path respectively -- confirm.
 */
130 int rpa_grep_load_pattern(rpa_grep_t *pGrep, rpa_buffer_t *buf)
133 int inputsize = buf->size;
134 const char *pattern = buf->s;
136 if (rpa_dbex_open(pGrep->hDbex) < 0) {
137 fprintf(stdout, "Failed to open rules database.\n");
/* A positive return keeps the loop going; zero or negative ends it. */
141 while ((ret = rpa_dbex_load(pGrep->hDbex, pattern, inputsize)) > 0) {
146 rpa_errinfo_t errinfo;
147 rpa_dbex_lasterrorinfo(pGrep->hDbex, &errinfo);
148 if (errinfo.code == RPA_E_SYNTAX_ERROR) {
/* Move to the reported error position, then walk back to the buffer start. */
149 pattern += errinfo.offset;
/*
 * Line numbers are 1-based; each '\n' found while walking backwards is
 * presumably counted as one more line (the increment is not visible here).
 * NOTE(review): unless the loop exits early, it stops only once `pattern`
 * has been decremented below buf->s, i.e. a pointer before the start of the
 * buffer is formed.
 */
150 for (line = 1; pattern >= buf->s; --pattern) {
151 if (*pattern == '\n')
154 fprintf(stdout, "Line: %d, ERROR: Syntax Error.\n", line);
156 fprintf(stdout, "ERROR: Pattern Loading failed.\n");
161 rpa_dbex_close(pGrep->hDbex);
/* The most recently loaded rule becomes the pattern used by match/parse/scan. */
162 pGrep->hPattern = rpa_dbex_last(pGrep->hDbex);
166 rpa_dbex_close(pGrep->hDbex);
/*
 * Dump the productions held in the rules database by calling
 * rpa_dbex_dumpproductions().
 */
171 void rpa_grep_list_patterns(rpa_grep_t *pGrep)
173 rpa_dbex_dumpproductions(pGrep->hDbex);
/*
 * Dump the records held in the rules database by calling
 * rpa_dbex_dumprecords().
 */
177 void rpa_grep_dump_pattern_records(rpa_grep_t *pGrep)
179 rpa_dbex_dumprecords(pGrep->hDbex);
/*
 * Compile the rules database with the RPA_DBEXCFG_DEBUG setting switched
 * on for the duration of the compile only.
 *
 * The setting is forced to 0 afterwards; the value it had before the call
 * is not restored.
 */
183 void rpa_grep_debug_compile(rpa_grep_t *pGrep)
185 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_DEBUG, 1);
186 rpa_dbex_compile(pGrep->hDbex);
187 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_DEBUG, 0);
/*
 * Compile the rules database and then dump its information with
 * rpa_dbex_dumpinfo(). The result of the compile is not checked.
 */
191 void rpa_grep_dump_pattern_info(rpa_grep_t *pGrep)
193 rpa_dbex_compile(pGrep->hDbex);
194 rpa_dbex_dumpinfo(pGrep->hDbex);
/*
 * Compile the rules database and then dump its UIDs with
 * rpa_dbex_dumpuids(). The result of the compile is not checked.
 */
198 void rpa_grep_dump_alias_info(rpa_grep_t *pGrep)
200 rpa_dbex_compile(pGrep->hDbex);
201 rpa_dbex_dumpuids(pGrep->hDbex);
/*
 * Match pGrep->hPattern against the beginning of `buffer`.
 *
 * buffer: input data; size: its length in bytes.
 *
 * A fresh execution context (hStat) is created for the call, configured
 * from pGrep (cache disable flag, exec debug flag) and destroyed at the end.
 * The matched text is printed, preceded by the file name and followed by a
 * newline.
 *
 * NOTE(review): the hStat NULL check, the condition guarding the output
 * (presumably ret > 0) and the return value are not visible in this view.
 */
205 int rpa_grep_match(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
/* input and start both point at the buffer start; end is one past the last byte. */
209 const char *input = buffer, *start = buffer, *end = buffer + size;
211 hStat = rpa_stat_create(pGrep->hDbex, 0);
214 rpa_stat_cachedisable(hStat, pGrep->disablecache);
215 hStat->debug = pGrep->execdebug;
216 ret = rpa_stat_match(hStat, pGrep->hPattern, pGrep->encoding, input, start, end);
218 rpa_grep_print_filename(pGrep);
/* `ret` is used as the byte length of the match starting at `input`. */
219 rpa_grep_output(pGrep, input, ret, pGrep->encoding);
220 rpa_grep_output_utf8_string(pGrep, "\n");
/* Copy the cache hit counter out before the context is destroyed. */
222 pGrep->cachehit = hStat->cache->hit;
223 rpa_stat_destroy(hStat);
/*
 * Parse `buffer` with pGrep->hPattern and print either the resulting
 * records or an error report.
 *
 * buffer: input data; size: its length in bytes.
 *
 * Visible flow:
 *   - create a records array and an execution context;
 *   - run rpa_stat_parse(), collecting records;
 *   - on the error path, print "Parse Error: Code: ..." followed by the rule
 *     UID, the rule name and the offset taken from the last error info;
 *   - for RPA_GREPTYPE_PARSE, print one line per end record in the form
 *     "<rule> (<offset>, <size>): <text>";
 *   - for RPA_GREPTYPE_PARSEAST, dump every record with rpa_record_dump();
 *   - release the records and the execution context.
 *
 * NOTE(review): the conditions selecting the error path and each error
 * field, the hStat NULL check and the return value are not visible here.
 * NOTE(review): `records` is created before `hStat`; confirm that any early
 * return between the two releases it.
 */
228 int rpa_grep_parse(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
234 rarray_t *records = rpa_records_create();
236 const char *input = buffer, *start = buffer, *end = buffer + size;
238 hStat = rpa_stat_create(pGrep->hDbex, 0);
241 rpa_stat_cachedisable(hStat, pGrep->disablecache);
242 hStat->debug = pGrep->execdebug;
243 ret = rpa_stat_parse(hStat, pGrep->hPattern, pGrep->encoding, input, start, end, records);
/* Error report: each fragment is formatted into `location` and printed in turn. */
246 rpa_stat_lasterrorinfo(hStat, &err);
248 r_snprintf(location, sizeof(location), "Parse Error: Code: %ld", err.code);
249 rpa_grep_output_utf8_string(pGrep, location);
252 r_snprintf(location, sizeof(location), ", Rule UID: %ld", err.ruleid);
253 rpa_grep_output_utf8_string(pGrep, location);
256 r_snprintf(location, sizeof(location), ", Name: %s", err.name);
257 rpa_grep_output_utf8_string(pGrep, location);
260 r_snprintf(location, sizeof(location), " at Offset: %ld", err.offset);
261 rpa_grep_output_utf8_string(pGrep, location);
263 rpa_grep_output_utf8_string(pGrep, "\n");
266 if (pGrep->greptype == RPA_GREPTYPE_PARSE) {
267 for (i = 0; i < rpa_records_length(records); i++) {
268 prec = (rparecord_t *)rpa_records_slot(records, i);
/* Only records flagged RPA_RECORD_END are printed. */
269 if (prec->type & RPA_RECORD_END) {
270 rpa_grep_output_utf8_string(pGrep, prec->rule);
/* The offset is relative to the start of the input buffer. */
271 r_snprintf(location, sizeof(location), " (%ld, %ld)", (rlong)(prec->input - input), (rlong)prec->inputsiz);
272 rpa_grep_output_utf8_string(pGrep, location);
273 rpa_grep_output_utf8_string(pGrep, ": ");
/* The record text is emitted in the input's own encoding. */
274 rpa_grep_output(pGrep, prec->input, prec->inputsiz, pGrep->encoding);
275 rpa_grep_output_utf8_string(pGrep, "\n");
278 } else if (pGrep->greptype == RPA_GREPTYPE_PARSEAST) {
279 for (i = 0; i < rpa_records_length(records); i++) {
280 rpa_record_dump(records, i);
285 rpa_records_destroy(records);
/* Copy the cache hit counter out before the context is destroyed. */
286 pGrep->cachehit = hStat->cache->hit;
287 rpa_stat_destroy(hStat);
/*
 * Scan `buffer` for occurrences of pGrep->hPattern and print each match on
 * its own line, preceded by the file name.
 *
 * buffer: input data; size: its length in bytes.
 *
 * rpa_stat_scan() reports the match position through `matched` and its
 * length through the return value; when a non-empty match ends before the
 * end of the buffer, scanning resumes right after it.
 *
 * NOTE(review): the enclosing loop, the conditions around the output, the
 * hStat NULL check and the return value are not visible in this view.
 */
292 int rpa_grep_scan(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
298 const char *input = buffer, *start = buffer, *end = buffer + size;
300 hStat = rpa_stat_create(pGrep->hDbex, 0);
303 rpa_stat_cachedisable(hStat, pGrep->disablecache);
304 hStat->debug = pGrep->execdebug;
/* Seed the counter from the new context; it is accumulated after each scan call. */
305 pGrep->cachehit = hStat->cache->hit;
308 ret = rpa_stat_scan(hStat, pGrep->hPattern, pGrep->encoding, input, start, end, &matched);
309 pGrep->cachehit += hStat->cache->hit;
314 rpa_grep_print_filename(pGrep);
316 rpa_grep_output(pGrep, matched, ret, pGrep->encoding);
317 rpa_grep_output_utf8_string(pGrep, "\n");
/* Continue after the match; a zero-length result does not advance the input. */
319 if (ret && matched + ret < end) {
320 input = matched + ret;
323 rpa_stat_destroy(hStat);
/*
 * Scan `buffer` line by line and print every whole line in which
 * pGrep->hPattern is found, preceded by the file name.
 *
 * buffer: input data; size: its length in bytes.
 *
 * The end of each line is located by searching for '\n': in 16-bit units
 * for the UTF16LE encodings, byte by byte otherwise. `lend` is moved past
 * the newline, so the newline belongs to the line that is scanned and
 * printed.
 *
 * NOTE(review): the outer per-line loop, the match condition guarding the
 * output, the use of `lines` and the return value are not visible here.
 * NOTE(review): no rpa_stat_cachedisable() call is visible in this
 * function, unlike rpa_grep_match/parse/scan -- confirm that is intended.
 */
328 int rpa_grep_scan_lines(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
334 unsigned long lines = 0;
335 const char *end = buffer + size, *lstart = buffer, *lend;
337 hStat = rpa_stat_create(pGrep->hDbex, 0);
340 hStat->debug = pGrep->execdebug;
343 if (pGrep->encoding == RPA_ENCODING_UTF16LE || pGrep->encoding == RPA_ENCODING_ICASE_UTF16LE) {
/*
 * NOTE(review): the char buffer is read through an unsigned short pointer.
 * That assumes suitable alignment, compares in host byte order, and reads
 * two bytes even when only one byte is left before `end`.
 */
344 for (lend = lstart; lend < end; lend += sizeof(unsigned short)) {
345 if (*((unsigned short*)lend) == L'\n') {
347 lend += sizeof(unsigned short);
352 for (lend = lstart; lend < end; lend += sizeof(unsigned char)) {
353 if (*((unsigned char*)lend) == '\n') {
355 lend += sizeof(unsigned char);
/* Each line is scanned in isolation: both input and start are lstart, end is lend. */
362 ret = rpa_stat_scan(hStat, pGrep->hPattern, pGrep->encoding, lstart, lstart, lend, &matched);
366 rpa_grep_print_filename(pGrep);
/* The whole line is printed, not just the matched part. */
368 rpa_grep_output(pGrep, lstart, lend - lstart, pGrep->encoding);
374 rpa_stat_destroy(hStat);
/*
 * Select the input encoding for `buf`, run the operation chosen by
 * pGrep->greptype on it, and add the elapsed time to pGrep->scanmilisec.
 *
 * Encoding selection:
 *   - RPA_GREP_FORCE_BYTE:  byte encoding;
 *   - RPA_GREP_FORCE_UTF16: UTF-16LE; a leading two-byte marker, if present,
 *     is excluded from the size;
 *   - otherwise:            UTF-16LE when the buffer starts with the
 *     two-byte marker, UTF-8 when it does not.
 * In every case pGrep->icase selects the ICASE variant of the encoding.
 *
 * NOTE(review): the marker test compares plain `char` values with -1 and -2
 * (the bytes 0xFF 0xFE). That only matches where plain `char` is signed; on
 * targets with unsigned `char` the marker is never detected.
 * NOTE(review): the declarations and assignments of `input`, `size` and
 * `btime`, and the break/default lines of the switch, are not visible here.
 */
379 void rpa_grep_scan_buffer(rpa_grep_t *pGrep, rpa_buffer_t *buf)
383 clock_t btime, scanclocks;
385 if (pGrep->forceEncoding == RPA_GREP_FORCE_BYTE) {
388 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_BYTE : RPA_ENCODING_BYTE;
389 } else if (pGrep->forceEncoding == RPA_GREP_FORCE_UTF16) {
390 if (buf->size >= 2 && buf->s[0] == -1 && buf->s[1] == -2) {
/* The two marker bytes are not part of the data to be scanned. */
392 size = buf->size - 2;
397 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF16LE : RPA_ENCODING_UTF16LE;
398 } else if (buf->size >= 2 && buf->s[0] == -1 && buf->s[1] == -2) {
400 size = buf->size - 2;
401 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF16LE : RPA_ENCODING_UTF16LE;
403 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF8 : RPA_ENCODING_UTF8;
410 switch (pGrep->greptype) {
411 case RPA_GREPTYPE_SCANLINES:
412 rpa_grep_scan_lines(pGrep, input, size);
414 case RPA_GREPTYPE_MATCH:
415 rpa_grep_match(pGrep, input, size);
/* Both parse modes share rpa_grep_parse(), which checks greptype itself. */
417 case RPA_GREPTYPE_PARSEAST:
418 case RPA_GREPTYPE_PARSE:
419 rpa_grep_parse(pGrep, input, size);
421 case RPA_GREPTYPE_SCAN:
422 rpa_grep_scan(pGrep, input, size);
/* Presumably the default branch: unknown types fall back to a plain scan -- confirm. */
425 rpa_grep_scan(pGrep, input, size);
/* clock() ticks are converted to milliseconds and accumulated across calls. */
429 scanclocks = clock() - btime;
430 pGrep->scanmilisec += (unsigned long)(((unsigned long long)1000)*scanclocks/CLOCKS_PER_SEC);
/*
 * Read the stream `pFile` until end-of-file into a newly allocated buffer.
 *
 * The buffer starts at two chunks of 256 bytes and is grown by one chunk
 * whenever fewer than `memchunk` bytes separate buf->size from the current
 * fill level. Data is read in pieces of at most `memchunk - 1` bytes, and a
 * NUL terminator is written after the data on every iteration.
 *
 * On a read error the buffer is destroyed; on allocation failure
 * "Out of memory!" is written to stderr.
 *
 * NOTE(review): buf->size is overwritten with the number of bytes stored,
 * so after the first iteration it no longer reflects the allocated capacity
 * and the growth check above compares against the content length.
 * NOTE(review): the update of `inputsize`, the handling after the failure
 * branches and the return statements are not visible in this view.
 */
434 rpa_buffer_t *rpa_buffer_loadfile(FILE *pFile)
436 unsigned int memchunk = 256;
437 int ret = 0, inputsize = 0;
440 buf = rpa_buffer_alloc(2 * memchunk);
445 if ((buf->size - inputsize) < memchunk) {
446 if (rpa_buffer_realloc(buf, buf->size + memchunk) < 0) {
447 fprintf(stderr, "Out of memory!\n");
/* One byte less than a chunk is read, leaving room for the terminator below. */
451 ret = fread(&buf->s[inputsize], 1, memchunk - 1, pFile);
/* A short or empty read is only an error when the stream's error flag is set. */
452 if ((ret <= 0) && ferror(pFile)) {
453 rpa_buffer_destroy(buf);
457 buf->s[inputsize] = '\0';
458 buf->size = inputsize;
459 } while (!feof(pFile));
/*
 * Callback taking the same parameters as rpa_callback_matched_output().
 *
 * NOTE(review): the body is not visible in this view, so what it prints and
 * what it returns cannot be stated from here.
 */
465 int rpa_callback_output(rpastat_t * stat, const char *name, void *userdata, const char *input, unsigned int size, unsigned int reason)
/*
 * Callback that prints one line of the form "<name>: <input text>".
 *
 * name:     label printed before the colon;
 * userdata: must point to the rpa_grep_t whose encoding is used for output;
 * input:    start of the text to print; size: its length in bytes.
 *
 * `stat` and `reason` are not used by the visible lines.
 *
 * NOTE(review): the return statement is not visible in this view.
 */
472 int rpa_callback_matched_output(rpastat_t * stat, const char *name, void *userdata, const char *input, unsigned int size, unsigned int reason)
474 rpa_grep_t *pGrep = (rpa_grep_t *)userdata;
476 rpa_grep_output_utf8_string(pGrep, name);
477 rpa_grep_output_utf8_string(pGrep, ": ");
/* The text itself is emitted in the input's encoding, not as UTF-8. */
478 rpa_grep_output(pGrep, input, size, pGrep->encoding);
479 rpa_grep_output_utf8_string(pGrep, "\n");
/*
 * Takes a grep context and a pattern buffer.
 *
 * NOTE(review): the body is not visible in this view; judging by the name
 * it presumably registers rpa_callback_output for `pattern` -- confirm.
 */
485 void rpa_grep_setup_callback(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
/*
 * Takes a grep context and a pattern buffer.
 *
 * NOTE(review): the body is not visible in this view; judging by the name
 * it presumably registers rpa_callback_matched_output for `pattern` --
 * confirm.
 */
491 void rpa_grep_setup_matched_callback(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
/*
 * Dump the tree of the rule named by `pattern`.
 *
 * pattern->s is passed to rpa_dbex_lookup_s() as the rule name, and the
 * lookup result is handed straight to rpa_dbex_dumptree() without being
 * checked.
 */
497 void rpa_grep_dump_pattern_tree(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
499 rpa_dbex_dumptree(pGrep->hDbex, rpa_dbex_lookup_s(pGrep->hDbex, pattern->s));
/*
 * Print `size` bytes starting at `s`, decoding them according to
 * `encoding`.
 *
 * The UTF16LE encodings are decoded with rpa_grep_utf16_mbtowc(); every
 * other encoding is decoded with rpa_grep_utf8_mbtowc(). Each decoded
 * character is passed to rpa_grep_output_char(). Decoding stops as soon as
 * the converter returns 0.
 *
 * `pGrep` is not used by the visible lines.
 *
 * NOTE(review): the statement that advances `input` after each character
 * is not visible in this view (presumably `input += ret`) -- confirm.
 */
503 void rpa_grep_output(rpa_grep_t *pGrep, const char *s, unsigned long size, unsigned int encoding)
/* Work on unsigned bytes so values above 0x7F are not sign-extended. */
505 const unsigned char *input = (const unsigned char*)s;
506 const unsigned char *end = input + size;
510 if (encoding == RPA_ENCODING_UTF16LE || encoding == RPA_ENCODING_ICASE_UTF16LE) {
511 while ((ret = (int)rpa_grep_utf16_mbtowc(&wc, input, end)) != 0) {
512 rpa_grep_output_char(wc);
516 while ((ret = (int)rpa_grep_utf8_mbtowc(&wc, input, end)) != 0) {
517 rpa_grep_output_char(wc);
/*
 * Print the NUL-terminated UTF-8 string `s` through rpa_grep_output().
 *
 * `s` must not be NULL, because its length is taken with strlen().
 */
524 void rpa_grep_output_utf8_string(rpa_grep_t *pGrep, const char *s)
526 rpa_grep_output(pGrep, s, strlen(s), RPA_ENCODING_UTF8);
/*
 * Print a string of 16-bit units through rpa_grep_output() as UTF-16LE.
 *
 * The length is accumulated in bytes, two per unit, and the string is then
 * handed over as a byte sequence.
 *
 * NOTE(review): the loop that walks `pstr` is not visible in this view;
 * presumably it stops at a zero unit -- confirm.
 */
530 void rpa_grep_output_utf16_string(rpa_grep_t *pGrep, const unsigned short *s)
532 unsigned long size = 0;
533 const unsigned short *pstr = s;
536 size += sizeof(unsigned short);
539 rpa_grep_output(pGrep, (const char*)s, size, RPA_ENCODING_UTF16LE);