2 * Regular Pattern Analyzer (RPA)
3 * Copyright (c) 2009-2010 Martin Stoilov
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 * Martin Stoilov <martin@rpasearch.com>
28 #include "rpagreputf.h"
29 #include "rpagrepdep.h"
/*
 * Numeric limit named MAX_STACK. It is not referenced anywhere in the visible
 * part of this file.
 * NOTE(review): presumably a stack-size bound consumed elsewhere -- confirm,
 * or remove if it is unused.
 */
31 #define MAX_STACK 256000
/*
 * rpa_buffer_init - takes a caller-provided buffer object `str`, a character
 * pointer `s` and a `size`, and returns an rpa_buffer_t pointer.
 *
 * Only the signature is visible in this extract; the body is missing.
 * NOTE(review): presumably stores s/size into *str and returns str -- confirm
 * against the full source, including who owns `s` afterwards.
 */
33 rpa_buffer_t * rpa_buffer_init(rpa_buffer_t *str, char *s, unsigned int size)
/*
 * rpa_buffer_free - release routine for an rpa_buffer_t.
 *
 * rpa_buffer_alloc() installs this function as the buffer's `destroy`
 * callback. The body is not visible in this extract.
 * NOTE(review): presumably frees str->s and then str itself -- confirm.
 */
41 void rpa_buffer_free(rpa_buffer_t *str)
/*
 * rpa_buffer_alloc - heap-allocate an rpa_buffer_t together with `size` + 1
 * bytes of character storage.
 *
 * Both the descriptor and the storage are zero-filled with memset(), so the
 * byte at index `size` is '\0'. rpa_buffer_free is installed as the `destroy`
 * callback.
 *
 * The failure branches, any assignment of str->size and the return statement
 * are not visible in this extract.
 * NOTE(review): `size + 1` is unsigned arithmetic and wraps to 0 when
 * size == UINT_MAX; the following memset would then use the wrapped length
 * too -- confirm callers never pass such a size.
 */
50 rpa_buffer_t * rpa_buffer_alloc(unsigned int size)
54 str = (rpa_buffer_t *)malloc(sizeof(rpa_buffer_t));
/* Start from an all-zero descriptor. */
57 memset(str, 0, sizeof(*str));
/* One extra byte beyond `size` is requested for the character storage. */
58 if (!(str->s = (char *)malloc((size + 1) * sizeof(char)))) {
62 memset(str->s, 0, size + 1);
/* rpa_buffer_destroy() checks this callback before disposing of the buffer. */
64 str->destroy = rpa_buffer_free;
/*
 * rpa_buffer_realloc - resize the character storage of `str` to `size` bytes.
 *
 * The realloc() result is first held in a local `s`, so str->s is not
 * overwritten by the call itself. What happens with the result and the return
 * value are not visible in this extract; rpa_buffer_loadfile() treats a
 * negative return value as out-of-memory.
 * NOTE(review): rpa_buffer_alloc() requests size + 1 bytes, while this call
 * requests exactly `size` -- confirm that callers leave room for a terminator.
 */
69 int rpa_buffer_realloc(rpa_buffer_t *str, unsigned int size)
73 s = (char *)realloc(str->s, size);
/*
 * rpa_buffer_destroy - dispose of a buffer through its `destroy` callback.
 *
 * The guard below skips NULL buffers and buffers without a callback. The
 * guarded statement itself is not visible in this extract.
 * NOTE(review): presumably `str->destroy(str);` -- confirm.
 */
82 void rpa_buffer_destroy(rpa_buffer_t *str)
84 if (str && str->destroy)
/*
 * rpa_grep_create - allocate a zero-initialized rpa_grep_t and attach a new
 * rules database created with rpa_dbex_create().
 *
 * The allocation-failure branch and the return statement are not visible in
 * this extract.
 * NOTE(review): the result of rpa_dbex_create() is stored without a visible
 * check -- confirm how a failed creation is reported to the caller.
 */
89 rpa_grep_t *rpa_grep_create()
93 pGrep = (rpa_grep_t *)malloc(sizeof(*pGrep));
/* Every field starts out as zero. */
96 memset(pGrep, 0, sizeof(*pGrep));
97 pGrep->hDbex = rpa_dbex_create();
/*
 * rpa_grep_close - destroy the rules database handle held in pGrep->hDbex
 * with rpa_dbex_destroy().
 *
 * Lines around the call are missing from this extract.
 * NOTE(review): confirm whether pGrep/hDbex are NULL-checked and whether
 * hDbex is reset afterwards.
 */
101 void rpa_grep_close(rpa_grep_t *pGrep)
104 rpa_dbex_destroy(pGrep->hDbex);
/*
 * rpa_grep_optimizations - set the RPA_DBEXCFG_OPTIMIZATIONS configuration
 * value of the rules database to `allow`.
 */
109 void rpa_grep_optimizations(rpa_grep_t *pGrep, rulong allow)
111 rpa_dbex_cfgset(pGrep->hDbex, RPA_DBEXCFG_OPTIMIZATIONS, allow);
/*
 * rpa_grep_destroy - tear down a grep context; calls rpa_grep_close() on it.
 *
 * Other statements of the body are not visible in this extract.
 * NOTE(review): presumably also frees pGrep itself -- confirm.
 */
115 void rpa_grep_destroy(rpa_grep_t *pGrep)
119 rpa_grep_close(pGrep);
/*
 * rpa_grep_load_string_pattern - thin wrapper that forwards `buf` unchanged to
 * rpa_grep_load_pattern() and returns its result.
 */
124 int rpa_grep_load_string_pattern(rpa_grep_t *pGrep, rpa_buffer_t *buf)
126 return rpa_grep_load_pattern(pGrep, buf);
/*
 * rpa_grep_load_pattern - load the rule text held in `buf` into the rules
 * database of `pGrep`.
 *
 * Opens pGrep->hDbex, feeds buf->s to rpa_dbex_load() for as long as that call
 * returns a positive value, closes the database and stores rpa_dbex_last() in
 * pGrep->hPattern. Failures are reported on stdout; for a syntax error the
 * message carries a line number obtained by walking backwards from the error
 * position looking for '\n' characters.
 *
 * Several lines (loop bodies, return statements, braces) are missing from this
 * extract, so the return values and the exact branch structure cannot be read
 * from here.
 */
130 int rpa_grep_load_pattern(rpa_grep_t *pGrep, rpa_buffer_t *buf)
133 int inputsize = buf->size;
134 const char *pattern = buf->s;
136 if (rpa_dbex_open(pGrep->hDbex) < 0) {
137 fprintf(stdout, "Failed to open rules database.\n");
/*
 * Keep loading while rpa_dbex_load() returns a positive value. The loop body
 * is not visible; presumably it advances `pattern` and shrinks `inputsize`
 * by `ret` -- confirm.
 */
141 while ((ret = rpa_dbex_load(pGrep->hDbex, pattern, inputsize)) > 0) {
146 rpa_errinfo_t errinfo;
147 rpa_dbex_lasterrorinfo(pGrep->hDbex, &errinfo);
148 if (errinfo.code == RPA_E_SYNTAX_ERROR) {
/*
 * NOTE(review): the offset is added to the current `pattern`, not to buf->s;
 * whether errinfo.offset is relative to the last load call or to the whole
 * buffer cannot be told from here -- confirm.
 */
149 pattern += errinfo.offset;
/*
 * NOTE(review): unless the hidden loop body breaks out earlier, this loop only
 * ends after `pattern` has been decremented to one element before buf->s.
 * Forming that pointer is undefined behavior in C; an index counting down to
 * zero would avoid it.
 */
150 for (line = 1; pattern >= buf->s; --pattern) {
151 if (*pattern == '\n')
/* NOTE(review): diagnostics are written to stdout, not stderr -- confirm intent. */
154 fprintf(stdout, "Line: %d, ERROR: Syntax Error.\n", line);
156 fprintf(stdout, "ERROR: Pattern Loading failed.\n");
161 rpa_dbex_close(pGrep->hDbex);
/* The most recently loaded rule becomes the pattern used by match/parse/scan. */
162 pGrep->hPattern = rpa_dbex_last(pGrep->hDbex);
/* NOTE(review): second close call; presumably the error exit path -- confirm. */
166 rpa_dbex_close(pGrep->hDbex);
/*
 * rpa_grep_list_patterns - dump the rules of the database via
 * rpa_dbex_dumprules().
 */
171 void rpa_grep_list_patterns(rpa_grep_t *pGrep)
173 rpa_dbex_dumprules(pGrep->hDbex);
/*
 * rpa_grep_dump_pattern_records - dump the records of the database via
 * rpa_dbex_dumprecords().
 */
177 void rpa_grep_dump_pattern_records(rpa_grep_t *pGrep)
179 rpa_dbex_dumprecords(pGrep->hDbex);
/*
 * rpa_grep_dump_pattern_info - compile the rules database, then dump its info
 * via rpa_dbex_dumpinfo().
 *
 * NOTE(review): the result of rpa_dbex_compile() is not checked in the visible
 * lines -- confirm whether a failed compile should suppress the dump.
 */
183 void rpa_grep_dump_pattern_info(rpa_grep_t *pGrep)
185 rpa_dbex_compile(pGrep->hDbex);
186 rpa_dbex_dumpinfo(pGrep->hDbex);
/*
 * rpa_grep_dump_alias_info - compile the rules database, then dump alias data
 * via rpa_dbex_dumpalias().
 *
 * NOTE(review): the result of rpa_dbex_compile() is not checked in the visible
 * lines -- confirm whether a failed compile should suppress the dump.
 */
190 void rpa_grep_dump_alias_info(rpa_grep_t *pGrep)
192 rpa_dbex_compile(pGrep->hDbex);
193 rpa_dbex_dumpalias(pGrep->hDbex);
/*
 * rpa_grep_match - run pGrep->hPattern against `buffer` with rpa_stat_match().
 *
 * A stat object (hStat) is created for this call, configured from pGrep
 * (cache-disable flag, encoding, exec-debug flag) and destroyed at the end.
 * The file name, the first `ret` bytes of the input and a newline are written
 * through the rpa_grep_* output helpers, and the cache-hit, orphan-record and
 * emit-stack figures of hStat are copied into pGrep.
 *
 * The check on hStat, the condition guarding the output and the return
 * statement are not visible in this extract.
 * NOTE(review): presumably the output happens only when ret > 0 -- confirm.
 */
197 int rpa_grep_match(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
/* The match starts at the beginning of the buffer: input == start. */
201 const char *input = buffer, *start = buffer, *end = buffer + size;
203 hStat = rpa_stat_create(pGrep->hDbex, 0);
206 rpa_stat_cachedisable(hStat, pGrep->disablecache);
207 rpa_stat_encodingset(hStat, pGrep->encoding);
208 hStat->debug = pGrep->execdebug;
209 ret = rpa_stat_match(hStat, pGrep->hPattern, input, start, end);
211 rpa_grep_print_filename(pGrep);
/* `ret` is used as the byte length of the matched text. */
212 rpa_grep_output(pGrep, input, ret, pGrep->encoding);
213 rpa_grep_output_utf8_string(pGrep, "\n");
/* Copy the statistics out before hStat is destroyed. */
215 pGrep->cachehit = hStat->cache->hit;
216 pGrep->orphrecords = r_array_length(hStat->orphans);
217 pGrep->emitstacksize = r_array_length(hStat->emitstack);
219 rpa_stat_destroy(hStat);
/*
 * rpa_grep_parse - parse `buffer` with pGrep->hPattern and print the records
 * produced by rpa_stat_parse().
 *
 * For RPA_GREPTYPE_PARSE every record whose type has RPA_RECORD_END set is
 * printed as "<rule> (<offset>, <size>): <text>" followed by a newline, where
 * <offset> is relative to the start of the buffer. For RPA_GREPTYPE_PARSEAST
 * every record is passed to rpa_record_dump(). The record array is destroyed
 * afterwards and the cache-hit, orphan-record and emit-stack figures are
 * copied into pGrep before hStat is destroyed.
 *
 * The declarations of `location`, `i`, `prec` and `hStat`, the check on hStat
 * and the return statement are not visible in this extract.
 */
224 int rpa_grep_parse(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
229 rarray_t *records = NULL;
231 const char *input = buffer, *start = buffer, *end = buffer + size;
233 hStat = rpa_stat_create(pGrep->hDbex, 0);
236 rpa_stat_cachedisable(hStat, pGrep->disablecache);
237 rpa_stat_encodingset(hStat, pGrep->encoding);
238 hStat->debug = pGrep->execdebug;
/*
 * NOTE(review): the return value is ignored here; if `records` can stay NULL
 * on failure, the r_array_length()/r_array_destroy() calls below receive
 * NULL -- confirm those helpers accept it.
 */
239 rpa_stat_parse(hStat, pGrep->hPattern, input, start, end, &records);
241 if (pGrep->greptype == RPA_GREPTYPE_PARSE) {
242 for (i = 0; i < r_array_length(records); i++) {
243 prec = (rparecord_t *)r_array_slot(records, i);
/* Only records flagged RPA_RECORD_END are printed. */
244 if (prec->type & RPA_RECORD_END) {
245 rpa_grep_output_utf8_string(pGrep, prec->rule);
/* NOTE(review): "%ld" assumes rlong is a long -- confirm the typedef. */
246 r_snprintf(location, sizeof(location), " (%ld, %ld)", (rlong)(prec->input - input), (rlong)prec->inputsiz);
247 rpa_grep_output_utf8_string(pGrep, location);
248 rpa_grep_output_utf8_string(pGrep, ": ");
249 rpa_grep_output(pGrep, prec->input, prec->inputsiz, pGrep->encoding);
250 rpa_grep_output_utf8_string(pGrep, "\n");
253 } else if (pGrep->greptype == RPA_GREPTYPE_PARSEAST) {
254 for (i = 0; i < r_array_length(records); i++) {
255 rpa_record_dump(records, i);
259 r_array_destroy(records);
261 pGrep->cachehit = hStat->cache->hit;
262 pGrep->orphrecords = r_array_length(hStat->orphans);
263 pGrep->emitstacksize = r_array_length(hStat->emitstack);
265 rpa_stat_destroy(hStat);
/*
 * rpa_grep_scan - search `buffer` for pGrep->hPattern with rpa_stat_scan() and
 * print what was matched.
 *
 * After a scan the file name, the matched text (`ret` bytes at `matched`) and
 * a newline are written, and `input` is moved just past the match when the
 * match is non-empty and ends before `end`. Cache-hit and orphan-record
 * figures are first copied from hStat and then accumulated after the scan
 * call; the emit-stack size is recorded before hStat is destroyed.
 *
 * The loop construct, the conditions guarding the output, the branch taken
 * when `input` is not advanced and the return statement are not visible in
 * this extract.
 * NOTE(review): presumably the scan/output statements sit inside a loop that
 * repeats until no further match is found -- confirm.
 */
270 int rpa_grep_scan(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
276 const char *input = buffer, *start = buffer, *end = buffer + size;
278 hStat = rpa_stat_create(pGrep->hDbex, 0);
281 rpa_stat_encodingset(hStat, pGrep->encoding);
282 rpa_stat_cachedisable(hStat, pGrep->disablecache);
283 hStat->debug = pGrep->execdebug;
/* Seed the counters; they are added to after the scan call below. */
284 pGrep->cachehit = hStat->cache->hit;
285 pGrep->orphrecords = r_array_length(hStat->orphans);
288 ret = rpa_stat_scan(hStat, pGrep->hPattern, input, start, end, &matched);
289 pGrep->cachehit += hStat->cache->hit;
290 pGrep->orphrecords += r_array_length(hStat->orphans);
295 rpa_grep_print_filename(pGrep);
297 rpa_grep_output(pGrep, matched, ret, pGrep->encoding);
298 rpa_grep_output_utf8_string(pGrep, "\n");
/* Continue behind the match only if it is non-empty and input remains. */
300 if (ret && matched + ret < end) {
301 input = matched + ret;
305 pGrep->emitstacksize = r_array_length(hStat->emitstack);
306 rpa_stat_destroy(hStat);
/*
 * rpa_grep_scan_lines - apply pGrep->hPattern to `buffer` one line at a time
 * and print lines through the rpa_grep_* output helpers.
 *
 * The end of a line is found by looking for a '\n' code unit: 16-bit units
 * for the two UTF16LE encodings, single bytes otherwise. `lend` is stepped
 * past the newline, and rpa_stat_scan() is run on [lstart, lend). The output
 * call writes the whole line (lstart .. lend), not just the matched part.
 *
 * The outer loop, the condition guarding the output, the update of `lstart`
 * and `lines`, and the return statement are not visible in this extract.
 */
311 int rpa_grep_scan_lines(rpa_grep_t *pGrep, const char* buffer, unsigned long size)
317 unsigned long lines = 0;
318 const char *end = buffer + size, *lstart = buffer, *lend;
320 hStat = rpa_stat_create(pGrep->hDbex, 0);
323 rpa_stat_encodingset(hStat, pGrep->encoding);
324 hStat->debug = pGrep->execdebug;
327 if (pGrep->encoding == RPA_ENCODING_UTF16LE || pGrep->encoding == RPA_ENCODING_ICASE_UTF16LE) {
328 for (lend = lstart; lend < end; lend += sizeof(unsigned short)) {
/*
 * NOTE(review): reading through an `unsigned short *` cast assumes `lend` is
 * suitably aligned and that two bytes are available; with an odd number of
 * remaining bytes the last read extends one byte past `end`. The comparison
 * also depends on host byte order. Confirm, or assemble the unit from bytes.
 */
329 if (*((unsigned short*)lend) == L'\n') {
331 lend += sizeof(unsigned short);
336 for (lend = lstart; lend < end; lend += sizeof(unsigned char)) {
337 if (*((unsigned char*)lend) == '\n') {
339 lend += sizeof(unsigned char);
/* Scan only the current line: both input and start are lstart. */
346 ret = rpa_stat_scan(hStat, pGrep->hPattern, lstart, lstart, lend, &matched);
350 rpa_grep_print_filename(pGrep);
352 rpa_grep_output(pGrep, lstart, lend - lstart, pGrep->encoding);
358 rpa_stat_destroy(hStat);
/*
 * rpa_grep_scan_buffer - choose the input encoding for `buf`, run the operation
 * selected by pGrep->greptype on it and add the elapsed time to
 * pGrep->scanmilisec.
 *
 * Encoding selection, as far as visible:
 *   - RPA_GREP_FORCE_BYTE: byte encoding;
 *   - RPA_GREP_FORCE_UTF16: UTF16LE, with two bytes removed from `size` when
 *     the buffer starts with the two-byte marker tested below;
 *   - otherwise: UTF16LE when that marker is present, else UTF8.
 * Each choice has an ICASE variant used when pGrep->icase is set.
 *
 * The assignments of `input`, the default `size`, `btime` and the break/default
 * lines of the switch are not visible in this extract.
 */
363 void rpa_grep_scan_buffer(rpa_grep_t *pGrep, rpa_buffer_t *buf)
367 clock_t btime, scanclocks;
369 if (pGrep->forceEncoding == RPA_GREP_FORCE_BYTE) {
372 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_BYTE : RPA_ENCODING_BYTE;
373 } else if (pGrep->forceEncoding == RPA_GREP_FORCE_UTF16) {
/*
 * NOTE(review): buf->s appears to be a plain `char *`, so comparing against
 * -1 / -2 recognises the bytes 0xFF 0xFE only where plain char is signed.
 * Where char is unsigned the test never succeeds. Comparing
 * (unsigned char) values against 0xFF / 0xFE would be portable.
 */
374 if (buf->size >= 2 && buf->s[0] == -1 && buf->s[1] == -2) {
376 size = buf->size - 2;
381 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF16LE : RPA_ENCODING_UTF16LE;
/* Same marker test (and the same char-signedness caveat) without a forced encoding. */
382 } else if (buf->size >= 2 && buf->s[0] == -1 && buf->s[1] == -2) {
384 size = buf->size - 2;
385 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF16LE : RPA_ENCODING_UTF16LE;
387 pGrep->encoding = pGrep->icase ? RPA_ENCODING_ICASE_UTF8 : RPA_ENCODING_UTF8;
394 switch (pGrep->greptype) {
395 case RPA_GREPTYPE_SCANLINES:
396 rpa_grep_scan_lines(pGrep, input, size);
398 case RPA_GREPTYPE_MATCH:
399 rpa_grep_match(pGrep, input, size);
/* Both parse flavours share rpa_grep_parse(), which branches on greptype. */
401 case RPA_GREPTYPE_PARSEAST:
402 case RPA_GREPTYPE_PARSE:
403 rpa_grep_parse(pGrep, input, size);
405 case RPA_GREPTYPE_SCAN:
406 rpa_grep_scan(pGrep, input, size);
/* NOTE(review): presumably the `default:` branch, also scanning -- confirm. */
409 rpa_grep_scan(pGrep, input, size);
/* Convert elapsed clock ticks to milliseconds and accumulate them. */
413 scanclocks = clock() - btime;
414 pGrep->scanmilisec += (unsigned long)(((unsigned long long)1000)*scanclocks/CLOCKS_PER_SEC);
/*
 * rpa_buffer_loadfile - read stream `pFile` until end-of-file into a newly
 * allocated rpa_buffer_t.
 *
 * Starts with a buffer of 2 * memchunk bytes and reads pieces of at most
 * memchunk - 1 bytes at offset `inputsize`. Whenever fewer than memchunk bytes
 * separate `inputsize` from buf->size, the buffer is grown by memchunk with
 * rpa_buffer_realloc(); a failure there prints "Out of memory!" to stderr.
 * After each read the data is '\0'-terminated at `inputsize` and buf->size is
 * set to `inputsize`. A read error destroys the buffer.
 *
 * The update of `inputsize` and all return statements are not visible in this
 * extract.
 * NOTE(review): presumably `inputsize += ret` follows the read and NULL is
 * returned on failure -- confirm.
 * NOTE(review): fread() returns size_t but the result is stored in an int, and
 * `inputsize` is an int mixed with what appears to be an unsigned size --
 * confirm that very large inputs cannot overflow these.
 * NOTE(review): because buf->size is reset to the data length each pass, the
 * growth check appears to trigger on every subsequent iteration -- confirm the
 * reallocated capacity always covers the read plus the terminator.
 */
418 rpa_buffer_t *rpa_buffer_loadfile(FILE *pFile)
420 unsigned int memchunk = 256;
421 int ret = 0, inputsize = 0;
424 buf = rpa_buffer_alloc(2 * memchunk);
429 if ((buf->size - inputsize) < memchunk) {
430 if (rpa_buffer_realloc(buf, buf->size + memchunk) < 0) {
431 fprintf(stderr, "Out of memory!\n");
/* One byte less than memchunk is read, which leaves room for the terminator. */
435 ret = fread(&buf->s[inputsize], 1, memchunk - 1, pFile);
436 if ((ret <= 0) && ferror(pFile)) {
437 rpa_buffer_destroy(buf);
441 buf->s[inputsize] = '\0';
442 buf->size = inputsize;
443 } while (!feof(pFile));
/*
 * rpa_callback_output - callback with the same parameter list as
 * rpa_callback_matched_output() (stat object, rule name, user data, input
 * pointer, size, reason).
 *
 * Only the signature is visible in this extract; the body is missing.
 * NOTE(review): behaviour and return value must be confirmed against the full
 * source.
 */
449 int rpa_callback_output(rpastat_t * stat, const char *name, void *userdata, const char *input, unsigned int size, unsigned int reason)
/*
 * rpa_callback_matched_output - callback that prints "<name>: <text>" followed
 * by a newline.
 *
 * `userdata` is interpreted as the rpa_grep_t context; `input`/`size` describe
 * the text, which is written using the context's current encoding. `stat` and
 * `reason` are not used in the visible lines. The return statement is not
 * visible in this extract.
 */
456 int rpa_callback_matched_output(rpastat_t * stat, const char *name, void *userdata, const char *input, unsigned int size, unsigned int reason)
458 rpa_grep_t *pGrep = (rpa_grep_t *)userdata;
460 rpa_grep_output_utf8_string(pGrep, name);
461 rpa_grep_output_utf8_string(pGrep, ": ");
462 rpa_grep_output(pGrep, input, size, pGrep->encoding);
463 rpa_grep_output_utf8_string(pGrep, "\n");
/*
 * rpa_grep_setup_callback - takes the grep context and a pattern buffer.
 *
 * Only the signature is visible in this extract; the body is missing.
 * NOTE(review): presumably registers rpa_callback_output for `pattern` --
 * confirm against the full source.
 */
469 void rpa_grep_setup_callback(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
/*
 * rpa_grep_setup_matched_callback - takes the grep context and a pattern
 * buffer.
 *
 * Only the signature is visible in this extract; the body is missing.
 * NOTE(review): presumably registers rpa_callback_matched_output for
 * `pattern` -- confirm against the full source.
 */
475 void rpa_grep_setup_matched_callback(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
/*
 * rpa_grep_dump_pattern_tree - pass the text in pattern->s to
 * rpa_dbex_dumptree_s() on the rules database, with 0 as the last argument.
 *
 * NOTE(review): the meaning of the trailing 0 is not shown here -- confirm
 * against the rpa_dbex API.
 */
481 void rpa_grep_dump_pattern_tree(rpa_grep_t *pGrep, rpa_buffer_t *pattern)
483 rpa_dbex_dumptree_s(pGrep->hDbex, pattern->s, 0);
/*
 * rpa_grep_output - decode `size` bytes at `s` and emit each decoded character
 * with rpa_grep_output_char().
 *
 * The two UTF16LE encodings are decoded with rpa_grep_utf16_mbtowc();
 * every other encoding value is decoded with rpa_grep_utf8_mbtowc(). Decoding
 * stops when the decoder returns 0. `pGrep` is not used in the visible lines.
 *
 * The declarations of `wc`/`ret` and the statements that advance `input` are
 * not visible in this extract.
 * NOTE(review): presumably `input += ret` follows each emitted character --
 * confirm, including how a negative decoder result is handled.
 */
487 void rpa_grep_output(rpa_grep_t *pGrep, const char *s, unsigned long size, unsigned int encoding)
/* Work on unsigned bytes; `end` is one past the last byte to decode. */
489 const unsigned char *input = (const unsigned char*)s;
490 const unsigned char *end = input + size;
494 if (encoding == RPA_ENCODING_UTF16LE || encoding == RPA_ENCODING_ICASE_UTF16LE) {
495 while ((ret = (int)rpa_grep_utf16_mbtowc(&wc, input, end)) != 0) {
496 rpa_grep_output_char(wc);
500 while ((ret = (int)rpa_grep_utf8_mbtowc(&wc, input, end)) != 0) {
501 rpa_grep_output_char(wc);
/*
 * rpa_grep_output_utf8_string - write the '\0'-terminated string `s` through
 * rpa_grep_output() as RPA_ENCODING_UTF8, independent of pGrep->encoding.
 *
 * NOTE(review): `s` is handed straight to strlen(), so it must not be NULL;
 * callers pass values such as prec->rule and callback names -- confirm those
 * are never NULL.
 */
508 void rpa_grep_output_utf8_string(rpa_grep_t *pGrep, const char *s)
510 rpa_grep_output(pGrep, s, strlen(s), RPA_ENCODING_UTF8);
/*
 * rpa_grep_output_utf16_string - write a string of 16-bit units through
 * rpa_grep_output() as RPA_ENCODING_UTF16LE.
 *
 * `size` is accumulated in bytes, sizeof(unsigned short) at a time, and then
 * passed as the byte length. The loop that walks `pstr` is not visible in this
 * extract.
 * NOTE(review): presumably the loop runs until a zero unit is found -- confirm.
 */
514 void rpa_grep_output_utf16_string(rpa_grep_t *pGrep, const unsigned short *s)
516 unsigned long size = 0;
517 const unsigned short *pstr = s;
520 size += sizeof(unsigned short);
523 rpa_grep_output(pGrep, (const char*)s, size, RPA_ENCODING_UTF16LE);