RPA Toolkit
Work on rexdfa_t matching. Replaced rexdb_t DFA with rexdfa_t in rexgrep.
[rpatk.git] / rex / rexdfa.c
1 /*
2  *  Regular Pattern Analyzer (RPA)
3  *  Copyright (c) 2009-2010 Martin Stoilov
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  *  Martin Stoilov <martin@rpasearch.com>
19  */
20
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <ctype.h>
24 #include "rlib/rmem.h"
25 #include "rex/rexdfa.h"
26
27
28 struct rexdfa_ctx {
29         unsigned long nstates;
30         unsigned long ntrnas;
31         unsigned long nsubstates;
32         unsigned long naccsubstates;
33 };
34
35
36 rexdfa_t *rex_dfa_create(unsigned long nstates, unsigned long ntrans, unsigned long nsubstates, unsigned long naccsubstates)
37 {
38         rexdfa_t *dfa = (rexdfa_t *)r_zmalloc(sizeof(*dfa));
39         dfa->states = r_zmalloc(sizeof(rexdfs_t) * nstates);
40         dfa->trans = r_zmalloc(sizeof(rexdft_t) * ntrans);
41         dfa->substates = r_zmalloc(sizeof(rexdfss_t) * nsubstates);
42         dfa->accsubstates = r_zmalloc(sizeof(rexdfss_t) * naccsubstates);
43         dfa->nstates = nstates;
44         dfa->ntrans = ntrans;
45         dfa->nsubstates = nsubstates;
46         dfa->naccsubstates = naccsubstates;
47         return dfa;
48 }
49
50
51 void rex_dfa_destroy(rexdfa_t *dfa)
52 {
53         if (dfa) {
54                 r_free(dfa->substates);
55                 r_free(dfa->accsubstates);
56                 r_free(dfa->states);
57                 r_free(dfa->trans);
58                 r_free(dfa);
59         }
60 }
61
62
63 static void rex_dfa_fillstate(rexdb_t *db, rexdfa_t *dfa, struct rexdfa_ctx *ctx, rexstate_t *state)
64 {
65         long i;
66         rex_transition_t *t = NULL;
67         rexdfs_t *s = &dfa->states[ctx->nstates++];
68         s->type = state->type;
69         s->trans = ctx->ntrnas;
70         s->ntrans = r_array_length(state->trans);
71         for (i = 0; i < s->ntrans; i++) {
72                 t = (rex_transition_t *)r_array_slot(state->trans, i);
73                 dfa->trans[s->trans + i].lowin = t->lowin;
74                 dfa->trans[s->trans + i].highin = t->highin;
75                 dfa->trans[s->trans + i].state = t->dstuid;
76         }
77         ctx->ntrnas += s->ntrans;
78         s->substates = ctx->nsubstates;
79         s->nsubstates = rex_subset_length(state->subset);
80         for (i = 0; i < s->nsubstates; i++) {
81                 unsigned long uid = rex_subset_index(state->subset, i);
82                 rexsubstate_t *substate = rex_db_getsubstate(db, uid);
83                 dfa->substates[s->substates + i].uid = uid;
84                 dfa->substates[s->substates + i].type = substate->ss_type;
85                 dfa->substates[s->substates + i].userdata = substate->ss_userdata;
86         }
87         ctx->nsubstates += s->nsubstates;
88         s->accsubstates = ctx->naccsubstates;
89         s->naccsubstates = 0L;
90         for (i = 0; i < s->nsubstates; i++) {
91                 unsigned long uid = rex_subset_index(state->subset, i);
92                 rexsubstate_t *substate = rex_db_getsubstate(db, uid);
93                 if (substate->ss_type == REX_STATETYPE_ACCEPT) {
94                         dfa->accsubstates[s->accsubstates + s->naccsubstates].uid = uid;
95                         dfa->accsubstates[s->accsubstates + s->naccsubstates].type = substate->ss_type;
96                         dfa->accsubstates[s->accsubstates + s->naccsubstates].userdata = substate->ss_userdata;
97                         s->naccsubstates++;
98                 }
99         }
100         ctx->naccsubstates += s->naccsubstates;
101 }
102
103
104 rexdfa_t *rex_dfa_create_from_db(rexdb_t *db)
105 {
106         long i;
107         rexdfa_t *dfa;
108         struct rexdfa_ctx ctx;
109         unsigned long nstates = rex_db_numstates(db);
110         unsigned long ntrans = rex_db_numtransitions(db);
111         unsigned long nsubstates = rex_db_numsubstates(db);
112         unsigned long naccsubstates = rex_db_numaccsubstates(db);
113         dfa = rex_dfa_create(nstates, ntrans, nsubstates, naccsubstates);
114         r_memset(&ctx, 0, sizeof(ctx));
115
116         for (i = 0; i < r_array_length(db->states); i++) {
117                 rexstate_t *state = rex_db_getstate(db, i);
118                 rex_dfa_fillstate(db, dfa, &ctx, state);
119         }
120         R_ASSERT(ctx.nstates == nstates);
121         R_ASSERT(ctx.ntrnas == ntrans);
122         R_ASSERT(ctx.nsubstates == nsubstates);
123         R_ASSERT(ctx.naccsubstates == naccsubstates);
124         return dfa;
125 }
126
127
128 rexdfs_t *rex_dfa_state(rexdfa_t *dfa, unsigned long nstate)
129 {
130         rexdfs_t *s;
131         if (nstate >= dfa->nstates)
132                 return NULL;
133         s = &dfa->states[nstate];
134         return s;
135 }
136
137
138 rexdft_t *rex_dfa_transition(rexdfa_t *dfa, unsigned long nstate, unsigned long ntrans)
139 {
140         rexdft_t *t;
141         rexdfs_t *s = rex_dfa_state(dfa, nstate);
142         if (!s)
143                 return NULL;
144         if (ntrans >= s->ntrans)
145                 return NULL;
146         t = &dfa->trans[s->trans + ntrans];
147         return t;
148 }
149
150
151 rexdfss_t *rex_dfa_substate(rexdfa_t *dfa, unsigned long nstate, unsigned long nsubstate)
152 {
153         rexdfss_t *ss;
154         rexdfs_t *s = rex_dfa_state(dfa, nstate);
155         if (!s)
156                 return NULL;
157         if (nsubstate >= s->nsubstates)
158                 return NULL;
159         ss = &dfa->substates[s->substates + nsubstate];
160         return ss;
161 }
162
163
164 rexdfss_t *rex_dfa_accsubstate(rexdfa_t *dfa, unsigned long nstate, unsigned long naccsubstate)
165 {
166         rexdfss_t *ss;
167         rexdfs_t *s = rex_dfa_state(dfa, nstate);
168         if (!s)
169                 return NULL;
170         if (naccsubstate >= s->naccsubstates)
171                 return NULL;
172         ss = &dfa->substates[s->accsubstates + naccsubstate];
173         return ss;
174 }
175
176
177 long rex_dfa_next(rexdfa_t *dfa, unsigned long nstate, rexchar_t input)
178 {
179         rexdft_t *t;
180         rexdfs_t *s = rex_dfa_state(dfa, nstate);
181         long min, max, mid;
182
183         if (!s || !s->ntrans)
184                 return 0L;
185         min = 0;
186         max = min + s->ntrans;
187         while (max > min) {
188                 mid = (min + max)/2;
189                 t = rex_dfa_transition(dfa, nstate, mid);
190                 if (input >= t->lowin) {
191                         min = mid + 1;
192                 } else {
193                         max = mid;
194                 }
195         }
196         if (min > 0)
197                 --min;
198         t = rex_dfa_transition(dfa, nstate, min);
199         if (input >= t->lowin && input <= t->highin)
200                 return t->state;
201         return 0;
202 }
203
204
205 void rex_dfa_dumpstate(rexdfa_t *dfa, unsigned long nstate)
206 {
207         long index;
208         char buf[240];
209         int bufsize = sizeof(buf) - 1;
210         int n = 0;
211         rexdfs_t *s = rex_dfa_state(dfa, nstate);
212         rexdft_t *t = NULL;
213
214         if (!s)
215                 return;
216         fprintf(stdout, "State %ld", nstate);
217         fprintf(stdout, " (");
218         for (index = 0; index < s->nsubstates; index++) {
219                 fprintf(stdout, "%ld", dfa->substates[s->substates + index].uid);
220                 if (dfa->substates[s->substates + index].type == REX_STATETYPE_ACCEPT)
221                         fprintf(stdout, "*");
222                 if (index + 1 < s->nsubstates)
223                         fprintf(stdout, ",");
224         }
225         fprintf(stdout, ")");
226
227
228         fprintf(stdout, ": ");
229         if (s->type == REX_STATETYPE_ACCEPT) {
230                 fprintf(stdout, " REX_STATETYPE_ACCEPT ");
231                 fprintf(stdout, " (");
232                 for (index = 0; index < s->naccsubstates; index++) {
233                         fprintf(stdout, "%ld*", dfa->accsubstates[s->accsubstates + index].uid);
234                         if (index + 1 < s->naccsubstates)
235                                 fprintf(stdout, ",");
236                 }
237                 fprintf(stdout, ")");
238         } else if (s->type == REX_STATETYPE_DEAD) {
239                 fprintf(stdout, " REX_STATETYPE_DEAD ");
240         } else if (s->type == REX_STATETYPE_START) {
241                 fprintf(stdout, " REX_STATETYPE_START ");
242         }
243         fprintf(stdout, "\n");
244         for (index = 0; index < s->ntrans; index++) {
245                 t = &dfa->trans[s->trans + index];
246                 n = 0;
247                 if (t->lowin != t->highin) {
248                         if (isprint(t->lowin) && !isspace(t->lowin) && isprint(t->highin) && !isspace(t->highin))
249                                 n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        [%c - %c] ", t->lowin, t->highin);
250                         else
251                                 n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        [0x%X - 0x%X] ", t->lowin, t->highin);
252                 } else {
253                         if (isprint(t->lowin) && !isspace(t->lowin))
254                                 n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        '%c' ", t->lowin);
255                         else
256                                 n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "        0x%X ", t->lowin);
257                 }
258                 r_memset(buf + n, ' ', bufsize - n);
259                 n = 40;
260                 n += r_snprintf(buf + n, n < bufsize ? bufsize - n : 0, "-> %ld", t->state);
261                 fprintf(stdout, "%s\n", buf);
262         }
263         if (!s->ntrans)
264                 fprintf(stdout, "        (none)\n");
265         fprintf(stdout, "\n");
266
267 }
268
269