贴一个字符串匹配算法实现:
- 将简化版的正则表达式 转换成 nfa,
- nfa 子集法,转换成 dfa
- 利用dfa匹配输入串
因为这是嵌入式设备上提炼的一个简单需求,没有考虑周全,有以下缺点:
- 没有完全支持正则表达式,比如字符集就有限制:不能出现正则式元字符
- dfa的状态数有限制,最多128个状态
#define MAX_STATE 128
#define EMPTY_INPUT '/0'
/*
* pattern string:
* regular expression meta characters : '?', '+', '*', '[', ']', '^', '(', ')', '.'
* character set: a subset of ascii character set - { ?+*^[](). }
*/
// list head has no state inside
struct state_set {
int state;
struct state_set *next;
};
struct dfa_transform {
char input;
int state;
struct dfa_transform *next;
};
struct dfa_transform_entry {
int ntrans;
int state;
struct dfa_transform *transform;
};
struct dfa {
int nstates;
int start;
struct state_set *terminals;
struct dfa_transform_entry *trans_table;
// char *alpha_set;
char *matched_str;
};
struct dfa *dfa_create(const char *pattern);
void dfa_release(struct dfa *d);
void dfa_set_alpha(const char *alpha);
int dfa_match(const struct dfa *d, const char *string);
#endif
#include "dfa.h"
static char *alpha_set = "012345678";
static int is_alpha(const char *alpha_set, char c)
{
int i;
for (i = 0; i != strlen(alpha_set); i++)
if (alpha_set[i] == c)
return 1;
return 0;
}
static void free_set(struct state_set *set)
{
if (!set)
return;
if (set->next)
free_set(set->next);
free(set);
};
static void free_transform(struct dfa_transform *t)
{
if (!t)
return;
if (t->next)
free_transform(t->next);
free(t);
}
static void free_dfa(struct dfa *dfa)
{
int i;
if (!dfa)
return;
free_set(dfa->terminals);
for (i = 0; i != dfa->nstates; i++)
free_transform(dfa->trans_table[i].transform);
if (dfa->trans_table)
free(dfa->trans_table);
};
static void dfa_dump(const struct dfa *d)
{
struct state_set *set_iter = d->terminals->next;
struct dfa_transform *trans_iter;
int state = 0;
printf("/n/nnfa (maybe dfa):/n/n");
printf("/tnstates:/t/t%d/n", d->nstates);
printf("/tstart:/t/t/t%d/n", d->start);
printf("/tterminal states:/t{");
for (; set_iter; set_iter = set_iter->next)
printf("%d, ", set_iter->state);
printf("}/n");
printf("/talpha set:/t/t/"%s/"/n", alpha_set);
printf("/n/ttransfer table:/n/n/tstate/ttransforms(input -> state)/n");
for (; state != d->nstates; state++) {
printf("/t%d/t", state);
for (trans_iter = d->trans_table[state].transform; trans_iter; trans_iter = trans_iter->next)
printf("(%c -> %d), ", trans_iter->input, trans_iter->state);
printf("/n");
}
}
static int in_set(const struct state_set *set, int state)
{
struct state_set *iter = set->next;
for (; iter && iter->state != state; iter = iter->next);
if (iter)
return 1;
return 0;
}
static void set_add_state(struct state_set *set, int state)
{
struct state_set *new_elem;
if (set && in_set(set, state))
return;
new_elem = malloc(sizeof(struct state_set));
new_elem->state = state;
new_elem->next = set->next;
set->next = new_elem;
}
static int same_set(const struct state_set *s1, const struct state_set *s2)
{
const struct state_set *itl, *itr;
if (!s1->next && s2->next || !s2->next && s1->next)
return 0;
for (itl = s1->next; itl; itl = itl->next) {
for (itr = s2->next; itr; itr = itr->next)
if (itl->state == itr->state)
break;
if (!itr)
return 0;
}
for (itl = s2->next; itl; itl = itl->next) {
for (itr = s1->next; itr; itr = itr->next)
if (itl->state == itr->state)
break;
if (!itr)
return 0;
}
return 1;
}
static struct dfa *dfa_init(struct dfa *d)
{
d->start = d->nstates = 0;
d->terminals = malloc(sizeof(struct state_set));
d->terminals->next = 0;
d->trans_table = 0;
d->matched_str = 0;
}
static struct dfa *create_dfa_str(const char *pattern);
static struct dfa *create_dfa_n_str(const char * const*patterns, int npatterns);
static int transfer(const struct dfa *dfa, int state, char input)
{
if (state >= dfa->nstates)
return -1;
struct dfa_transform *trans_iter = (dfa->trans_table[state]).transform;
for (; trans_iter && trans_iter->input != input; trans_iter = trans_iter->next);
if (trans_iter)
return trans_iter->state;
return -1;
}
static int insert_transform(struct dfa *d, int state, char input, int next_state)
{
struct dfa_transform *new_trans;
struct dfa_transform_entry *entry = &(d->trans_table[state]);
new_trans = malloc(sizeof(struct dfa_transform));
new_trans->input = input;
new_trans->state = next_state;
new_trans->next = 0;
new_trans->next = entry->transform;
entry->transform = new_trans;
entry->ntrans++;
return 0;
}
static struct state_set *set_transfer(const struct dfa *dfa, const struct state_set *set, char input)
{
struct dfa_transform *trans_iter;
struct state_set *ret_set = 0;
const struct state_set *set_iter;
int state;
if (!set || (set && !set->next))
return 0;
for (set_iter = set->next; set_iter; set_iter = set_iter->next) {
state = set_iter->state;
trans_iter = (dfa->trans_table[state].transform);
for (; trans_iter; trans_iter = trans_iter->next) {
if (trans_iter->input == input) {
if (!ret_set) {
ret_set = malloc(sizeof(struct state_set));
ret_set->next = 0;
}
set_add_state(ret_set, trans_iter->state);
}
}
}
printf("set_transfer {");
for (set_iter = set->next; set_iter; set_iter = set_iter->next)
printf("%d, ", set_iter->state);
printf("}: (/'%c/' -> {", input);
if (ret_set)
for (set_iter = ret_set->next; set_iter; set_iter = set_iter->next)
printf("%d, ", set_iter->state);
printf("}/n");
return ret_set;
}
static struct state_set *e_closure(const struct dfa *nfa, const struct state_set *set)
{
struct dfa_transform *trans_iter;
struct state_set *e_set, *set_iter;
char *state_tag_map = malloc(nfa->nstates);
int *state_queue = malloc(nfa->nstates * 4);
int tail = 0, head = 0, qsize = nfa->nstates, s, i;
for (i = 0; i != nfa->nstates; i++)
state_tag_map[i] = 0;
e_set = malloc(sizeof(struct state_set));
e_set->next = 0;
for (set_iter = set->next; set_iter; set_iter = set_iter->next)
state_queue[head++] = set_iter->state;
while (tail != head) {
s = state_queue[tail++];
if (tail == qsize)
tail = 0;
if (state_tag_map[s] == 1)
continue;
state_tag_map[s] = 1;
set_add_state(e_set, s);
for (trans_iter = nfa->trans_table[s].transform; trans_iter; trans_iter = trans_iter->next) {
if (trans_iter->input == EMPTY_INPUT) {
for (i = tail; i % qsize!= head; i++)
if (trans_iter->state == state_queue[i % qsize])
break;
if (i % qsize != head || state_tag_map[trans_iter->state] == 1)
continue;
// add state to queue
if ( (head + 1) % qsize == tail)
printf("big warnning: queue overflow!/n");
state_queue[head] = trans_iter->state;
head = (head + 1) % qsize;
}
}
}
// dump
printf("empty closure of {");
for (set_iter = set->next; set_iter; set_iter = set_iter->next)
printf("%d, ", set_iter->state);
printf("} is {");
for (set_iter = e_set->next; set_iter; set_iter = set_iter->next)
printf("%d, ", set_iter->state);
printf("}/n");
return e_set;
}
static struct dfa *nfa_to_dfa(const struct dfa *nfa)
{
struct set_transfer {
struct state_set *set;
struct state_set *next_set;
struct set_transfer *next;
int dfa_state;
char input;
} *set_transfer_table[128], *set_trans_iter ;
struct state_set *state_set_queue[128];
int tail = 0, head = 0, nsets = 0, i, j;
struct state_set *s0, *set_iter1, *set_iter2, *new_set;
struct dfa *ret_dfa;
s0 = malloc(sizeof(struct state_set));
s0->next = 0;
set_add_state(s0, nfa->start);
state_set_queue[(head++) % 128] = e_closure(nfa, s0);
free_set(s0);
while (tail != head) {
set_iter1 = state_set_queue[(tail++) % 128];
// dump
printf("dfa state %d : state set: {", nsets);
for (set_iter2 = set_iter1->next; set_iter2; set_iter2 = set_iter2->next)
printf("%d, ", set_iter2->state);
printf("}/n");
set_transfer_table[nsets] = malloc(sizeof(struct set_transfer));
set_transfer_table[nsets]->set = set_iter1;
set_transfer_table[nsets]->next = 0;
set_transfer_table[nsets]->dfa_state = nsets;
nsets++;
set_trans_iter = set_transfer_table[nsets - 1];
for (i = 0; i != strlen(alpha_set); i++) {
new_set = set_transfer(nfa, set_iter1, alpha_set[i]);
if (new_set) {
set_iter2 = e_closure(nfa, new_set);
free_set(new_set);
for (j = 0; j != nsets; j++)
if (same_set(set_transfer_table[j]->set, set_iter2))
break;
if (j == nsets) {
for (j = tail; j % 128 != head; j++)
if (same_set(state_set_queue[j], set_iter2))
break;
if (j % 128 != head) {
free_set(set_iter2);
set_iter2 = state_set_queue[j];
printf("{");
for (new_set = set_iter2->next; new_set; new_set = new_set->next)
printf("%d, ", new_set->state);
printf("} in set_queue/n");
} else { // add to queue
if ((head + 1) % 128 == tail)
printf("big warnning: queue overflow!/n");
state_set_queue[head] = set_iter2;
head = (head + 1) % 128;
}
} else {
printf("{");
for (new_set = set_iter2->next; new_set; new_set = new_set->next)
printf("%d, ", new_set->state);
printf("} in set_transfer_table/n");
free_set(set_iter2);
set_iter2 = set_transfer_table[j]->set;
}
// insert set_transfer (set_transfer_table[nsets - 1].set, alpha_set[i], set_iter2)
printf("dfa transfer insert : {");
for (new_set = set_iter1->next; new_set; new_set= new_set->next)
printf("%d, ", new_set->state);
printf("} :/t/t/t(%c -> {", alpha_set[i]);
for (new_set = set_iter2->next; new_set; new_set= new_set->next)
printf("%d, ", new_set->state);
printf("}/n");
set_trans_iter->next = malloc(sizeof(struct set_transfer));
set_trans_iter->next->input = alpha_set[i];
set_trans_iter->next->next_set = set_iter2;
set_trans_iter->next ->next = 0;
set_trans_iter = set_trans_iter->next;
}
}
}
// construct dfa from set_transfer_table
ret_dfa = malloc(sizeof(struct dfa));
dfa_init(ret_dfa);
ret_dfa->nstates = nsets;
ret_dfa->trans_table = malloc(sizeof(struct dfa_transform_entry) * nsets);
memset(ret_dfa->trans_table, 0, sizeof(struct dfa_transform_entry) * nsets);
for (i = 0; i != nsets; i++) {
ret_dfa->trans_table[i].state = i;
if (in_set(set_transfer_table[i]->set, nfa->nstates - 1))
set_add_state(ret_dfa->terminals, set_transfer_table[i]->dfa_state);
for (set_trans_iter = set_transfer_table[i]->next;
set_trans_iter; set_trans_iter = set_trans_iter->next) {
for (j = 0; j != nsets; j++)
if (set_transfer_table[j]->set == set_trans_iter->next_set)
break;
insert_transform(ret_dfa, set_transfer_table[i]->dfa_state, set_trans_iter->input, j);
printf("set transfer to dfa: transfer(%d: /'%c/'->%d)/n"
, set_transfer_table[i]->dfa_state, set_trans_iter->input, j);
}
}
// memeory release,
// to be continue...
return ret_dfa;
}
static struct dfa *merge_dfa(const struct dfa *dfa_l, const struct dfa *dfa_r)
{
// return dfa_left + dfa_right
// to be continue...
}
/*
* pattern string:
* meta characters : { '?', '+', '*', '[', ']', '^', '(', ')', '.' }
* character set: { 0, 1, 2, 3, 4, 5, 6 } represents keys on panel
*
* string to nfa details:
* . : for all character in alpha-set, insert a transform
*
* a : back_state = front_state;
* front_state++,
*
* [ab...] : back_state = front_state;
* parall a|b|..., front_state++;
*
* (pattern) : (, push back_state,
* ), pop to get back_state
*
* '*' : new state sn;
* t(front_state, e) = { sn, back_state };
* t(back_state, e) = sn
* update back_state and front_state to sn
*
* '+' : new state sn;
* t(front_state, e) = { sn, back_state };
* update back_state to sn
*
* '?' : new state sn;
* t(back_state, e) = t(front_state, e) = sn;
* update back_state to sn
*
* '/0': add front_state to terminal SET
* if stack.empty, algorithm terminaled, otherwise error
* EOS : KEY_OK
*/
struct dfa *dfa_create(const char *pattern)
{
printf("/ncreate dfa for pattern: /"%s/"/n", pattern);
struct dfa *tmp_nfa, *ret_dfa;
char *parall_inputs;
int par_stack[5], top = 0;
int back_state = 0, front_state = 0, new_state;
int is_complement, i = 0, j;
tmp_nfa = malloc(sizeof(struct dfa));
dfa_init(tmp_nfa);
tmp_nfa->trans_table = malloc(sizeof(struct dfa_transform_entry) * MAX_STATE);
memset(tmp_nfa->trans_table, 0, sizeof(struct dfa_transform_entry) * MAX_STATE);
while (pattern[i]) {
if (is_alpha(alpha_set, pattern[i])) {
back_state = front_state;
front_state++;
insert_transform(tmp_nfa, back_state, pattern[i], front_state);
} else if (pattern[i] == '.') {
back_state = front_state;
front_state++;
for (j = 0; j != strlen(alpha_set); j++)
insert_transform(tmp_nfa, back_state, pattern[j], front_state);
} else if (pattern[i] == '[') {
back_state = front_state;
front_state++;
if (pattern[i+1] == '^') {
i++;
// ... unimplement
} else {
while (pattern[++i] != ']')
insert_transform(tmp_nfa, back_state, pattern[i], front_state);
}
} else if (pattern[i] == '(') {
back_state = front_state;
par_stack[top++] = back_state;
} else if (pattern[i] == ')') {
back_state = par_stack[--top];
} else if (pattern[i] == '*') {
insert_transform(tmp_nfa, front_state, EMPTY_INPUT, front_state + 1);
insert_transform(tmp_nfa, front_state, EMPTY_INPUT, back_state);
insert_transform(tmp_nfa, back_state, EMPTY_INPUT, front_state + 1);
front_state++;
} else if (pattern[i] == '+') {
insert_transform(tmp_nfa, front_state, EMPTY_INPUT, back_state);
insert_transform(tmp_nfa, front_state, EMPTY_INPUT, front_state + 1);
front_state++;
} else if (pattern[i] == '?') {
insert_transform(tmp_nfa, front_state, EMPTY_INPUT, front_state + 1);
insert_transform(tmp_nfa, back_state, EMPTY_INPUT, front_state + 1);
front_state++;
} else {
printf("bad pattern/n");
return 0;
}
tmp_nfa->nstates = front_state + 1;
i++;
}
dfa_dump(tmp_nfa);
ret_dfa = nfa_to_dfa(tmp_nfa);
free_dfa(tmp_nfa);
return ret_dfa;
}
void dfa_release(struct dfa *d)
{
free_dfa(d);
}
int dfa_match(const struct dfa *d, const char *str)
{
if (!d || d->nstates <= 0) // ?? == 0
return 0;
int i = 0, state = d->start;
for (; i != strlen(str); i++) {
state = transfer(d, state, str[i]);
if (state == -1)
return 0;
}
return in_set(d->terminals, state);
}
int main(int argc, char **argv)
{
// struct dfa *d = dfa_create("3(1*(0?1[013]+)*(2012)?(23)+)*310[31]2*3*");
char pattern[] = "1*23[243]+";
char str[200];
struct dfa *d = dfa_create(pattern);
dfa_dump(d);
while (1) {
printf(" >");
scanf("%199s", &str);
if (dfa_match(d, str))
printf("/"%s/" match /%s//n", str, pattern);
else
printf("/"%s/" unmatch /%s//n", str, pattern);
}
return 0;
}