703 lines
26 KiB
C
703 lines
26 KiB
C
#include "../Headers/slex_regex.h"
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// Token representation for Regex Parsing
|
|
typedef enum {
|
|
TOKEN_CHAR,
|
|
TOKEN_CHAR_SET,
|
|
TOKEN_CONCAT,
|
|
TOKEN_ALT,
|
|
TOKEN_STAR,
|
|
TOKEN_PLUS,
|
|
TOKEN_QUESTION,
|
|
TOKEN_LPAREN,
|
|
TOKEN_RPAREN,
|
|
} RegexTokenType;
|
|
|
|
typedef struct {
|
|
RegexTokenType type;
|
|
bool char_set[256];
|
|
} RegexToken;
|
|
|
|
// Global array to track all allocated NFA states for easy deallocation
|
|
static NFAState** g_nfa_states = NULL;
|
|
static int g_nfa_state_count = 0;
|
|
static int g_nfa_state_capacity = 0;
|
|
|
|
static NFAState* create_nfa_state() {
|
|
NFAState* s = (NFAState*)malloc(sizeof(NFAState));
|
|
s->id = g_nfa_state_count;
|
|
s->is_epsilon = false;
|
|
memset(s->char_set, 0, sizeof(s->char_set));
|
|
s->edge1 = NULL;
|
|
s->edge2 = NULL;
|
|
s->accept_rule_index = -1;
|
|
|
|
// Track state globally
|
|
if (g_nfa_state_count >= g_nfa_state_capacity) {
|
|
g_nfa_state_capacity = g_nfa_state_capacity == 0 ? 1024 : g_nfa_state_capacity * 2;
|
|
g_nfa_states = (NFAState**)realloc(g_nfa_states, g_nfa_state_capacity * sizeof(NFAState*));
|
|
}
|
|
g_nfa_states[g_nfa_state_count++] = s;
|
|
return s;
|
|
}
|
|
|
|
static void free_all_nfa_states() {
|
|
for (int i = 0; i < g_nfa_state_count; i++) {
|
|
free(g_nfa_states[i]);
|
|
}
|
|
free(g_nfa_states);
|
|
g_nfa_states = NULL;
|
|
g_nfa_state_count = 0;
|
|
g_nfa_state_capacity = 0;
|
|
}
|
|
|
|
// Tokenize a regex pattern
|
|
static RegexToken* tokenize_regex(const char* pattern, int* token_count_out) {
|
|
int capacity = 128;
|
|
int count = 0;
|
|
RegexToken* tokens = (RegexToken*)malloc(capacity * sizeof(RegexToken));
|
|
int len = (int)strlen(pattern);
|
|
int idx = 0;
|
|
|
|
while (idx < len) {
|
|
if (count >= capacity) {
|
|
capacity *= 2;
|
|
tokens = (RegexToken*)realloc(tokens, capacity * sizeof(RegexToken));
|
|
}
|
|
|
|
char c = pattern[idx];
|
|
|
|
if (c == '\\') {
|
|
idx++;
|
|
if (idx >= len) {
|
|
// Trailing backslash, treat as literal backslash
|
|
tokens[count].type = TOKEN_CHAR;
|
|
memset(tokens[count].char_set, 0, 256);
|
|
tokens[count].char_set[(unsigned char)'\\'] = true;
|
|
count++;
|
|
break;
|
|
}
|
|
char esc = pattern[idx++];
|
|
tokens[count].type = TOKEN_CHAR_SET;
|
|
memset(tokens[count].char_set, 0, 256);
|
|
|
|
if (esc == 'p' && idx < len && pattern[idx] == '{') {
|
|
idx++; // skip '{'
|
|
char prop[256];
|
|
int p_idx = 0;
|
|
while (idx < len && pattern[idx] != '}') {
|
|
prop[p_idx++] = pattern[idx++];
|
|
}
|
|
prop[p_idx] = '\0';
|
|
if (idx < len && pattern[idx] == '}') {
|
|
idx++; // skip '}'
|
|
}
|
|
|
|
if (strcmp(prop, "P") == 0) {
|
|
const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
|
|
for (int k = 0; punct[k] != '\0'; k++) {
|
|
tokens[count].char_set[(unsigned char)punct[k]] = true;
|
|
}
|
|
} else if (strcmp(prop, "S") == 0) {
|
|
const char* sym = "$+<=>^`|~";
|
|
for (int k = 0; sym[k] != '\0'; k++) {
|
|
tokens[count].char_set[(unsigned char)sym[k]] = true;
|
|
}
|
|
} else if (strcmp(prop, "L") == 0) {
|
|
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
|
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
|
} else if (strcmp(prop, "N") == 0) {
|
|
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
|
}
|
|
} else if (esc == 'n') {
|
|
tokens[count].char_set[10] = true; // LF
|
|
} else if (esc == 't') {
|
|
tokens[count].char_set[9] = true; // TAB
|
|
} else if (esc == 'r') {
|
|
tokens[count].char_set[13] = true; // CR
|
|
} else if (esc == 's') {
|
|
tokens[count].char_set[32] = true; // Space
|
|
tokens[count].char_set[9] = true; // TAB
|
|
tokens[count].char_set[13] = true; // CR
|
|
tokens[count].char_set[10] = true; // LF
|
|
} else if (esc == 'd') {
|
|
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
|
} else if (esc == 'w') {
|
|
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
|
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
|
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
|
tokens[count].char_set[(unsigned char)'_'] = true;
|
|
} else {
|
|
// Literal escaped character
|
|
tokens[count].type = TOKEN_CHAR;
|
|
tokens[count].char_set[(unsigned char)esc] = true;
|
|
}
|
|
count++;
|
|
} else if (c == '[') {
|
|
idx++;
|
|
bool negate = false;
|
|
if (idx < len && pattern[idx] == '^') {
|
|
negate = true;
|
|
idx++;
|
|
}
|
|
|
|
tokens[count].type = TOKEN_CHAR_SET;
|
|
memset(tokens[count].char_set, 0, 256);
|
|
|
|
while (idx < len && pattern[idx] != ']') {
|
|
char c1 = pattern[idx++];
|
|
if (c1 == '\\' && idx < len) {
|
|
char esc = pattern[idx++];
|
|
if (esc == 'p' && idx < len && pattern[idx] == '{') {
|
|
idx++; // skip '{'
|
|
char prop[256];
|
|
int p_idx = 0;
|
|
while (idx < len && pattern[idx] != '}') {
|
|
prop[p_idx++] = pattern[idx++];
|
|
}
|
|
prop[p_idx] = '\0';
|
|
if (idx < len && pattern[idx] == '}') {
|
|
idx++; // skip '}'
|
|
}
|
|
|
|
if (strcmp(prop, "P") == 0) {
|
|
const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
|
|
for (int k = 0; punct[k] != '\0'; k++) {
|
|
tokens[count].char_set[(unsigned char)punct[k]] = true;
|
|
}
|
|
} else if (strcmp(prop, "S") == 0) {
|
|
const char* sym = "$+<=>^`|~";
|
|
for (int k = 0; sym[k] != '\0'; k++) {
|
|
tokens[count].char_set[(unsigned char)sym[k]] = true;
|
|
}
|
|
} else if (strcmp(prop, "L") == 0) {
|
|
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
|
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
|
} else if (strcmp(prop, "N") == 0) {
|
|
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
|
}
|
|
continue;
|
|
} else if (esc == 'n') c1 = '\n';
|
|
else if (esc == 't') c1 = '\t';
|
|
else if (esc == 'r') c1 = '\r';
|
|
else if (esc == 's') {
|
|
tokens[count].char_set[32] = true;
|
|
tokens[count].char_set[9] = true;
|
|
tokens[count].char_set[13] = true;
|
|
tokens[count].char_set[10] = true;
|
|
continue;
|
|
} else if (esc == 'd') {
|
|
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
|
continue;
|
|
} else if (esc == 'w') {
|
|
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
|
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
|
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
|
tokens[count].char_set[(unsigned char)'_'] = true;
|
|
continue;
|
|
} else {
|
|
c1 = esc;
|
|
}
|
|
}
|
|
|
|
// Check range: c1-c2
|
|
if (idx + 1 < len && pattern[idx] == '-' && pattern[idx + 1] != ']') {
|
|
idx++; // skip '-'
|
|
char c2 = pattern[idx++];
|
|
if (c2 == '\\' && idx < len) {
|
|
char esc = pattern[idx++];
|
|
if (esc == 'n') c2 = '\n';
|
|
else if (esc == 't') c2 = '\t';
|
|
else if (esc == 'r') c2 = '\r';
|
|
else c2 = esc;
|
|
}
|
|
for (int r = (unsigned char)c1; r <= (unsigned char)c2; r++) {
|
|
tokens[count].char_set[r] = true;
|
|
}
|
|
} else {
|
|
tokens[count].char_set[(unsigned char)c1] = true;
|
|
}
|
|
}
|
|
|
|
if (idx < len && pattern[idx] == ']') {
|
|
idx++;
|
|
}
|
|
|
|
if (negate) {
|
|
for (int i = 0; i < 256; i++) {
|
|
tokens[count].char_set[i] = !tokens[count].char_set[i];
|
|
}
|
|
}
|
|
count++;
|
|
} else if (c == '.') {
|
|
tokens[count].type = TOKEN_CHAR_SET;
|
|
memset(tokens[count].char_set, 0, 256);
|
|
for (int i = 0; i < 256; i++) {
|
|
if (i != 10) { // any character except newline
|
|
tokens[count].char_set[i] = true;
|
|
}
|
|
}
|
|
count++;
|
|
idx++;
|
|
} else if (c == '*') {
|
|
tokens[count].type = TOKEN_STAR;
|
|
count++;
|
|
idx++;
|
|
} else if (c == '+') {
|
|
tokens[count].type = TOKEN_PLUS;
|
|
count++;
|
|
idx++;
|
|
} else if (c == '?') {
|
|
tokens[count].type = TOKEN_QUESTION;
|
|
count++;
|
|
idx++;
|
|
} else if (c == '|') {
|
|
tokens[count].type = TOKEN_ALT;
|
|
count++;
|
|
idx++;
|
|
} else if (c == '(') {
|
|
tokens[count].type = TOKEN_LPAREN;
|
|
count++;
|
|
idx++;
|
|
} else if (c == ')') {
|
|
tokens[count].type = TOKEN_RPAREN;
|
|
count++;
|
|
idx++;
|
|
} else {
|
|
tokens[count].type = TOKEN_CHAR;
|
|
memset(tokens[count].char_set, 0, 256);
|
|
tokens[count].char_set[(unsigned char)c] = true;
|
|
count++;
|
|
idx++;
|
|
}
|
|
}
|
|
|
|
*token_count_out = count;
|
|
return tokens;
|
|
}
|
|
|
|
// Insert explicit concatenation operators
|
|
static RegexToken* insert_concat(RegexToken* input, int input_count, int* output_count_out) {
|
|
int capacity = input_count * 2;
|
|
int count = 0;
|
|
RegexToken* output = (RegexToken*)malloc(capacity * sizeof(RegexToken));
|
|
|
|
for (int i = 0; i < input_count; i++) {
|
|
if (count >= capacity) {
|
|
capacity *= 2;
|
|
output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
|
|
}
|
|
|
|
output[count++] = input[i];
|
|
|
|
if (i + 1 < input_count) {
|
|
RegexTokenType t1 = input[i].type;
|
|
RegexTokenType t2 = input[i + 1].type;
|
|
|
|
bool t1_can_concat = (t1 == TOKEN_CHAR || t1 == TOKEN_CHAR_SET || t1 == TOKEN_STAR || t1 == TOKEN_PLUS || t1 == TOKEN_QUESTION || t1 == TOKEN_RPAREN);
|
|
bool t2_can_concat = (t2 == TOKEN_CHAR || t2 == TOKEN_CHAR_SET || t2 == TOKEN_LPAREN);
|
|
|
|
if (t1_can_concat && t2_can_concat) {
|
|
if (count >= capacity) {
|
|
capacity *= 2;
|
|
output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
|
|
}
|
|
output[count].type = TOKEN_CONCAT;
|
|
memset(output[count].char_set, 0, 256);
|
|
count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
*output_count_out = count;
|
|
return output;
|
|
}
|
|
|
|
// Shunting-yard algorithm to convert infix tokens to postfix tokens
|
|
static RegexToken* infix_to_postfix(RegexToken* infix, int infix_count, int* postfix_count_out) {
|
|
int capacity = infix_count;
|
|
int postfix_count = 0;
|
|
RegexToken* postfix = (RegexToken*)malloc(capacity * sizeof(RegexToken));
|
|
|
|
RegexToken stack[512];
|
|
int stack_top = 0;
|
|
|
|
for (int i = 0; i < infix_count; i++) {
|
|
RegexToken t = infix[i];
|
|
|
|
if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
|
|
if (postfix_count >= capacity) {
|
|
capacity *= 2;
|
|
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
|
}
|
|
postfix[postfix_count++] = t;
|
|
} else if (t.type == TOKEN_LPAREN) {
|
|
stack[stack_top++] = t;
|
|
} else if (t.type == TOKEN_RPAREN) {
|
|
while (stack_top > 0 && stack[stack_top - 1].type != TOKEN_LPAREN) {
|
|
if (postfix_count >= capacity) {
|
|
capacity *= 2;
|
|
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
|
}
|
|
postfix[postfix_count++] = stack[--stack_top];
|
|
}
|
|
if (stack_top > 0) {
|
|
stack_top--; // pop LPAREN
|
|
}
|
|
} else if (t.type == TOKEN_STAR || t.type == TOKEN_PLUS || t.type == TOKEN_QUESTION) {
|
|
// Unary operators have highest precedence and are postfix, output immediately
|
|
if (postfix_count >= capacity) {
|
|
capacity *= 2;
|
|
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
|
}
|
|
postfix[postfix_count++] = t;
|
|
} else {
|
|
// Binary operators (CONCAT, ALT)
|
|
int p_curr = (t.type == TOKEN_ALT) ? 1 : 2;
|
|
while (stack_top > 0) {
|
|
RegexTokenType top_type = stack[stack_top - 1].type;
|
|
if (top_type == TOKEN_CONCAT || top_type == TOKEN_ALT) {
|
|
int p_top = (top_type == TOKEN_ALT) ? 1 : 2;
|
|
if (p_top >= p_curr) {
|
|
if (postfix_count >= capacity) {
|
|
capacity *= 2;
|
|
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
|
}
|
|
postfix[postfix_count++] = stack[--stack_top];
|
|
} else {
|
|
break;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
stack[stack_top++] = t;
|
|
}
|
|
}
|
|
|
|
while (stack_top > 0) {
|
|
if (postfix_count >= capacity) {
|
|
capacity *= 2;
|
|
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
|
}
|
|
postfix[postfix_count++] = stack[--stack_top];
|
|
}
|
|
|
|
*postfix_count_out = postfix_count;
|
|
return postfix;
|
|
}
|
|
|
|
// Build NFA from postfix tokens using Thompson's construction
|
|
static NFAFragment build_nfa(RegexToken* postfix, int postfix_count) {
|
|
NFAFragment stack[512];
|
|
int stack_top = 0;
|
|
|
|
for (int i = 0; i < postfix_count; i++) {
|
|
RegexToken t = postfix[i];
|
|
|
|
if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
|
|
NFAState* start = create_nfa_state();
|
|
NFAState* accept = create_nfa_state();
|
|
start->is_epsilon = false;
|
|
memcpy(start->char_set, t.char_set, 256);
|
|
start->edge1 = accept;
|
|
|
|
NFAFragment frag = {start, accept};
|
|
stack[stack_top++] = frag;
|
|
} else if (t.type == TOKEN_CONCAT) {
|
|
NFAFragment f2 = stack[--stack_top];
|
|
NFAFragment f1 = stack[--stack_top];
|
|
|
|
f1.accept->is_epsilon = true;
|
|
f1.accept->edge1 = f2.start;
|
|
|
|
NFAFragment frag = {f1.start, f2.accept};
|
|
stack[stack_top++] = frag;
|
|
} else if (t.type == TOKEN_ALT) {
|
|
NFAFragment f2 = stack[--stack_top];
|
|
NFAFragment f1 = stack[--stack_top];
|
|
|
|
NFAState* start = create_nfa_state();
|
|
NFAState* accept = create_nfa_state();
|
|
|
|
start->is_epsilon = true;
|
|
start->edge1 = f1.start;
|
|
start->edge2 = f2.start;
|
|
|
|
f1.accept->is_epsilon = true;
|
|
f1.accept->edge1 = accept;
|
|
|
|
f2.accept->is_epsilon = true;
|
|
f2.accept->edge1 = accept;
|
|
|
|
NFAFragment frag = {start, accept};
|
|
stack[stack_top++] = frag;
|
|
} else if (t.type == TOKEN_STAR) {
|
|
NFAFragment f1 = stack[--stack_top];
|
|
|
|
NFAState* start = create_nfa_state();
|
|
NFAState* accept = create_nfa_state();
|
|
|
|
start->is_epsilon = true;
|
|
start->edge1 = f1.start;
|
|
start->edge2 = accept;
|
|
|
|
f1.accept->is_epsilon = true;
|
|
f1.accept->edge1 = f1.start;
|
|
f1.accept->edge2 = accept;
|
|
|
|
NFAFragment frag = {start, accept};
|
|
stack[stack_top++] = frag;
|
|
} else if (t.type == TOKEN_PLUS) {
|
|
NFAFragment f1 = stack[--stack_top];
|
|
|
|
NFAState* start = create_nfa_state();
|
|
NFAState* accept = create_nfa_state();
|
|
|
|
start->is_epsilon = true;
|
|
start->edge1 = f1.start;
|
|
|
|
f1.accept->is_epsilon = true;
|
|
f1.accept->edge1 = f1.start;
|
|
f1.accept->edge2 = accept;
|
|
|
|
NFAFragment frag = {start, accept};
|
|
stack[stack_top++] = frag;
|
|
} else if (t.type == TOKEN_QUESTION) {
|
|
NFAFragment f1 = stack[--stack_top];
|
|
|
|
NFAState* start = create_nfa_state();
|
|
NFAState* accept = create_nfa_state();
|
|
|
|
start->is_epsilon = true;
|
|
start->edge1 = f1.start;
|
|
start->edge2 = accept;
|
|
|
|
f1.accept->is_epsilon = true;
|
|
f1.accept->edge1 = accept;
|
|
|
|
NFAFragment frag = {start, accept};
|
|
stack[stack_top++] = frag;
|
|
}
|
|
}
|
|
|
|
return stack[0];
|
|
}
|
|
|
|
// Computes epsilon closure of a set of NFA states
|
|
static void get_epsilon_closure(int* input_states, int input_count, NFAState** all_nfa_states, int total_nfa_states, int** output_states, int* output_count) {
|
|
bool* visited = (bool*)calloc(total_nfa_states, sizeof(bool));
|
|
int* queue = (int*)malloc(total_nfa_states * sizeof(int));
|
|
int head = 0, tail = 0;
|
|
|
|
for (int i = 0; i < input_count; i++) {
|
|
int id = input_states[i];
|
|
visited[id] = true;
|
|
queue[tail++] = id;
|
|
}
|
|
|
|
while (head < tail) {
|
|
int curr_id = queue[head++];
|
|
NFAState* s = all_nfa_states[curr_id];
|
|
if (s->is_epsilon) {
|
|
if (s->edge1 && !visited[s->edge1->id]) {
|
|
visited[s->edge1->id] = true;
|
|
queue[tail++] = s->edge1->id;
|
|
}
|
|
if (s->edge2 && !visited[s->edge2->id]) {
|
|
visited[s->edge2->id] = true;
|
|
queue[tail++] = s->edge2->id;
|
|
}
|
|
}
|
|
}
|
|
|
|
int count = 0;
|
|
for (int i = 0; i < total_nfa_states; i++) {
|
|
if (visited[i]) count++;
|
|
}
|
|
|
|
int* res = (int*)malloc(count * sizeof(int));
|
|
int idx = 0;
|
|
for (int i = 0; i < total_nfa_states; i++) {
|
|
if (visited[i]) {
|
|
res[idx++] = i;
|
|
}
|
|
}
|
|
|
|
*output_states = res;
|
|
*output_count = count;
|
|
free(visited);
|
|
free(queue);
|
|
}
|
|
|
|
// Compare two NFA state sets
|
|
static bool are_nfa_sets_equal(int* a, int a_count, int* b, int b_count) {
|
|
if (a_count != b_count) return false;
|
|
for (int i = 0; i < a_count; i++) {
|
|
if (a[i] != b[i]) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Compiles a set of regular expression patterns into a complete DFA using subset construction
|
|
DFAState* slex_compile_regexes(char** patterns, int pattern_count, int* dfa_state_count_out) {
|
|
free_all_nfa_states(); // Reset global state tracker
|
|
|
|
// 1. Build NFA for each pattern
|
|
NFAFragment* fragments = (NFAFragment*)malloc(pattern_count * sizeof(NFAFragment));
|
|
for (int i = 0; i < pattern_count; i++) {
|
|
int t_count = 0, concat_count = 0, post_count = 0;
|
|
RegexToken* tokens = tokenize_regex(patterns[i], &t_count);
|
|
RegexToken* tokens_concat = insert_concat(tokens, t_count, &concat_count);
|
|
RegexToken* tokens_postfix = infix_to_postfix(tokens_concat, concat_count, &post_count);
|
|
|
|
fragments[i] = build_nfa(tokens_postfix, post_count);
|
|
fragments[i].accept->accept_rule_index = i;
|
|
|
|
free(tokens);
|
|
free(tokens_concat);
|
|
free(tokens_postfix);
|
|
}
|
|
|
|
// 2. Create global start state with epsilon transitions to each pattern NFA's start state
|
|
NFAState* global_start = create_nfa_state();
|
|
global_start->is_epsilon = true;
|
|
|
|
NFAState* current_hub = global_start;
|
|
for (int i = 0; i < pattern_count; i++) {
|
|
if (i == pattern_count - 1) {
|
|
current_hub->edge1 = fragments[i].start;
|
|
} else {
|
|
NFAState* next_hub = create_nfa_state();
|
|
next_hub->is_epsilon = true;
|
|
current_hub->edge1 = fragments[i].start;
|
|
current_hub->edge2 = next_hub;
|
|
current_hub = next_hub;
|
|
}
|
|
}
|
|
free(fragments);
|
|
|
|
// 3. Subset construction
|
|
int total_nfa_states = g_nfa_state_count;
|
|
NFAState** all_nfa_states = g_nfa_states;
|
|
|
|
int dfa_capacity = 1024;
|
|
int dfa_count = 0;
|
|
DFAState* dfa_states = (DFAState*)malloc(dfa_capacity * sizeof(DFAState));
|
|
|
|
// Queue for subset construction
|
|
int* work_queue = (int*)malloc(dfa_capacity * sizeof(int));
|
|
int queue_head = 0, queue_tail = 0;
|
|
|
|
// Start state epsilon closure
|
|
int start_nfa_id = global_start->id;
|
|
int* start_closure = NULL;
|
|
int start_closure_count = 0;
|
|
get_epsilon_closure(&start_nfa_id, 1, all_nfa_states, total_nfa_states, &start_closure, &start_closure_count);
|
|
|
|
// Create start DFA state (0)
|
|
dfa_states[dfa_count].id = dfa_count;
|
|
dfa_states[dfa_count].nfa_states = start_closure;
|
|
dfa_states[dfa_count].nfa_state_count = start_closure_count;
|
|
memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
|
|
dfa_states[dfa_count].accept_rule_index = -1;
|
|
|
|
work_queue[queue_tail++] = dfa_count;
|
|
dfa_count++;
|
|
|
|
// Process queue
|
|
while (queue_head < queue_tail) {
|
|
int curr_dfa_id = work_queue[queue_head++];
|
|
|
|
// For each possible ASCII character transition
|
|
for (int c = 0; c < 256; c++) {
|
|
// Find NFA states reachable on character 'c'
|
|
int* reachable = (int*)malloc(total_nfa_states * sizeof(int));
|
|
int reachable_count = 0;
|
|
|
|
DFAState* curr_dfa = &dfa_states[curr_dfa_id];
|
|
for (int i = 0; i < curr_dfa->nfa_state_count; i++) {
|
|
NFAState* nfa_s = all_nfa_states[curr_dfa->nfa_states[i]];
|
|
if (!nfa_s->is_epsilon && nfa_s->char_set[c]) {
|
|
if (nfa_s->edge1) {
|
|
reachable[reachable_count++] = nfa_s->edge1->id;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (reachable_count > 0) {
|
|
// Compute epsilon closure of reachable NFA states
|
|
int* closure = NULL;
|
|
int closure_count = 0;
|
|
get_epsilon_closure(reachable, reachable_count, all_nfa_states, total_nfa_states, &closure, &closure_count);
|
|
free(reachable);
|
|
|
|
// Check if this DFA state already exists
|
|
int existing_id = -1;
|
|
for (int d = 0; d < dfa_count; d++) {
|
|
if (are_nfa_sets_equal(dfa_states[d].nfa_states, dfa_states[d].nfa_state_count, closure, closure_count)) {
|
|
existing_id = d;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (existing_id != -1) {
|
|
dfa_states[curr_dfa_id].transitions[c] = existing_id;
|
|
free(closure);
|
|
} else {
|
|
if (dfa_count >= dfa_capacity) {
|
|
dfa_capacity *= 2;
|
|
dfa_states = (DFAState*)realloc(dfa_states, dfa_capacity * sizeof(DFAState));
|
|
work_queue = (int*)realloc(work_queue, dfa_capacity * sizeof(int));
|
|
}
|
|
|
|
dfa_states[dfa_count].id = dfa_count;
|
|
dfa_states[dfa_count].nfa_states = closure;
|
|
dfa_states[dfa_count].nfa_state_count = closure_count;
|
|
memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
|
|
dfa_states[dfa_count].accept_rule_index = -1;
|
|
|
|
dfa_states[curr_dfa_id].transitions[c] = dfa_count;
|
|
work_queue[queue_tail++] = dfa_count;
|
|
dfa_count++;
|
|
}
|
|
} else {
|
|
free(reachable);
|
|
dfa_states[curr_dfa_id].transitions[c] = -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Determine accepting status of each DFA state based on NFA accept states
|
|
for (int d = 0; d < dfa_count; d++) {
|
|
int best_rule = -1;
|
|
for (int i = 0; i < dfa_states[d].nfa_state_count; i++) {
|
|
NFAState* nfa_s = all_nfa_states[dfa_states[d].nfa_states[i]];
|
|
if (nfa_s->accept_rule_index != -1) {
|
|
if (best_rule == -1 || nfa_s->accept_rule_index < best_rule) {
|
|
best_rule = nfa_s->accept_rule_index;
|
|
}
|
|
}
|
|
}
|
|
dfa_states[d].accept_rule_index = best_rule;
|
|
}
|
|
|
|
free(work_queue);
|
|
free_all_nfa_states(); // We no longer need the NFA states
|
|
|
|
*dfa_state_count_out = dfa_count;
|
|
return dfa_states;
|
|
}
|
|
|
|
void slex_free_dfa(DFAState* dfa_states, int dfa_state_count) {
|
|
if (dfa_states) {
|
|
for (int i = 0; i < dfa_state_count; i++) {
|
|
free(dfa_states[i].nfa_states);
|
|
}
|
|
free(dfa_states);
|
|
}
|
|
}
|