Finished the implementation with Antigravity.
This commit is contained in:
@@ -0,0 +1,702 @@
|
||||
#include "../Headers/slex_regex.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// Kinds of token produced while a regex pattern is being parsed.
// The numeric values follow declaration order, starting at zero.
typedef enum {
    TOKEN_CHAR = 0,      // one literal character
    TOKEN_CHAR_SET = 1,  // a set of characters ([...] class, '.', class-like escape)
    TOKEN_CONCAT = 2,    // explicit concatenation operator (added by insert_concat)
    TOKEN_ALT = 3,       // '|' alternation
    TOKEN_STAR = 4,      // '*' repetition
    TOKEN_PLUS = 5,      // '+' repetition
    TOKEN_QUESTION = 6,  // '?' optional
    TOKEN_LPAREN = 7,    // '(' group open
    TOKEN_RPAREN = 8     // ')' group close
} RegexTokenType;
|
||||
|
||||
typedef struct {
|
||||
RegexTokenType type;
|
||||
bool char_set[256];
|
||||
} RegexToken;
|
||||
|
||||
// Global array to track all allocated NFA states for easy deallocation
|
||||
static NFAState** g_nfa_states = NULL;
|
||||
static int g_nfa_state_count = 0;
|
||||
static int g_nfa_state_capacity = 0;
|
||||
|
||||
static NFAState* create_nfa_state() {
|
||||
NFAState* s = (NFAState*)malloc(sizeof(NFAState));
|
||||
s->id = g_nfa_state_count;
|
||||
s->is_epsilon = false;
|
||||
memset(s->char_set, 0, sizeof(s->char_set));
|
||||
s->edge1 = NULL;
|
||||
s->edge2 = NULL;
|
||||
s->accept_rule_index = -1;
|
||||
|
||||
// Track state globally
|
||||
if (g_nfa_state_count >= g_nfa_state_capacity) {
|
||||
g_nfa_state_capacity = g_nfa_state_capacity == 0 ? 1024 : g_nfa_state_capacity * 2;
|
||||
g_nfa_states = (NFAState**)realloc(g_nfa_states, g_nfa_state_capacity * sizeof(NFAState*));
|
||||
}
|
||||
g_nfa_states[g_nfa_state_count++] = s;
|
||||
return s;
|
||||
}
|
||||
|
||||
static void free_all_nfa_states() {
|
||||
for (int i = 0; i < g_nfa_state_count; i++) {
|
||||
free(g_nfa_states[i]);
|
||||
}
|
||||
free(g_nfa_states);
|
||||
g_nfa_states = NULL;
|
||||
g_nfa_state_count = 0;
|
||||
g_nfa_state_capacity = 0;
|
||||
}
|
||||
|
||||
// Tokenize a regex pattern
|
||||
static RegexToken* tokenize_regex(const char* pattern, int* token_count_out) {
|
||||
int capacity = 128;
|
||||
int count = 0;
|
||||
RegexToken* tokens = (RegexToken*)malloc(capacity * sizeof(RegexToken));
|
||||
int len = (int)strlen(pattern);
|
||||
int idx = 0;
|
||||
|
||||
while (idx < len) {
|
||||
if (count >= capacity) {
|
||||
capacity *= 2;
|
||||
tokens = (RegexToken*)realloc(tokens, capacity * sizeof(RegexToken));
|
||||
}
|
||||
|
||||
char c = pattern[idx];
|
||||
|
||||
if (c == '\\') {
|
||||
idx++;
|
||||
if (idx >= len) {
|
||||
// Trailing backslash, treat as literal backslash
|
||||
tokens[count].type = TOKEN_CHAR;
|
||||
memset(tokens[count].char_set, 0, 256);
|
||||
tokens[count].char_set[(unsigned char)'\\'] = true;
|
||||
count++;
|
||||
break;
|
||||
}
|
||||
char esc = pattern[idx++];
|
||||
tokens[count].type = TOKEN_CHAR_SET;
|
||||
memset(tokens[count].char_set, 0, 256);
|
||||
|
||||
if (esc == 'p' && idx < len && pattern[idx] == '{') {
|
||||
idx++; // skip '{'
|
||||
char prop[256];
|
||||
int p_idx = 0;
|
||||
while (idx < len && pattern[idx] != '}') {
|
||||
prop[p_idx++] = pattern[idx++];
|
||||
}
|
||||
prop[p_idx] = '\0';
|
||||
if (idx < len && pattern[idx] == '}') {
|
||||
idx++; // skip '}'
|
||||
}
|
||||
|
||||
if (strcmp(prop, "P") == 0) {
|
||||
const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
|
||||
for (int k = 0; punct[k] != '\0'; k++) {
|
||||
tokens[count].char_set[(unsigned char)punct[k]] = true;
|
||||
}
|
||||
} else if (strcmp(prop, "S") == 0) {
|
||||
const char* sym = "$+<=>^`|~";
|
||||
for (int k = 0; sym[k] != '\0'; k++) {
|
||||
tokens[count].char_set[(unsigned char)sym[k]] = true;
|
||||
}
|
||||
} else if (strcmp(prop, "L") == 0) {
|
||||
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
||||
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
||||
} else if (strcmp(prop, "N") == 0) {
|
||||
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
||||
}
|
||||
} else if (esc == 'n') {
|
||||
tokens[count].char_set[10] = true; // LF
|
||||
} else if (esc == 't') {
|
||||
tokens[count].char_set[9] = true; // TAB
|
||||
} else if (esc == 'r') {
|
||||
tokens[count].char_set[13] = true; // CR
|
||||
} else if (esc == 's') {
|
||||
tokens[count].char_set[32] = true; // Space
|
||||
tokens[count].char_set[9] = true; // TAB
|
||||
tokens[count].char_set[13] = true; // CR
|
||||
tokens[count].char_set[10] = true; // LF
|
||||
} else if (esc == 'd') {
|
||||
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
||||
} else if (esc == 'w') {
|
||||
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
||||
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
||||
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
||||
tokens[count].char_set[(unsigned char)'_'] = true;
|
||||
} else {
|
||||
// Literal escaped character
|
||||
tokens[count].type = TOKEN_CHAR;
|
||||
tokens[count].char_set[(unsigned char)esc] = true;
|
||||
}
|
||||
count++;
|
||||
} else if (c == '[') {
|
||||
idx++;
|
||||
bool negate = false;
|
||||
if (idx < len && pattern[idx] == '^') {
|
||||
negate = true;
|
||||
idx++;
|
||||
}
|
||||
|
||||
tokens[count].type = TOKEN_CHAR_SET;
|
||||
memset(tokens[count].char_set, 0, 256);
|
||||
|
||||
while (idx < len && pattern[idx] != ']') {
|
||||
char c1 = pattern[idx++];
|
||||
if (c1 == '\\' && idx < len) {
|
||||
char esc = pattern[idx++];
|
||||
if (esc == 'p' && idx < len && pattern[idx] == '{') {
|
||||
idx++; // skip '{'
|
||||
char prop[256];
|
||||
int p_idx = 0;
|
||||
while (idx < len && pattern[idx] != '}') {
|
||||
prop[p_idx++] = pattern[idx++];
|
||||
}
|
||||
prop[p_idx] = '\0';
|
||||
if (idx < len && pattern[idx] == '}') {
|
||||
idx++; // skip '}'
|
||||
}
|
||||
|
||||
if (strcmp(prop, "P") == 0) {
|
||||
const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
|
||||
for (int k = 0; punct[k] != '\0'; k++) {
|
||||
tokens[count].char_set[(unsigned char)punct[k]] = true;
|
||||
}
|
||||
} else if (strcmp(prop, "S") == 0) {
|
||||
const char* sym = "$+<=>^`|~";
|
||||
for (int k = 0; sym[k] != '\0'; k++) {
|
||||
tokens[count].char_set[(unsigned char)sym[k]] = true;
|
||||
}
|
||||
} else if (strcmp(prop, "L") == 0) {
|
||||
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
||||
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
||||
} else if (strcmp(prop, "N") == 0) {
|
||||
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
||||
}
|
||||
continue;
|
||||
} else if (esc == 'n') c1 = '\n';
|
||||
else if (esc == 't') c1 = '\t';
|
||||
else if (esc == 'r') c1 = '\r';
|
||||
else if (esc == 's') {
|
||||
tokens[count].char_set[32] = true;
|
||||
tokens[count].char_set[9] = true;
|
||||
tokens[count].char_set[13] = true;
|
||||
tokens[count].char_set[10] = true;
|
||||
continue;
|
||||
} else if (esc == 'd') {
|
||||
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
||||
continue;
|
||||
} else if (esc == 'w') {
|
||||
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
|
||||
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
|
||||
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
|
||||
tokens[count].char_set[(unsigned char)'_'] = true;
|
||||
continue;
|
||||
} else {
|
||||
c1 = esc;
|
||||
}
|
||||
}
|
||||
|
||||
// Check range: c1-c2
|
||||
if (idx + 1 < len && pattern[idx] == '-' && pattern[idx + 1] != ']') {
|
||||
idx++; // skip '-'
|
||||
char c2 = pattern[idx++];
|
||||
if (c2 == '\\' && idx < len) {
|
||||
char esc = pattern[idx++];
|
||||
if (esc == 'n') c2 = '\n';
|
||||
else if (esc == 't') c2 = '\t';
|
||||
else if (esc == 'r') c2 = '\r';
|
||||
else c2 = esc;
|
||||
}
|
||||
for (int r = (unsigned char)c1; r <= (unsigned char)c2; r++) {
|
||||
tokens[count].char_set[r] = true;
|
||||
}
|
||||
} else {
|
||||
tokens[count].char_set[(unsigned char)c1] = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (idx < len && pattern[idx] == ']') {
|
||||
idx++;
|
||||
}
|
||||
|
||||
if (negate) {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
tokens[count].char_set[i] = !tokens[count].char_set[i];
|
||||
}
|
||||
}
|
||||
count++;
|
||||
} else if (c == '.') {
|
||||
tokens[count].type = TOKEN_CHAR_SET;
|
||||
memset(tokens[count].char_set, 0, 256);
|
||||
for (int i = 0; i < 256; i++) {
|
||||
if (i != 10) { // any character except newline
|
||||
tokens[count].char_set[i] = true;
|
||||
}
|
||||
}
|
||||
count++;
|
||||
idx++;
|
||||
} else if (c == '*') {
|
||||
tokens[count].type = TOKEN_STAR;
|
||||
count++;
|
||||
idx++;
|
||||
} else if (c == '+') {
|
||||
tokens[count].type = TOKEN_PLUS;
|
||||
count++;
|
||||
idx++;
|
||||
} else if (c == '?') {
|
||||
tokens[count].type = TOKEN_QUESTION;
|
||||
count++;
|
||||
idx++;
|
||||
} else if (c == '|') {
|
||||
tokens[count].type = TOKEN_ALT;
|
||||
count++;
|
||||
idx++;
|
||||
} else if (c == '(') {
|
||||
tokens[count].type = TOKEN_LPAREN;
|
||||
count++;
|
||||
idx++;
|
||||
} else if (c == ')') {
|
||||
tokens[count].type = TOKEN_RPAREN;
|
||||
count++;
|
||||
idx++;
|
||||
} else {
|
||||
tokens[count].type = TOKEN_CHAR;
|
||||
memset(tokens[count].char_set, 0, 256);
|
||||
tokens[count].char_set[(unsigned char)c] = true;
|
||||
count++;
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
|
||||
*token_count_out = count;
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// Insert explicit concatenation operators
|
||||
static RegexToken* insert_concat(RegexToken* input, int input_count, int* output_count_out) {
|
||||
int capacity = input_count * 2;
|
||||
int count = 0;
|
||||
RegexToken* output = (RegexToken*)malloc(capacity * sizeof(RegexToken));
|
||||
|
||||
for (int i = 0; i < input_count; i++) {
|
||||
if (count >= capacity) {
|
||||
capacity *= 2;
|
||||
output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
|
||||
}
|
||||
|
||||
output[count++] = input[i];
|
||||
|
||||
if (i + 1 < input_count) {
|
||||
RegexTokenType t1 = input[i].type;
|
||||
RegexTokenType t2 = input[i + 1].type;
|
||||
|
||||
bool t1_can_concat = (t1 == TOKEN_CHAR || t1 == TOKEN_CHAR_SET || t1 == TOKEN_STAR || t1 == TOKEN_PLUS || t1 == TOKEN_QUESTION || t1 == TOKEN_RPAREN);
|
||||
bool t2_can_concat = (t2 == TOKEN_CHAR || t2 == TOKEN_CHAR_SET || t2 == TOKEN_LPAREN);
|
||||
|
||||
if (t1_can_concat && t2_can_concat) {
|
||||
if (count >= capacity) {
|
||||
capacity *= 2;
|
||||
output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
|
||||
}
|
||||
output[count].type = TOKEN_CONCAT;
|
||||
memset(output[count].char_set, 0, 256);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*output_count_out = count;
|
||||
return output;
|
||||
}
|
||||
|
||||
// Shunting-yard algorithm to convert infix tokens to postfix tokens
|
||||
static RegexToken* infix_to_postfix(RegexToken* infix, int infix_count, int* postfix_count_out) {
|
||||
int capacity = infix_count;
|
||||
int postfix_count = 0;
|
||||
RegexToken* postfix = (RegexToken*)malloc(capacity * sizeof(RegexToken));
|
||||
|
||||
RegexToken stack[512];
|
||||
int stack_top = 0;
|
||||
|
||||
for (int i = 0; i < infix_count; i++) {
|
||||
RegexToken t = infix[i];
|
||||
|
||||
if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
|
||||
if (postfix_count >= capacity) {
|
||||
capacity *= 2;
|
||||
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
||||
}
|
||||
postfix[postfix_count++] = t;
|
||||
} else if (t.type == TOKEN_LPAREN) {
|
||||
stack[stack_top++] = t;
|
||||
} else if (t.type == TOKEN_RPAREN) {
|
||||
while (stack_top > 0 && stack[stack_top - 1].type != TOKEN_LPAREN) {
|
||||
if (postfix_count >= capacity) {
|
||||
capacity *= 2;
|
||||
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
||||
}
|
||||
postfix[postfix_count++] = stack[--stack_top];
|
||||
}
|
||||
if (stack_top > 0) {
|
||||
stack_top--; // pop LPAREN
|
||||
}
|
||||
} else if (t.type == TOKEN_STAR || t.type == TOKEN_PLUS || t.type == TOKEN_QUESTION) {
|
||||
// Unary operators have highest precedence and are postfix, output immediately
|
||||
if (postfix_count >= capacity) {
|
||||
capacity *= 2;
|
||||
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
||||
}
|
||||
postfix[postfix_count++] = t;
|
||||
} else {
|
||||
// Binary operators (CONCAT, ALT)
|
||||
int p_curr = (t.type == TOKEN_ALT) ? 1 : 2;
|
||||
while (stack_top > 0) {
|
||||
RegexTokenType top_type = stack[stack_top - 1].type;
|
||||
if (top_type == TOKEN_CONCAT || top_type == TOKEN_ALT) {
|
||||
int p_top = (top_type == TOKEN_ALT) ? 1 : 2;
|
||||
if (p_top >= p_curr) {
|
||||
if (postfix_count >= capacity) {
|
||||
capacity *= 2;
|
||||
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
||||
}
|
||||
postfix[postfix_count++] = stack[--stack_top];
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
stack[stack_top++] = t;
|
||||
}
|
||||
}
|
||||
|
||||
while (stack_top > 0) {
|
||||
if (postfix_count >= capacity) {
|
||||
capacity *= 2;
|
||||
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
|
||||
}
|
||||
postfix[postfix_count++] = stack[--stack_top];
|
||||
}
|
||||
|
||||
*postfix_count_out = postfix_count;
|
||||
return postfix;
|
||||
}
|
||||
|
||||
// Build NFA from postfix tokens using Thompson's construction
|
||||
static NFAFragment build_nfa(RegexToken* postfix, int postfix_count) {
|
||||
NFAFragment stack[512];
|
||||
int stack_top = 0;
|
||||
|
||||
for (int i = 0; i < postfix_count; i++) {
|
||||
RegexToken t = postfix[i];
|
||||
|
||||
if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
|
||||
NFAState* start = create_nfa_state();
|
||||
NFAState* accept = create_nfa_state();
|
||||
start->is_epsilon = false;
|
||||
memcpy(start->char_set, t.char_set, 256);
|
||||
start->edge1 = accept;
|
||||
|
||||
NFAFragment frag = {start, accept};
|
||||
stack[stack_top++] = frag;
|
||||
} else if (t.type == TOKEN_CONCAT) {
|
||||
NFAFragment f2 = stack[--stack_top];
|
||||
NFAFragment f1 = stack[--stack_top];
|
||||
|
||||
f1.accept->is_epsilon = true;
|
||||
f1.accept->edge1 = f2.start;
|
||||
|
||||
NFAFragment frag = {f1.start, f2.accept};
|
||||
stack[stack_top++] = frag;
|
||||
} else if (t.type == TOKEN_ALT) {
|
||||
NFAFragment f2 = stack[--stack_top];
|
||||
NFAFragment f1 = stack[--stack_top];
|
||||
|
||||
NFAState* start = create_nfa_state();
|
||||
NFAState* accept = create_nfa_state();
|
||||
|
||||
start->is_epsilon = true;
|
||||
start->edge1 = f1.start;
|
||||
start->edge2 = f2.start;
|
||||
|
||||
f1.accept->is_epsilon = true;
|
||||
f1.accept->edge1 = accept;
|
||||
|
||||
f2.accept->is_epsilon = true;
|
||||
f2.accept->edge1 = accept;
|
||||
|
||||
NFAFragment frag = {start, accept};
|
||||
stack[stack_top++] = frag;
|
||||
} else if (t.type == TOKEN_STAR) {
|
||||
NFAFragment f1 = stack[--stack_top];
|
||||
|
||||
NFAState* start = create_nfa_state();
|
||||
NFAState* accept = create_nfa_state();
|
||||
|
||||
start->is_epsilon = true;
|
||||
start->edge1 = f1.start;
|
||||
start->edge2 = accept;
|
||||
|
||||
f1.accept->is_epsilon = true;
|
||||
f1.accept->edge1 = f1.start;
|
||||
f1.accept->edge2 = accept;
|
||||
|
||||
NFAFragment frag = {start, accept};
|
||||
stack[stack_top++] = frag;
|
||||
} else if (t.type == TOKEN_PLUS) {
|
||||
NFAFragment f1 = stack[--stack_top];
|
||||
|
||||
NFAState* start = create_nfa_state();
|
||||
NFAState* accept = create_nfa_state();
|
||||
|
||||
start->is_epsilon = true;
|
||||
start->edge1 = f1.start;
|
||||
|
||||
f1.accept->is_epsilon = true;
|
||||
f1.accept->edge1 = f1.start;
|
||||
f1.accept->edge2 = accept;
|
||||
|
||||
NFAFragment frag = {start, accept};
|
||||
stack[stack_top++] = frag;
|
||||
} else if (t.type == TOKEN_QUESTION) {
|
||||
NFAFragment f1 = stack[--stack_top];
|
||||
|
||||
NFAState* start = create_nfa_state();
|
||||
NFAState* accept = create_nfa_state();
|
||||
|
||||
start->is_epsilon = true;
|
||||
start->edge1 = f1.start;
|
||||
start->edge2 = accept;
|
||||
|
||||
f1.accept->is_epsilon = true;
|
||||
f1.accept->edge1 = accept;
|
||||
|
||||
NFAFragment frag = {start, accept};
|
||||
stack[stack_top++] = frag;
|
||||
}
|
||||
}
|
||||
|
||||
return stack[0];
|
||||
}
|
||||
|
||||
// Computes epsilon closure of a set of NFA states
|
||||
static void get_epsilon_closure(int* input_states, int input_count, NFAState** all_nfa_states, int total_nfa_states, int** output_states, int* output_count) {
|
||||
bool* visited = (bool*)calloc(total_nfa_states, sizeof(bool));
|
||||
int* queue = (int*)malloc(total_nfa_states * sizeof(int));
|
||||
int head = 0, tail = 0;
|
||||
|
||||
for (int i = 0; i < input_count; i++) {
|
||||
int id = input_states[i];
|
||||
visited[id] = true;
|
||||
queue[tail++] = id;
|
||||
}
|
||||
|
||||
while (head < tail) {
|
||||
int curr_id = queue[head++];
|
||||
NFAState* s = all_nfa_states[curr_id];
|
||||
if (s->is_epsilon) {
|
||||
if (s->edge1 && !visited[s->edge1->id]) {
|
||||
visited[s->edge1->id] = true;
|
||||
queue[tail++] = s->edge1->id;
|
||||
}
|
||||
if (s->edge2 && !visited[s->edge2->id]) {
|
||||
visited[s->edge2->id] = true;
|
||||
queue[tail++] = s->edge2->id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
for (int i = 0; i < total_nfa_states; i++) {
|
||||
if (visited[i]) count++;
|
||||
}
|
||||
|
||||
int* res = (int*)malloc(count * sizeof(int));
|
||||
int idx = 0;
|
||||
for (int i = 0; i < total_nfa_states; i++) {
|
||||
if (visited[i]) {
|
||||
res[idx++] = i;
|
||||
}
|
||||
}
|
||||
|
||||
*output_states = res;
|
||||
*output_count = count;
|
||||
free(visited);
|
||||
free(queue);
|
||||
}
|
||||
|
||||
// Reports whether two NFA state-id lists are identical: same length and the
// same id at every position. Two empty lists compare equal.
static bool are_nfa_sets_equal(int* a, int a_count, int* b, int b_count) {
    bool same = (a_count == b_count);
    int pos = 0;

    while (same && pos < a_count) {
        same = (a[pos] == b[pos]);
        pos++;
    }
    return same;
}
|
||||
|
||||
// Compiles a set of regular expression patterns into a complete DFA using subset construction
|
||||
DFAState* slex_compile_regexes(char** patterns, int pattern_count, int* dfa_state_count_out) {
|
||||
free_all_nfa_states(); // Reset global state tracker
|
||||
|
||||
// 1. Build NFA for each pattern
|
||||
NFAFragment* fragments = (NFAFragment*)malloc(pattern_count * sizeof(NFAFragment));
|
||||
for (int i = 0; i < pattern_count; i++) {
|
||||
int t_count = 0, concat_count = 0, post_count = 0;
|
||||
RegexToken* tokens = tokenize_regex(patterns[i], &t_count);
|
||||
RegexToken* tokens_concat = insert_concat(tokens, t_count, &concat_count);
|
||||
RegexToken* tokens_postfix = infix_to_postfix(tokens_concat, concat_count, &post_count);
|
||||
|
||||
fragments[i] = build_nfa(tokens_postfix, post_count);
|
||||
fragments[i].accept->accept_rule_index = i;
|
||||
|
||||
free(tokens);
|
||||
free(tokens_concat);
|
||||
free(tokens_postfix);
|
||||
}
|
||||
|
||||
// 2. Create global start state with epsilon transitions to each pattern NFA's start state
|
||||
NFAState* global_start = create_nfa_state();
|
||||
global_start->is_epsilon = true;
|
||||
|
||||
NFAState* current_hub = global_start;
|
||||
for (int i = 0; i < pattern_count; i++) {
|
||||
if (i == pattern_count - 1) {
|
||||
current_hub->edge1 = fragments[i].start;
|
||||
} else {
|
||||
NFAState* next_hub = create_nfa_state();
|
||||
next_hub->is_epsilon = true;
|
||||
current_hub->edge1 = fragments[i].start;
|
||||
current_hub->edge2 = next_hub;
|
||||
current_hub = next_hub;
|
||||
}
|
||||
}
|
||||
free(fragments);
|
||||
|
||||
// 3. Subset construction
|
||||
int total_nfa_states = g_nfa_state_count;
|
||||
NFAState** all_nfa_states = g_nfa_states;
|
||||
|
||||
int dfa_capacity = 1024;
|
||||
int dfa_count = 0;
|
||||
DFAState* dfa_states = (DFAState*)malloc(dfa_capacity * sizeof(DFAState));
|
||||
|
||||
// Queue for subset construction
|
||||
int* work_queue = (int*)malloc(dfa_capacity * sizeof(int));
|
||||
int queue_head = 0, queue_tail = 0;
|
||||
|
||||
// Start state epsilon closure
|
||||
int start_nfa_id = global_start->id;
|
||||
int* start_closure = NULL;
|
||||
int start_closure_count = 0;
|
||||
get_epsilon_closure(&start_nfa_id, 1, all_nfa_states, total_nfa_states, &start_closure, &start_closure_count);
|
||||
|
||||
// Create start DFA state (0)
|
||||
dfa_states[dfa_count].id = dfa_count;
|
||||
dfa_states[dfa_count].nfa_states = start_closure;
|
||||
dfa_states[dfa_count].nfa_state_count = start_closure_count;
|
||||
memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
|
||||
dfa_states[dfa_count].accept_rule_index = -1;
|
||||
|
||||
work_queue[queue_tail++] = dfa_count;
|
||||
dfa_count++;
|
||||
|
||||
// Process queue
|
||||
while (queue_head < queue_tail) {
|
||||
int curr_dfa_id = work_queue[queue_head++];
|
||||
|
||||
// For each possible ASCII character transition
|
||||
for (int c = 0; c < 256; c++) {
|
||||
// Find NFA states reachable on character 'c'
|
||||
int* reachable = (int*)malloc(total_nfa_states * sizeof(int));
|
||||
int reachable_count = 0;
|
||||
|
||||
DFAState* curr_dfa = &dfa_states[curr_dfa_id];
|
||||
for (int i = 0; i < curr_dfa->nfa_state_count; i++) {
|
||||
NFAState* nfa_s = all_nfa_states[curr_dfa->nfa_states[i]];
|
||||
if (!nfa_s->is_epsilon && nfa_s->char_set[c]) {
|
||||
if (nfa_s->edge1) {
|
||||
reachable[reachable_count++] = nfa_s->edge1->id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (reachable_count > 0) {
|
||||
// Compute epsilon closure of reachable NFA states
|
||||
int* closure = NULL;
|
||||
int closure_count = 0;
|
||||
get_epsilon_closure(reachable, reachable_count, all_nfa_states, total_nfa_states, &closure, &closure_count);
|
||||
free(reachable);
|
||||
|
||||
// Check if this DFA state already exists
|
||||
int existing_id = -1;
|
||||
for (int d = 0; d < dfa_count; d++) {
|
||||
if (are_nfa_sets_equal(dfa_states[d].nfa_states, dfa_states[d].nfa_state_count, closure, closure_count)) {
|
||||
existing_id = d;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (existing_id != -1) {
|
||||
dfa_states[curr_dfa_id].transitions[c] = existing_id;
|
||||
free(closure);
|
||||
} else {
|
||||
if (dfa_count >= dfa_capacity) {
|
||||
dfa_capacity *= 2;
|
||||
dfa_states = (DFAState*)realloc(dfa_states, dfa_capacity * sizeof(DFAState));
|
||||
work_queue = (int*)realloc(work_queue, dfa_capacity * sizeof(int));
|
||||
}
|
||||
|
||||
dfa_states[dfa_count].id = dfa_count;
|
||||
dfa_states[dfa_count].nfa_states = closure;
|
||||
dfa_states[dfa_count].nfa_state_count = closure_count;
|
||||
memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
|
||||
dfa_states[dfa_count].accept_rule_index = -1;
|
||||
|
||||
dfa_states[curr_dfa_id].transitions[c] = dfa_count;
|
||||
work_queue[queue_tail++] = dfa_count;
|
||||
dfa_count++;
|
||||
}
|
||||
} else {
|
||||
free(reachable);
|
||||
dfa_states[curr_dfa_id].transitions[c] = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine accepting status of each DFA state based on NFA accept states
|
||||
for (int d = 0; d < dfa_count; d++) {
|
||||
int best_rule = -1;
|
||||
for (int i = 0; i < dfa_states[d].nfa_state_count; i++) {
|
||||
NFAState* nfa_s = all_nfa_states[dfa_states[d].nfa_states[i]];
|
||||
if (nfa_s->accept_rule_index != -1) {
|
||||
if (best_rule == -1 || nfa_s->accept_rule_index < best_rule) {
|
||||
best_rule = nfa_s->accept_rule_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
dfa_states[d].accept_rule_index = best_rule;
|
||||
}
|
||||
|
||||
free(work_queue);
|
||||
free_all_nfa_states(); // We no longer need the NFA states
|
||||
|
||||
*dfa_state_count_out = dfa_count;
|
||||
return dfa_states;
|
||||
}
|
||||
|
||||
// Releases a DFA table: the NFA-id list owned by each of the first
// `dfa_state_count` states, then the table itself.
// Passing NULL for `dfa_states` is a no-op.
void slex_free_dfa(DFAState* dfa_states, int dfa_state_count) {
    if (dfa_states == NULL) {
        return;
    }

    int remaining = dfa_state_count;
    while (remaining > 0) {
        remaining--;
        free(dfa_states[remaining].nfa_states);
    }
    free(dfa_states);
}
|
||||
Reference in New Issue
Block a user