Files
slex/Source/slex_regex.c
T
2026-05-26 04:45:32 +10:00

703 lines
26 KiB
C

#include "../Headers/slex_regex.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Token representation for Regex Parsing
typedef enum {
TOKEN_CHAR,
TOKEN_CHAR_SET,
TOKEN_CONCAT,
TOKEN_ALT,
TOKEN_STAR,
TOKEN_PLUS,
TOKEN_QUESTION,
TOKEN_LPAREN,
TOKEN_RPAREN,
} RegexTokenType;
typedef struct {
RegexTokenType type;
bool char_set[256];
} RegexToken;
// Global array to track all allocated NFA states for easy deallocation
static NFAState** g_nfa_states = NULL;
static int g_nfa_state_count = 0;
static int g_nfa_state_capacity = 0;
static NFAState* create_nfa_state() {
NFAState* s = (NFAState*)malloc(sizeof(NFAState));
s->id = g_nfa_state_count;
s->is_epsilon = false;
memset(s->char_set, 0, sizeof(s->char_set));
s->edge1 = NULL;
s->edge2 = NULL;
s->accept_rule_index = -1;
// Track state globally
if (g_nfa_state_count >= g_nfa_state_capacity) {
g_nfa_state_capacity = g_nfa_state_capacity == 0 ? 1024 : g_nfa_state_capacity * 2;
g_nfa_states = (NFAState**)realloc(g_nfa_states, g_nfa_state_capacity * sizeof(NFAState*));
}
g_nfa_states[g_nfa_state_count++] = s;
return s;
}
static void free_all_nfa_states() {
for (int i = 0; i < g_nfa_state_count; i++) {
free(g_nfa_states[i]);
}
free(g_nfa_states);
g_nfa_states = NULL;
g_nfa_state_count = 0;
g_nfa_state_capacity = 0;
}
// Tokenize a regex pattern
static RegexToken* tokenize_regex(const char* pattern, int* token_count_out) {
int capacity = 128;
int count = 0;
RegexToken* tokens = (RegexToken*)malloc(capacity * sizeof(RegexToken));
int len = (int)strlen(pattern);
int idx = 0;
while (idx < len) {
if (count >= capacity) {
capacity *= 2;
tokens = (RegexToken*)realloc(tokens, capacity * sizeof(RegexToken));
}
char c = pattern[idx];
if (c == '\\') {
idx++;
if (idx >= len) {
// Trailing backslash, treat as literal backslash
tokens[count].type = TOKEN_CHAR;
memset(tokens[count].char_set, 0, 256);
tokens[count].char_set[(unsigned char)'\\'] = true;
count++;
break;
}
char esc = pattern[idx++];
tokens[count].type = TOKEN_CHAR_SET;
memset(tokens[count].char_set, 0, 256);
if (esc == 'p' && idx < len && pattern[idx] == '{') {
idx++; // skip '{'
char prop[256];
int p_idx = 0;
while (idx < len && pattern[idx] != '}') {
prop[p_idx++] = pattern[idx++];
}
prop[p_idx] = '\0';
if (idx < len && pattern[idx] == '}') {
idx++; // skip '}'
}
if (strcmp(prop, "P") == 0) {
const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
for (int k = 0; punct[k] != '\0'; k++) {
tokens[count].char_set[(unsigned char)punct[k]] = true;
}
} else if (strcmp(prop, "S") == 0) {
const char* sym = "$+<=>^`|~";
for (int k = 0; sym[k] != '\0'; k++) {
tokens[count].char_set[(unsigned char)sym[k]] = true;
}
} else if (strcmp(prop, "L") == 0) {
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
} else if (strcmp(prop, "N") == 0) {
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
}
} else if (esc == 'n') {
tokens[count].char_set[10] = true; // LF
} else if (esc == 't') {
tokens[count].char_set[9] = true; // TAB
} else if (esc == 'r') {
tokens[count].char_set[13] = true; // CR
} else if (esc == 's') {
tokens[count].char_set[32] = true; // Space
tokens[count].char_set[9] = true; // TAB
tokens[count].char_set[13] = true; // CR
tokens[count].char_set[10] = true; // LF
} else if (esc == 'd') {
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
} else if (esc == 'w') {
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
tokens[count].char_set[(unsigned char)'_'] = true;
} else {
// Literal escaped character
tokens[count].type = TOKEN_CHAR;
tokens[count].char_set[(unsigned char)esc] = true;
}
count++;
} else if (c == '[') {
idx++;
bool negate = false;
if (idx < len && pattern[idx] == '^') {
negate = true;
idx++;
}
tokens[count].type = TOKEN_CHAR_SET;
memset(tokens[count].char_set, 0, 256);
while (idx < len && pattern[idx] != ']') {
char c1 = pattern[idx++];
if (c1 == '\\' && idx < len) {
char esc = pattern[idx++];
if (esc == 'p' && idx < len && pattern[idx] == '{') {
idx++; // skip '{'
char prop[256];
int p_idx = 0;
while (idx < len && pattern[idx] != '}') {
prop[p_idx++] = pattern[idx++];
}
prop[p_idx] = '\0';
if (idx < len && pattern[idx] == '}') {
idx++; // skip '}'
}
if (strcmp(prop, "P") == 0) {
const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
for (int k = 0; punct[k] != '\0'; k++) {
tokens[count].char_set[(unsigned char)punct[k]] = true;
}
} else if (strcmp(prop, "S") == 0) {
const char* sym = "$+<=>^`|~";
for (int k = 0; sym[k] != '\0'; k++) {
tokens[count].char_set[(unsigned char)sym[k]] = true;
}
} else if (strcmp(prop, "L") == 0) {
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
} else if (strcmp(prop, "N") == 0) {
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
}
continue;
} else if (esc == 'n') c1 = '\n';
else if (esc == 't') c1 = '\t';
else if (esc == 'r') c1 = '\r';
else if (esc == 's') {
tokens[count].char_set[32] = true;
tokens[count].char_set[9] = true;
tokens[count].char_set[13] = true;
tokens[count].char_set[10] = true;
continue;
} else if (esc == 'd') {
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
continue;
} else if (esc == 'w') {
for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
tokens[count].char_set[(unsigned char)'_'] = true;
continue;
} else {
c1 = esc;
}
}
// Check range: c1-c2
if (idx + 1 < len && pattern[idx] == '-' && pattern[idx + 1] != ']') {
idx++; // skip '-'
char c2 = pattern[idx++];
if (c2 == '\\' && idx < len) {
char esc = pattern[idx++];
if (esc == 'n') c2 = '\n';
else if (esc == 't') c2 = '\t';
else if (esc == 'r') c2 = '\r';
else c2 = esc;
}
for (int r = (unsigned char)c1; r <= (unsigned char)c2; r++) {
tokens[count].char_set[r] = true;
}
} else {
tokens[count].char_set[(unsigned char)c1] = true;
}
}
if (idx < len && pattern[idx] == ']') {
idx++;
}
if (negate) {
for (int i = 0; i < 256; i++) {
tokens[count].char_set[i] = !tokens[count].char_set[i];
}
}
count++;
} else if (c == '.') {
tokens[count].type = TOKEN_CHAR_SET;
memset(tokens[count].char_set, 0, 256);
for (int i = 0; i < 256; i++) {
if (i != 10) { // any character except newline
tokens[count].char_set[i] = true;
}
}
count++;
idx++;
} else if (c == '*') {
tokens[count].type = TOKEN_STAR;
count++;
idx++;
} else if (c == '+') {
tokens[count].type = TOKEN_PLUS;
count++;
idx++;
} else if (c == '?') {
tokens[count].type = TOKEN_QUESTION;
count++;
idx++;
} else if (c == '|') {
tokens[count].type = TOKEN_ALT;
count++;
idx++;
} else if (c == '(') {
tokens[count].type = TOKEN_LPAREN;
count++;
idx++;
} else if (c == ')') {
tokens[count].type = TOKEN_RPAREN;
count++;
idx++;
} else {
tokens[count].type = TOKEN_CHAR;
memset(tokens[count].char_set, 0, 256);
tokens[count].char_set[(unsigned char)c] = true;
count++;
idx++;
}
}
*token_count_out = count;
return tokens;
}
// Insert explicit concatenation operators
static RegexToken* insert_concat(RegexToken* input, int input_count, int* output_count_out) {
int capacity = input_count * 2;
int count = 0;
RegexToken* output = (RegexToken*)malloc(capacity * sizeof(RegexToken));
for (int i = 0; i < input_count; i++) {
if (count >= capacity) {
capacity *= 2;
output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
}
output[count++] = input[i];
if (i + 1 < input_count) {
RegexTokenType t1 = input[i].type;
RegexTokenType t2 = input[i + 1].type;
bool t1_can_concat = (t1 == TOKEN_CHAR || t1 == TOKEN_CHAR_SET || t1 == TOKEN_STAR || t1 == TOKEN_PLUS || t1 == TOKEN_QUESTION || t1 == TOKEN_RPAREN);
bool t2_can_concat = (t2 == TOKEN_CHAR || t2 == TOKEN_CHAR_SET || t2 == TOKEN_LPAREN);
if (t1_can_concat && t2_can_concat) {
if (count >= capacity) {
capacity *= 2;
output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
}
output[count].type = TOKEN_CONCAT;
memset(output[count].char_set, 0, 256);
count++;
}
}
}
*output_count_out = count;
return output;
}
// Shunting-yard algorithm to convert infix tokens to postfix tokens
static RegexToken* infix_to_postfix(RegexToken* infix, int infix_count, int* postfix_count_out) {
int capacity = infix_count;
int postfix_count = 0;
RegexToken* postfix = (RegexToken*)malloc(capacity * sizeof(RegexToken));
RegexToken stack[512];
int stack_top = 0;
for (int i = 0; i < infix_count; i++) {
RegexToken t = infix[i];
if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
if (postfix_count >= capacity) {
capacity *= 2;
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
}
postfix[postfix_count++] = t;
} else if (t.type == TOKEN_LPAREN) {
stack[stack_top++] = t;
} else if (t.type == TOKEN_RPAREN) {
while (stack_top > 0 && stack[stack_top - 1].type != TOKEN_LPAREN) {
if (postfix_count >= capacity) {
capacity *= 2;
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
}
postfix[postfix_count++] = stack[--stack_top];
}
if (stack_top > 0) {
stack_top--; // pop LPAREN
}
} else if (t.type == TOKEN_STAR || t.type == TOKEN_PLUS || t.type == TOKEN_QUESTION) {
// Unary operators have highest precedence and are postfix, output immediately
if (postfix_count >= capacity) {
capacity *= 2;
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
}
postfix[postfix_count++] = t;
} else {
// Binary operators (CONCAT, ALT)
int p_curr = (t.type == TOKEN_ALT) ? 1 : 2;
while (stack_top > 0) {
RegexTokenType top_type = stack[stack_top - 1].type;
if (top_type == TOKEN_CONCAT || top_type == TOKEN_ALT) {
int p_top = (top_type == TOKEN_ALT) ? 1 : 2;
if (p_top >= p_curr) {
if (postfix_count >= capacity) {
capacity *= 2;
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
}
postfix[postfix_count++] = stack[--stack_top];
} else {
break;
}
} else {
break;
}
}
stack[stack_top++] = t;
}
}
while (stack_top > 0) {
if (postfix_count >= capacity) {
capacity *= 2;
postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
}
postfix[postfix_count++] = stack[--stack_top];
}
*postfix_count_out = postfix_count;
return postfix;
}
// Build NFA from postfix tokens using Thompson's construction
static NFAFragment build_nfa(RegexToken* postfix, int postfix_count) {
NFAFragment stack[512];
int stack_top = 0;
for (int i = 0; i < postfix_count; i++) {
RegexToken t = postfix[i];
if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
NFAState* start = create_nfa_state();
NFAState* accept = create_nfa_state();
start->is_epsilon = false;
memcpy(start->char_set, t.char_set, 256);
start->edge1 = accept;
NFAFragment frag = {start, accept};
stack[stack_top++] = frag;
} else if (t.type == TOKEN_CONCAT) {
NFAFragment f2 = stack[--stack_top];
NFAFragment f1 = stack[--stack_top];
f1.accept->is_epsilon = true;
f1.accept->edge1 = f2.start;
NFAFragment frag = {f1.start, f2.accept};
stack[stack_top++] = frag;
} else if (t.type == TOKEN_ALT) {
NFAFragment f2 = stack[--stack_top];
NFAFragment f1 = stack[--stack_top];
NFAState* start = create_nfa_state();
NFAState* accept = create_nfa_state();
start->is_epsilon = true;
start->edge1 = f1.start;
start->edge2 = f2.start;
f1.accept->is_epsilon = true;
f1.accept->edge1 = accept;
f2.accept->is_epsilon = true;
f2.accept->edge1 = accept;
NFAFragment frag = {start, accept};
stack[stack_top++] = frag;
} else if (t.type == TOKEN_STAR) {
NFAFragment f1 = stack[--stack_top];
NFAState* start = create_nfa_state();
NFAState* accept = create_nfa_state();
start->is_epsilon = true;
start->edge1 = f1.start;
start->edge2 = accept;
f1.accept->is_epsilon = true;
f1.accept->edge1 = f1.start;
f1.accept->edge2 = accept;
NFAFragment frag = {start, accept};
stack[stack_top++] = frag;
} else if (t.type == TOKEN_PLUS) {
NFAFragment f1 = stack[--stack_top];
NFAState* start = create_nfa_state();
NFAState* accept = create_nfa_state();
start->is_epsilon = true;
start->edge1 = f1.start;
f1.accept->is_epsilon = true;
f1.accept->edge1 = f1.start;
f1.accept->edge2 = accept;
NFAFragment frag = {start, accept};
stack[stack_top++] = frag;
} else if (t.type == TOKEN_QUESTION) {
NFAFragment f1 = stack[--stack_top];
NFAState* start = create_nfa_state();
NFAState* accept = create_nfa_state();
start->is_epsilon = true;
start->edge1 = f1.start;
start->edge2 = accept;
f1.accept->is_epsilon = true;
f1.accept->edge1 = accept;
NFAFragment frag = {start, accept};
stack[stack_top++] = frag;
}
}
return stack[0];
}
// Computes epsilon closure of a set of NFA states
static void get_epsilon_closure(int* input_states, int input_count, NFAState** all_nfa_states, int total_nfa_states, int** output_states, int* output_count) {
bool* visited = (bool*)calloc(total_nfa_states, sizeof(bool));
int* queue = (int*)malloc(total_nfa_states * sizeof(int));
int head = 0, tail = 0;
for (int i = 0; i < input_count; i++) {
int id = input_states[i];
visited[id] = true;
queue[tail++] = id;
}
while (head < tail) {
int curr_id = queue[head++];
NFAState* s = all_nfa_states[curr_id];
if (s->is_epsilon) {
if (s->edge1 && !visited[s->edge1->id]) {
visited[s->edge1->id] = true;
queue[tail++] = s->edge1->id;
}
if (s->edge2 && !visited[s->edge2->id]) {
visited[s->edge2->id] = true;
queue[tail++] = s->edge2->id;
}
}
}
int count = 0;
for (int i = 0; i < total_nfa_states; i++) {
if (visited[i]) count++;
}
int* res = (int*)malloc(count * sizeof(int));
int idx = 0;
for (int i = 0; i < total_nfa_states; i++) {
if (visited[i]) {
res[idx++] = i;
}
}
*output_states = res;
*output_count = count;
free(visited);
free(queue);
}
// Compare two NFA state sets
static bool are_nfa_sets_equal(int* a, int a_count, int* b, int b_count) {
if (a_count != b_count) return false;
for (int i = 0; i < a_count; i++) {
if (a[i] != b[i]) return false;
}
return true;
}
// Compiles a set of regular expression patterns into a complete DFA using subset construction
DFAState* slex_compile_regexes(char** patterns, int pattern_count, int* dfa_state_count_out) {
free_all_nfa_states(); // Reset global state tracker
// 1. Build NFA for each pattern
NFAFragment* fragments = (NFAFragment*)malloc(pattern_count * sizeof(NFAFragment));
for (int i = 0; i < pattern_count; i++) {
int t_count = 0, concat_count = 0, post_count = 0;
RegexToken* tokens = tokenize_regex(patterns[i], &t_count);
RegexToken* tokens_concat = insert_concat(tokens, t_count, &concat_count);
RegexToken* tokens_postfix = infix_to_postfix(tokens_concat, concat_count, &post_count);
fragments[i] = build_nfa(tokens_postfix, post_count);
fragments[i].accept->accept_rule_index = i;
free(tokens);
free(tokens_concat);
free(tokens_postfix);
}
// 2. Create global start state with epsilon transitions to each pattern NFA's start state
NFAState* global_start = create_nfa_state();
global_start->is_epsilon = true;
NFAState* current_hub = global_start;
for (int i = 0; i < pattern_count; i++) {
if (i == pattern_count - 1) {
current_hub->edge1 = fragments[i].start;
} else {
NFAState* next_hub = create_nfa_state();
next_hub->is_epsilon = true;
current_hub->edge1 = fragments[i].start;
current_hub->edge2 = next_hub;
current_hub = next_hub;
}
}
free(fragments);
// 3. Subset construction
int total_nfa_states = g_nfa_state_count;
NFAState** all_nfa_states = g_nfa_states;
int dfa_capacity = 1024;
int dfa_count = 0;
DFAState* dfa_states = (DFAState*)malloc(dfa_capacity * sizeof(DFAState));
// Queue for subset construction
int* work_queue = (int*)malloc(dfa_capacity * sizeof(int));
int queue_head = 0, queue_tail = 0;
// Start state epsilon closure
int start_nfa_id = global_start->id;
int* start_closure = NULL;
int start_closure_count = 0;
get_epsilon_closure(&start_nfa_id, 1, all_nfa_states, total_nfa_states, &start_closure, &start_closure_count);
// Create start DFA state (0)
dfa_states[dfa_count].id = dfa_count;
dfa_states[dfa_count].nfa_states = start_closure;
dfa_states[dfa_count].nfa_state_count = start_closure_count;
memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
dfa_states[dfa_count].accept_rule_index = -1;
work_queue[queue_tail++] = dfa_count;
dfa_count++;
// Process queue
while (queue_head < queue_tail) {
int curr_dfa_id = work_queue[queue_head++];
// For each possible ASCII character transition
for (int c = 0; c < 256; c++) {
// Find NFA states reachable on character 'c'
int* reachable = (int*)malloc(total_nfa_states * sizeof(int));
int reachable_count = 0;
DFAState* curr_dfa = &dfa_states[curr_dfa_id];
for (int i = 0; i < curr_dfa->nfa_state_count; i++) {
NFAState* nfa_s = all_nfa_states[curr_dfa->nfa_states[i]];
if (!nfa_s->is_epsilon && nfa_s->char_set[c]) {
if (nfa_s->edge1) {
reachable[reachable_count++] = nfa_s->edge1->id;
}
}
}
if (reachable_count > 0) {
// Compute epsilon closure of reachable NFA states
int* closure = NULL;
int closure_count = 0;
get_epsilon_closure(reachable, reachable_count, all_nfa_states, total_nfa_states, &closure, &closure_count);
free(reachable);
// Check if this DFA state already exists
int existing_id = -1;
for (int d = 0; d < dfa_count; d++) {
if (are_nfa_sets_equal(dfa_states[d].nfa_states, dfa_states[d].nfa_state_count, closure, closure_count)) {
existing_id = d;
break;
}
}
if (existing_id != -1) {
dfa_states[curr_dfa_id].transitions[c] = existing_id;
free(closure);
} else {
if (dfa_count >= dfa_capacity) {
dfa_capacity *= 2;
dfa_states = (DFAState*)realloc(dfa_states, dfa_capacity * sizeof(DFAState));
work_queue = (int*)realloc(work_queue, dfa_capacity * sizeof(int));
}
dfa_states[dfa_count].id = dfa_count;
dfa_states[dfa_count].nfa_states = closure;
dfa_states[dfa_count].nfa_state_count = closure_count;
memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
dfa_states[dfa_count].accept_rule_index = -1;
dfa_states[curr_dfa_id].transitions[c] = dfa_count;
work_queue[queue_tail++] = dfa_count;
dfa_count++;
}
} else {
free(reachable);
dfa_states[curr_dfa_id].transitions[c] = -1;
}
}
}
// Determine accepting status of each DFA state based on NFA accept states
for (int d = 0; d < dfa_count; d++) {
int best_rule = -1;
for (int i = 0; i < dfa_states[d].nfa_state_count; i++) {
NFAState* nfa_s = all_nfa_states[dfa_states[d].nfa_states[i]];
if (nfa_s->accept_rule_index != -1) {
if (best_rule == -1 || nfa_s->accept_rule_index < best_rule) {
best_rule = nfa_s->accept_rule_index;
}
}
}
dfa_states[d].accept_rule_index = best_rule;
}
free(work_queue);
free_all_nfa_states(); // We no longer need the NFA states
*dfa_state_count_out = dfa_count;
return dfa_states;
}
void slex_free_dfa(DFAState* dfa_states, int dfa_state_count) {
if (dfa_states) {
for (int i = 0; i < dfa_state_count; i++) {
free(dfa_states[i].nfa_states);
}
free(dfa_states);
}
}