Finished the implementation with Antigravity.

2026-05-26 04:45:32 +10:00
parent 7d974680a6
commit 2950db1efb
10 changed files with 1760 additions and 5 deletions
@@ -0,0 +1,702 @@
+#include "../Headers/slex_regex.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Token kinds produced by tokenize_regex() and consumed by the concat-insertion, postfix and NFA-building stages
+typedef enum {
+    TOKEN_CHAR,      // one literal character (plain or backslash-escaped)
+    TOKEN_CHAR_SET,  // a set of characters: [...], '.', \d, \w, \s, \p{...}, \n, \t, \r
+    TOKEN_CONCAT,    // explicit concatenation operator; never in the pattern text, added by insert_concat()
+    TOKEN_ALT,       // '|'
+    TOKEN_STAR,      // '*'
+    TOKEN_PLUS,      // '+'
+    TOKEN_QUESTION,  // '?'
+    TOKEN_LPAREN,    // '('
+    TOKEN_RPAREN,    // ')'
+} RegexTokenType;
+
+typedef struct {
+    RegexTokenType type;  // which kind of token this is
+    bool char_set[256];   // membership flag per byte value; build_nfa() reads it only for TOKEN_CHAR / TOKEN_CHAR_SET
+} RegexToken;
+
+// Global array to track all allocated NFA states for easy deallocation
+static NFAState** g_nfa_states = NULL;  // every state made by create_nfa_state(); a state's id is its index in this array
+static int g_nfa_state_count = 0;       // number of states currently registered
+static int g_nfa_state_capacity = 0;    // number of pointer slots allocated in g_nfa_states
+
+// Allocates a new NFA state, gives it the next sequential id and records it in
+// the global registry so free_all_nfa_states() can release it later. The id is
+// also the state's index in g_nfa_states.
+//
+// The new state is a non-epsilon state with an empty character set, no
+// outgoing edges and accept_rule_index == -1 (not accepting).
+//
+// Never returns NULL: callers dereference the result unconditionally, so an
+// allocation failure is reported on stderr and the process is aborted.
+static NFAState* create_nfa_state(void) {
+    // Grow the registry before allocating the state so that a failed growth
+    // never leaves an untracked state behind. The result of realloc goes into
+    // a temporary: on failure the old array is still valid and still owned.
+    if (g_nfa_state_count >= g_nfa_state_capacity) {
+        int new_capacity = g_nfa_state_capacity == 0 ? 1024 : g_nfa_state_capacity * 2;
+        NFAState** grown = (NFAState**)realloc(g_nfa_states, (size_t)new_capacity * sizeof(NFAState*));
+        if (grown == NULL) {
+            fprintf(stderr, "slex: out of memory while growing the NFA state registry\n");
+            abort();
+        }
+        g_nfa_states = grown;
+        g_nfa_state_capacity = new_capacity;
+    }
+
+    NFAState* s = (NFAState*)malloc(sizeof(NFAState));
+    if (s == NULL) {
+        fprintf(stderr, "slex: out of memory while allocating an NFA state\n");
+        abort();
+    }
+
+    s->id = g_nfa_state_count;
+    s->is_epsilon = false;
+    memset(s->char_set, 0, sizeof(s->char_set));
+    s->edge1 = NULL;
+    s->edge2 = NULL;
+    s->accept_rule_index = -1;
+
+    // Track state globally
+    g_nfa_states[g_nfa_state_count++] = s;
+    return s;
+}
+
+// Releases every NFA state held in the global registry, then the registry
+// array itself, and resets the bookkeeping so that the next call to
+// create_nfa_state() starts again from id 0.
+static void free_all_nfa_states() {
+    int remaining = g_nfa_state_count;
+    while (remaining > 0) {
+        remaining--;
+        free(g_nfa_states[remaining]);
+    }
+    free(g_nfa_states);
+
+    g_nfa_state_capacity = 0;
+    g_nfa_state_count = 0;
+    g_nfa_states = NULL;
+}
+
+// Clears every membership flag of a token's character set.
+static void regex_token_clear_set(RegexToken* tok) {
+    memset(tok->char_set, 0, sizeof(tok->char_set));
+}
+
+// Adds every byte value in the inclusive range [lo, hi] to `set`.
+// An inverted range (lo > hi) adds nothing.
+static void regex_set_add_range(bool* set, int lo, int hi) {
+    for (int ch = lo; ch <= hi; ch++) {
+        set[ch] = true;
+    }
+}
+
+// Adds every byte of the NUL-terminated string `members` to `set`.
+static void regex_set_add_chars(bool* set, const char* members) {
+    for (int k = 0; members[k] != '\0'; k++) {
+        set[(unsigned char)members[k]] = true;
+    }
+}
+
+// Adds the class named by a shorthand escape letter to `set`:
+//   \s -> space, TAB, CR, LF     \d -> 0-9     \w -> 0-9, a-z, A-Z, '_'
+// Returns false, leaving `set` untouched, for any other letter.
+static bool regex_set_add_shorthand(bool* set, char esc) {
+    switch (esc) {
+    case 's':
+        regex_set_add_chars(set, " \t\r\n");
+        return true;
+    case 'd':
+        regex_set_add_range(set, '0', '9');
+        return true;
+    case 'w':
+        regex_set_add_range(set, '0', '9');
+        regex_set_add_range(set, 'a', 'z');
+        regex_set_add_range(set, 'A', 'Z');
+        set[(unsigned char)'_'] = true;
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Maps the escape letters n, t and r to LF, TAB and CR; any other character
+// stands for itself.
+static char regex_unescape_char(char esc) {
+    switch (esc) {
+    case 'n': return '\n';
+    case 't': return '\t';
+    case 'r': return '\r';
+    default:  return esc;
+    }
+}
+
+// Parses the rest of a \p{NAME} escape and adds the named class to `set`.
+// `idx` is the index just past the '{'. Recognised names are P (ASCII
+// punctuation), S (ASCII symbols), L (ASCII letters) and N (ASCII digits);
+// any other name adds nothing. A missing '}' consumes the rest of the pattern.
+// Returns the index of the first character after the escape.
+static int regex_set_add_property(bool* set, const char* pattern, int len, int idx) {
+    char prop[256];
+    size_t prop_len = 0;
+
+    while (idx < len && pattern[idx] != '}') {
+        // Keep consuming an over-long name but stop storing it, so the
+        // buffer cannot overflow. A truncated name is still longer than any
+        // recognised name and therefore matches nothing.
+        if (prop_len < sizeof(prop) - 1) {
+            prop[prop_len++] = pattern[idx];
+        }
+        idx++;
+    }
+    prop[prop_len] = '\0';
+    if (idx < len && pattern[idx] == '}') {
+        idx++; // skip '}'
+    }
+
+    if (strcmp(prop, "P") == 0) {
+        regex_set_add_chars(set, "!\"#%&'()*,-./:;?@[\\]_{}");
+    } else if (strcmp(prop, "S") == 0) {
+        regex_set_add_chars(set, "$+<=>^`|~");
+    } else if (strcmp(prop, "L") == 0) {
+        regex_set_add_range(set, 'a', 'z');
+        regex_set_add_range(set, 'A', 'Z');
+    } else if (strcmp(prop, "N") == 0) {
+        regex_set_add_range(set, '0', '9');
+    }
+    return idx;
+}
+
+// Parses a bracket expression into `tok`. `idx` is the index just past the
+// opening '['. Supports a leading '^' for negation, c1-c2 ranges, and the
+// escapes \n \t \r \s \d \w \p{...}; any other escaped character is literal.
+// The class ends at the first ']' (or at the end of the pattern if there is
+// none). Returns the index of the first character after the class.
+static int regex_parse_class(RegexToken* tok, const char* pattern, int len, int idx) {
+    bool negate = false;
+    if (idx < len && pattern[idx] == '^') {
+        negate = true;
+        idx++;
+    }
+
+    tok->type = TOKEN_CHAR_SET;
+    regex_token_clear_set(tok);
+
+    while (idx < len && pattern[idx] != ']') {
+        char c1 = pattern[idx++];
+        if (c1 == '\\' && idx < len) {
+            char esc = pattern[idx++];
+            if (esc == 'p' && idx < len && pattern[idx] == '{') {
+                idx = regex_set_add_property(tok->char_set, pattern, len, idx + 1);
+                continue;
+            }
+            if (regex_set_add_shorthand(tok->char_set, esc)) {
+                continue; // a multi-character class cannot start a range
+            }
+            c1 = regex_unescape_char(esc);
+        }
+
+        // Check range: c1-c2. A '-' directly before ']' is a literal '-'.
+        if (idx + 1 < len && pattern[idx] == '-' && pattern[idx + 1] != ']') {
+            idx++; // skip '-'
+            char c2 = pattern[idx++];
+            if (c2 == '\\' && idx < len) {
+                c2 = regex_unescape_char(pattern[idx++]);
+            }
+            regex_set_add_range(tok->char_set, (unsigned char)c1, (unsigned char)c2);
+        } else {
+            tok->char_set[(unsigned char)c1] = true;
+        }
+    }
+
+    if (idx < len && pattern[idx] == ']') {
+        idx++;
+    }
+
+    if (negate) {
+        for (int i = 0; i < 256; i++) {
+            tok->char_set[i] = !tok->char_set[i];
+        }
+    }
+    return idx;
+}
+
+// Tokenize a regex pattern.
+//
+// Splits `pattern` into a heap-allocated array of RegexToken and stores the
+// number of tokens in *token_count_out. Operators (* + ? | ( )) become their
+// own token kinds; '.' becomes a set of every byte except LF; escapes and
+// bracket expressions become TOKEN_CHAR / TOKEN_CHAR_SET tokens. A trailing
+// lone backslash is treated as a literal backslash. Every token's char_set is
+// zero-initialised, including operator tokens.
+//
+// The caller owns the returned array and releases it with free(). Never
+// returns NULL: an allocation failure is reported on stderr and the process
+// is aborted.
+static RegexToken* tokenize_regex(const char* pattern, int* token_count_out) {
+    int capacity = 128;
+    int count = 0;
+    RegexToken* tokens = (RegexToken*)malloc((size_t)capacity * sizeof(RegexToken));
+    if (tokens == NULL) {
+        fprintf(stderr, "slex: out of memory while tokenizing a regex\n");
+        abort();
+    }
+    int len = (int)strlen(pattern);
+    int idx = 0;
+
+    // Each iteration appends exactly one token, so one capacity check at the
+    // top of the loop is sufficient.
+    while (idx < len) {
+        if (count >= capacity) {
+            int new_capacity = capacity * 2;
+            RegexToken* grown = (RegexToken*)realloc(tokens, (size_t)new_capacity * sizeof(RegexToken));
+            if (grown == NULL) {
+                fprintf(stderr, "slex: out of memory while tokenizing a regex\n");
+                abort();
+            }
+            tokens = grown;
+            capacity = new_capacity;
+        }
+
+        RegexToken* tok = &tokens[count];
+        regex_token_clear_set(tok);
+
+        char c = pattern[idx];
+
+        if (c == '\\') {
+            idx++;
+            if (idx >= len) {
+                // Trailing backslash, treat as literal backslash
+                tok->type = TOKEN_CHAR;
+                tok->char_set[(unsigned char)'\\'] = true;
+                count++;
+                break;
+            }
+            char esc = pattern[idx++];
+            tok->type = TOKEN_CHAR_SET;
+
+            if (esc == 'p' && idx < len && pattern[idx] == '{') {
+                idx = regex_set_add_property(tok->char_set, pattern, len, idx + 1);
+            } else if (esc == 'n' || esc == 't' || esc == 'r') {
+                tok->char_set[(unsigned char)regex_unescape_char(esc)] = true;
+            } else if (!regex_set_add_shorthand(tok->char_set, esc)) {
+                // Literal escaped character
+                tok->type = TOKEN_CHAR;
+                tok->char_set[(unsigned char)esc] = true;
+            }
+        } else if (c == '[') {
+            idx = regex_parse_class(tok, pattern, len, idx + 1);
+        } else {
+            idx++;
+            switch (c) {
+            case '.':
+                // any character except newline
+                tok->type = TOKEN_CHAR_SET;
+                regex_set_add_range(tok->char_set, 0, 255);
+                tok->char_set[10] = false;
+                break;
+            case '*':
+                tok->type = TOKEN_STAR;
+                break;
+            case '+':
+                tok->type = TOKEN_PLUS;
+                break;
+            case '?':
+                tok->type = TOKEN_QUESTION;
+                break;
+            case '|':
+                tok->type = TOKEN_ALT;
+                break;
+            case '(':
+                tok->type = TOKEN_LPAREN;
+                break;
+            case ')':
+                tok->type = TOKEN_RPAREN;
+                break;
+            default:
+                tok->type = TOKEN_CHAR;
+                tok->char_set[(unsigned char)c] = true;
+                break;
+            }
+        }
+        count++;
+    }
+
+    *token_count_out = count;
+    return tokens;
+}
+
+// Insert explicit concatenation operators.
+//
+// Copies `input` into a new heap-allocated array, adding a TOKEN_CONCAT
+// between every adjacent pair (t1, t2) where t1 can end an operand (a
+// character, a character set, a postfix operator or ')') and t2 can start one
+// (a character, a character set or '('). The number of tokens written is
+// stored in *output_count_out.
+//
+// The caller owns the returned array and releases it with free(). `input` is
+// not modified. Never returns NULL: an allocation failure is reported on
+// stderr and the process is aborted.
+static RegexToken* insert_concat(RegexToken* input, int input_count, int* output_count_out) {
+    // Every input token is followed by at most one CONCAT, so 2 * input_count
+    // slots are always enough and the array never needs to grow. At least one
+    // slot is requested so an empty input does not become malloc(0).
+    size_t slots = input_count > 0 ? (size_t)input_count * 2 : 1;
+    RegexToken* output = (RegexToken*)malloc(slots * sizeof(RegexToken));
+    if (output == NULL) {
+        fprintf(stderr, "slex: out of memory while inserting concatenation operators\n");
+        abort();
+    }
+    int count = 0;
+
+    for (int i = 0; i < input_count; i++) {
+        output[count++] = input[i];
+
+        if (i + 1 >= input_count) {
+            break; // the last token has no right neighbour
+        }
+
+        RegexTokenType t1 = input[i].type;
+        RegexTokenType t2 = input[i + 1].type;
+
+        bool t1_can_concat = (t1 == TOKEN_CHAR || t1 == TOKEN_CHAR_SET || t1 == TOKEN_STAR || t1 == TOKEN_PLUS || t1 == TOKEN_QUESTION || t1 == TOKEN_RPAREN);
+        bool t2_can_concat = (t2 == TOKEN_CHAR || t2 == TOKEN_CHAR_SET || t2 == TOKEN_LPAREN);
+
+        if (t1_can_concat && t2_can_concat) {
+            output[count].type = TOKEN_CONCAT;
+            memset(output[count].char_set, 0, sizeof(output[count].char_set));
+            count++;
+        }
+    }
+
+    *output_count_out = count;
+    return output;
+}
+
+// Shunting-yard algorithm to convert infix tokens to postfix tokens.
+//
+// Precedence, highest first: the postfix operators * + ? (emitted as soon as
+// they are seen), then concatenation, then alternation. Both binary operators
+// are left-associative. Parentheses are consumed and do not appear in the
+// output, except that an unmatched '(' is flushed to the output at the end.
+// An unmatched ')' pops whatever operators are pending and is dropped.
+//
+// Returns a new heap-allocated array and stores its length in
+// *postfix_count_out. The caller owns the array and releases it with free().
+// `infix` is not modified. Never returns NULL: an allocation failure is
+// reported on stderr and the process is aborted.
+static RegexToken* infix_to_postfix(RegexToken* infix, int infix_count, int* postfix_count_out) {
+    // Every input token is emitted at most once and pushed at most once, so
+    // infix_count slots are enough for both the output and the operator stack.
+    // The stack stores indices into `infix` rather than whole tokens, and it
+    // lives on the heap so deep nesting cannot overflow a fixed-size array.
+    size_t slots = infix_count > 0 ? (size_t)infix_count : 1;
+    RegexToken* postfix = (RegexToken*)malloc(slots * sizeof(RegexToken));
+    int* op_stack = (int*)malloc(slots * sizeof(int));
+    if (postfix == NULL || op_stack == NULL) {
+        fprintf(stderr, "slex: out of memory while converting a regex to postfix\n");
+        abort();
+    }
+    int postfix_count = 0;
+    int stack_top = 0;
+
+    for (int i = 0; i < infix_count; i++) {
+        RegexTokenType type = infix[i].type;
+
+        switch (type) {
+        case TOKEN_CHAR:
+        case TOKEN_CHAR_SET:
+            postfix[postfix_count++] = infix[i];
+            break;
+
+        case TOKEN_STAR:
+        case TOKEN_PLUS:
+        case TOKEN_QUESTION:
+            // Unary operators have highest precedence and are postfix, output immediately
+            postfix[postfix_count++] = infix[i];
+            break;
+
+        case TOKEN_LPAREN:
+            op_stack[stack_top++] = i;
+            break;
+
+        case TOKEN_RPAREN:
+            while (stack_top > 0 && infix[op_stack[stack_top - 1]].type != TOKEN_LPAREN) {
+                postfix[postfix_count++] = infix[op_stack[--stack_top]];
+            }
+            if (stack_top > 0) {
+                stack_top--; // pop LPAREN
+            }
+            break;
+
+        default: {
+            // Binary operators (CONCAT, ALT)
+            int p_curr = (type == TOKEN_ALT) ? 1 : 2;
+            while (stack_top > 0) {
+                RegexTokenType top_type = infix[op_stack[stack_top - 1]].type;
+                if (top_type != TOKEN_CONCAT && top_type != TOKEN_ALT) {
+                    break; // a '(' shields everything beneath it
+                }
+                int p_top = (top_type == TOKEN_ALT) ? 1 : 2;
+                if (p_top < p_curr) {
+                    break;
+                }
+                postfix[postfix_count++] = infix[op_stack[--stack_top]];
+            }
+            op_stack[stack_top++] = i;
+            break;
+        }
+        }
+    }
+
+    // Flush whatever is still pending, innermost first
+    while (stack_top > 0) {
+        postfix[postfix_count++] = infix[op_stack[--stack_top]];
+    }
+
+    free(op_stack);
+    *postfix_count_out = postfix_count;
+    return postfix;
+}
+
+// Build NFA from postfix tokens using Thompson's construction.
+//
+// Operand tokens (TOKEN_CHAR / TOKEN_CHAR_SET) become a two-state fragment
+// whose start state consumes one byte from the token's set. Operators pop
+// their operand fragment(s) and push the combined fragment:
+//   CONCAT    f1 followed by f2
+//   ALT       either f1 or f2
+//   STAR      zero or more repetitions of f1
+//   PLUS      one or more repetitions of f1
+//   QUESTION  zero or one occurrence of f1
+//
+// Malformed input is tolerated instead of corrupting memory: an operator with
+// too few operands on the stack is ignored, as are stray parenthesis tokens.
+// If no fragment was produced at all (for example an empty pattern), the
+// result is a fragment that matches the empty string.
+//
+// Returns the fragment at the bottom of the stack, which for well-formed
+// postfix is the only one. All states are created with create_nfa_state() and
+// are therefore owned by the global registry. An allocation failure is
+// reported on stderr and the process is aborted.
+static NFAFragment build_nfa(RegexToken* postfix, int postfix_count) {
+    // Only operand tokens push, so the stack can never hold more than
+    // postfix_count fragments. A heap allocation of that size cannot overflow.
+    size_t slots = postfix_count > 0 ? (size_t)postfix_count : 1;
+    NFAFragment* stack = (NFAFragment*)malloc(slots * sizeof(NFAFragment));
+    if (stack == NULL) {
+        fprintf(stderr, "slex: out of memory while building an NFA\n");
+        abort();
+    }
+    int stack_top = 0;
+
+    for (int i = 0; i < postfix_count; i++) {
+        const RegexToken* t = &postfix[i];
+        RegexTokenType type = t->type;
+
+        if (type == TOKEN_CHAR || type == TOKEN_CHAR_SET) {
+            NFAState* start = create_nfa_state();
+            NFAState* accept = create_nfa_state();
+            start->is_epsilon = false;
+            memcpy(start->char_set, t->char_set, sizeof(t->char_set));
+            start->edge1 = accept;
+
+            NFAFragment frag = {start, accept};
+            stack[stack_top++] = frag;
+            continue;
+        }
+
+        // How many fragments this operator consumes; 0 means "not an operator"
+        int needed = 0;
+        if (type == TOKEN_CONCAT || type == TOKEN_ALT) {
+            needed = 2;
+        } else if (type == TOKEN_STAR || type == TOKEN_PLUS || type == TOKEN_QUESTION) {
+            needed = 1;
+        }
+        if (needed == 0 || stack_top < needed) {
+            continue; // stray parenthesis, or an operator missing its operand(s)
+        }
+
+        if (type == TOKEN_CONCAT) {
+            NFAFragment f2 = stack[--stack_top];
+            NFAFragment f1 = stack[--stack_top];
+
+            // f1's accept state becomes an epsilon hop into f2
+            f1.accept->is_epsilon = true;
+            f1.accept->edge1 = f2.start;
+
+            NFAFragment frag = {f1.start, f2.accept};
+            stack[stack_top++] = frag;
+        } else if (type == TOKEN_ALT) {
+            NFAFragment f2 = stack[--stack_top];
+            NFAFragment f1 = stack[--stack_top];
+
+            NFAState* start = create_nfa_state();
+            NFAState* accept = create_nfa_state();
+
+            start->is_epsilon = true;
+            start->edge1 = f1.start;
+            start->edge2 = f2.start;
+
+            f1.accept->is_epsilon = true;
+            f1.accept->edge1 = accept;
+
+            f2.accept->is_epsilon = true;
+            f2.accept->edge1 = accept;
+
+            NFAFragment frag = {start, accept};
+            stack[stack_top++] = frag;
+        } else {
+            // STAR, PLUS, QUESTION: wrap f1 in a new start/accept pair
+            NFAFragment f1 = stack[--stack_top];
+
+            NFAState* start = create_nfa_state();
+            NFAState* accept = create_nfa_state();
+
+            start->is_epsilon = true;
+            start->edge1 = f1.start;
+            if (type != TOKEN_PLUS) {
+                start->edge2 = accept; // STAR and QUESTION may skip f1 entirely
+            }
+
+            f1.accept->is_epsilon = true;
+            if (type == TOKEN_QUESTION) {
+                f1.accept->edge1 = accept; // no loop back: at most one occurrence
+            } else {
+                f1.accept->edge1 = f1.start; // loop back for another repetition
+                f1.accept->edge2 = accept;
+            }
+
+            NFAFragment frag = {start, accept};
+            stack[stack_top++] = frag;
+        }
+    }
+
+    if (stack_top == 0) {
+        // Nothing was built: return a fragment that accepts the empty string
+        // so the caller always receives valid start/accept pointers.
+        NFAState* start = create_nfa_state();
+        NFAState* accept = create_nfa_state();
+        start->is_epsilon = true;
+        start->edge1 = accept;
+
+        NFAFragment empty = {start, accept};
+        stack[stack_top++] = empty;
+    }
+
+    NFAFragment result = stack[0];
+    free(stack);
+    return result;
+}
+
+// Computes epsilon closure of a set of NFA states.
+//
+// input_states / input_count   ids of the seed states; duplicates and ids
+//                              outside [0, total_nfa_states) are ignored
+// all_nfa_states               table mapping a state id to its NFAState
+// total_nfa_states             number of entries in all_nfa_states
+// output_states / output_count receive a newly allocated array holding the
+//                              ids of every state reachable from the seeds
+//                              through epsilon edges only (seeds included),
+//                              in ascending id order, and its length
+//
+// The caller owns *output_states and releases it with free(). An allocation
+// failure is reported on stderr and the process is aborted.
+static void get_epsilon_closure(int* input_states, int input_count, NFAState** all_nfa_states, int total_nfa_states, int** output_states, int* output_count) {
+    size_t slots = total_nfa_states > 0 ? (size_t)total_nfa_states : 1;
+    bool* visited = (bool*)calloc(slots, sizeof(bool));
+    int* queue = (int*)malloc(slots * sizeof(int));
+    if (visited == NULL || queue == NULL) {
+        fprintf(stderr, "slex: out of memory while computing an epsilon closure\n");
+        abort();
+    }
+    int head = 0, tail = 0;
+
+    // Seed the work queue. Each state is enqueued at most once, which is what
+    // keeps `tail` within the total_nfa_states slots of `queue`.
+    for (int i = 0; i < input_count; i++) {
+        int id = input_states[i];
+        if (id < 0 || id >= total_nfa_states || visited[id]) {
+            continue;
+        }
+        visited[id] = true;
+        queue[tail++] = id;
+    }
+
+    // Breadth-first walk along epsilon edges only
+    while (head < tail) {
+        NFAState* s = all_nfa_states[queue[head++]];
+        if (!s->is_epsilon) {
+            continue; // character edges are not part of the closure
+        }
+        if (s->edge1 && !visited[s->edge1->id]) {
+            visited[s->edge1->id] = true;
+            queue[tail++] = s->edge1->id;
+        }
+        if (s->edge2 && !visited[s->edge2->id]) {
+            visited[s->edge2->id] = true;
+            queue[tail++] = s->edge2->id;
+        }
+    }
+
+    // Every visited state was enqueued exactly once, so `tail` is the size of
+    // the closure. Request at least one slot so an empty closure does not
+    // become malloc(0).
+    int count = tail;
+    int* res = (int*)malloc((count > 0 ? (size_t)count : 1) * sizeof(int));
+    if (res == NULL) {
+        fprintf(stderr, "slex: out of memory while computing an epsilon closure\n");
+        abort();
+    }
+
+    // Read the ids back from the visited table so the result is sorted
+    int idx = 0;
+    for (int i = 0; i < total_nfa_states; i++) {
+        if (visited[i]) {
+            res[idx++] = i;
+        }
+    }
+
+    *output_states = res;
+    *output_count = count;
+    free(visited);
+    free(queue);
+}
+
+// Reports whether two arrays of NFA state ids have the same length and hold
+// the same ids at every position. The comparison is positional, so both
+// arrays must list their ids in the same order to compare equal.
+static bool are_nfa_sets_equal(int* a, int a_count, int* b, int b_count) {
+    bool same = (a_count == b_count);
+    for (int pos = 0; same && pos < a_count; pos++) {
+        same = (a[pos] == b[pos]);
+    }
+    return same;
+}
+
+// Releases everything a partially finished compilation owns (DFA states built
+// so far, scratch buffers, the fragment table and all NFA states) and returns
+// NULL so a failure site can simply `return slex_compile_abandon(...)`.
+static DFAState* slex_compile_abandon(DFAState* dfa_states, int dfa_count, int* work_queue, int* reachable, NFAFragment* fragments) {
+    for (int d = 0; d < dfa_count; d++) {
+        free(dfa_states[d].nfa_states);
+    }
+    free(dfa_states);
+    free(work_queue);
+    free(reachable);
+    free(fragments);
+    free_all_nfa_states();
+    return NULL;
+}
+
+// Compiles a set of regular expression patterns into a complete DFA using subset construction.
+//
+// patterns             array of pattern_count NUL-terminated regex strings;
+//                      the index of a pattern is its rule index
+// pattern_count        number of patterns (0 yields a one-state DFA that
+//                      accepts nothing)
+// dfa_state_count_out  receives the number of DFA states produced
+//
+// Returns a heap-allocated array of DFA states; state 0 is the start state.
+// transitions[c] is the id of the next state on byte c, or -1 if there is
+// none. accept_rule_index is the lowest rule index among the patterns that
+// accept in that state, or -1 if the state is not accepting. The caller owns
+// the array and releases it with slex_free_dfa().
+//
+// Returns NULL, with *dfa_state_count_out set to 0, if an argument is invalid
+// (NULL output pointer, negative count, NULL pattern) or if an allocation
+// made directly by this function fails.
+//
+// Uses the file-level NFA state registry, which is cleared on entry and again
+// before returning.
+DFAState* slex_compile_regexes(char** patterns, int pattern_count, int* dfa_state_count_out) {
+    if (dfa_state_count_out == NULL) {
+        return NULL;
+    }
+    *dfa_state_count_out = 0;
+    if (pattern_count < 0 || (pattern_count > 0 && patterns == NULL)) {
+        return NULL;
+    }
+
+    free_all_nfa_states(); // Reset global state tracker
+
+    // 1. Build NFA for each pattern
+    size_t fragment_slots = pattern_count > 0 ? (size_t)pattern_count : 1;
+    NFAFragment* fragments = (NFAFragment*)malloc(fragment_slots * sizeof(NFAFragment));
+    if (fragments == NULL) {
+        return slex_compile_abandon(NULL, 0, NULL, NULL, NULL);
+    }
+    for (int i = 0; i < pattern_count; i++) {
+        if (patterns[i] == NULL) {
+            return slex_compile_abandon(NULL, 0, NULL, NULL, fragments);
+        }
+
+        int t_count = 0, concat_count = 0, post_count = 0;
+        RegexToken* tokens = tokenize_regex(patterns[i], &t_count);
+        RegexToken* tokens_concat = insert_concat(tokens, t_count, &concat_count);
+        RegexToken* tokens_postfix = infix_to_postfix(tokens_concat, concat_count, &post_count);
+
+        fragments[i] = build_nfa(tokens_postfix, post_count);
+        fragments[i].accept->accept_rule_index = i;
+
+        free(tokens);
+        free(tokens_concat);
+        free(tokens_postfix);
+    }
+
+    // 2. Create global start state with epsilon transitions to each pattern NFA's start state.
+    //    A state has only two outgoing edges, so the fan-out is a chain of
+    //    epsilon "hub" states: edge1 enters a pattern, edge2 leads to the next hub.
+    NFAState* global_start = create_nfa_state();
+    global_start->is_epsilon = true;
+
+    NFAState* current_hub = global_start;
+    for (int i = 0; i < pattern_count; i++) {
+        current_hub->edge1 = fragments[i].start;
+        if (i < pattern_count - 1) {
+            NFAState* next_hub = create_nfa_state();
+            next_hub->is_epsilon = true;
+            current_hub->edge2 = next_hub;
+            current_hub = next_hub;
+        }
+    }
+    free(fragments);
+    fragments = NULL;
+
+    // 3. Subset construction
+    int total_nfa_states = g_nfa_state_count;
+    NFAState** all_nfa_states = g_nfa_states;
+
+    int dfa_capacity = 1024;
+    int dfa_count = 0;
+    DFAState* dfa_states = (DFAState*)malloc((size_t)dfa_capacity * sizeof(DFAState));
+
+    // Queue of DFA state ids whose outgoing transitions are still to be computed
+    int* work_queue = (int*)malloc((size_t)dfa_capacity * sizeof(int));
+    int queue_head = 0, queue_tail = 0;
+
+    // Scratch list of NFA states reachable on one byte. It is allocated once
+    // and reused for every (state, byte) pair. total_nfa_states is at least 1
+    // because the global start state exists.
+    int* reachable = (int*)malloc((size_t)total_nfa_states * sizeof(int));
+
+    if (dfa_states == NULL || work_queue == NULL || reachable == NULL) {
+        return slex_compile_abandon(dfa_states, 0, work_queue, reachable, NULL);
+    }
+
+    // Start state epsilon closure
+    int start_nfa_id = global_start->id;
+    int* start_closure = NULL;
+    int start_closure_count = 0;
+    get_epsilon_closure(&start_nfa_id, 1, all_nfa_states, total_nfa_states, &start_closure, &start_closure_count);
+
+    // Create start DFA state (0)
+    dfa_states[dfa_count].id = dfa_count;
+    dfa_states[dfa_count].nfa_states = start_closure;
+    dfa_states[dfa_count].nfa_state_count = start_closure_count;
+    memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
+    dfa_states[dfa_count].accept_rule_index = -1;
+
+    work_queue[queue_tail++] = dfa_count;
+    dfa_count++;
+
+    // Process queue
+    while (queue_head < queue_tail) {
+        int curr_dfa_id = work_queue[queue_head++];
+
+        // For each possible byte value
+        for (int c = 0; c < 256; c++) {
+            // Find NFA states reachable on character 'c'
+            int reachable_count = 0;
+            const DFAState* curr_dfa = &dfa_states[curr_dfa_id];
+            for (int i = 0; i < curr_dfa->nfa_state_count; i++) {
+                NFAState* nfa_s = all_nfa_states[curr_dfa->nfa_states[i]];
+                if (!nfa_s->is_epsilon && nfa_s->char_set[c] && nfa_s->edge1) {
+                    reachable[reachable_count++] = nfa_s->edge1->id;
+                }
+            }
+
+            if (reachable_count == 0) {
+                dfa_states[curr_dfa_id].transitions[c] = -1;
+                continue;
+            }
+
+            // Compute epsilon closure of reachable NFA states
+            int* closure = NULL;
+            int closure_count = 0;
+            get_epsilon_closure(reachable, reachable_count, all_nfa_states, total_nfa_states, &closure, &closure_count);
+
+            // Check if this DFA state already exists
+            int existing_id = -1;
+            for (int d = 0; d < dfa_count; d++) {
+                if (are_nfa_sets_equal(dfa_states[d].nfa_states, dfa_states[d].nfa_state_count, closure, closure_count)) {
+                    existing_id = d;
+                    break;
+                }
+            }
+
+            if (existing_id != -1) {
+                dfa_states[curr_dfa_id].transitions[c] = existing_id;
+                free(closure);
+                continue;
+            }
+
+            if (dfa_count >= dfa_capacity) {
+                // Grow both arrays together. Each realloc result goes into a
+                // temporary so the old block stays owned if the call fails.
+                int new_capacity = dfa_capacity * 2;
+                DFAState* grown_states = (DFAState*)realloc(dfa_states, (size_t)new_capacity * sizeof(DFAState));
+                if (grown_states != NULL) {
+                    dfa_states = grown_states;
+                }
+                int* grown_queue = NULL;
+                if (grown_states != NULL) {
+                    grown_queue = (int*)realloc(work_queue, (size_t)new_capacity * sizeof(int));
+                    if (grown_queue != NULL) {
+                        work_queue = grown_queue;
+                    }
+                }
+                if (grown_states == NULL || grown_queue == NULL) {
+                    free(closure);
+                    return slex_compile_abandon(dfa_states, dfa_count, work_queue, reachable, NULL);
+                }
+                dfa_capacity = new_capacity;
+            }
+
+            // New DFA state; it takes ownership of `closure`
+            dfa_states[dfa_count].id = dfa_count;
+            dfa_states[dfa_count].nfa_states = closure;
+            dfa_states[dfa_count].nfa_state_count = closure_count;
+            memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
+            dfa_states[dfa_count].accept_rule_index = -1;
+
+            dfa_states[curr_dfa_id].transitions[c] = dfa_count;
+            work_queue[queue_tail++] = dfa_count;
+            dfa_count++;
+        }
+    }
+
+    // Determine accepting status of each DFA state based on NFA accept states.
+    // When several rules accept in the same state, the lowest rule index wins.
+    for (int d = 0; d < dfa_count; d++) {
+        int best_rule = -1;
+        for (int i = 0; i < dfa_states[d].nfa_state_count; i++) {
+            int rule = all_nfa_states[dfa_states[d].nfa_states[i]]->accept_rule_index;
+            if (rule != -1 && (best_rule == -1 || rule < best_rule)) {
+                best_rule = rule;
+            }
+        }
+        dfa_states[d].accept_rule_index = best_rule;
+    }
+
+    free(work_queue);
+    free(reachable);
+    free_all_nfa_states(); // We no longer need the NFA states
+
+    *dfa_state_count_out = dfa_count;
+    return dfa_states;
+}
+
+// Releases a DFA state array: first the nfa_states list owned by each of the
+// dfa_state_count entries, then the array itself. Passing NULL is a no-op.
+void slex_free_dfa(DFAState* dfa_states, int dfa_state_count) {
+    if (!dfa_states) {
+        return;
+    }
+
+    int i = 0;
+    while (i < dfa_state_count) {
+        free(dfa_states[i].nfa_states);
+        i++;
+    }
+    free(dfa_states);
+}