slex/Source/slex_regex.c

#include "../Headers/slex_regex.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Token representation for Regex Parsing
typedef enum {
    TOKEN_CHAR,
    TOKEN_CHAR_SET,
    TOKEN_CONCAT,
    TOKEN_ALT,
    TOKEN_STAR,
    TOKEN_PLUS,
    TOKEN_QUESTION,
    TOKEN_LPAREN,
    TOKEN_RPAREN,
} RegexTokenType;

typedef struct {
    RegexTokenType type;
    bool char_set[256];
} RegexToken;

// Global array to track all allocated NFA states for easy deallocation
static NFAState** g_nfa_states = NULL;
static int g_nfa_state_count = 0;
static int g_nfa_state_capacity = 0;

static NFAState* create_nfa_state() {
    NFAState* s = (NFAState*)malloc(sizeof(NFAState));
    s->id = g_nfa_state_count;
    s->is_epsilon = false;
    memset(s->char_set, 0, sizeof(s->char_set));
    s->edge1 = NULL;
    s->edge2 = NULL;
    s->accept_rule_index = -1;

    // Track state globally
    if (g_nfa_state_count >= g_nfa_state_capacity) {
        g_nfa_state_capacity = g_nfa_state_capacity == 0 ? 1024 : g_nfa_state_capacity * 2;
        g_nfa_states = (NFAState**)realloc(g_nfa_states, g_nfa_state_capacity * sizeof(NFAState*));
    }
    g_nfa_states[g_nfa_state_count++] = s;
    return s;
}

static void free_all_nfa_states() {
    for (int i = 0; i < g_nfa_state_count; i++) {
        free(g_nfa_states[i]);
    }
    free(g_nfa_states);
    g_nfa_states = NULL;
    g_nfa_state_count = 0;
    g_nfa_state_capacity = 0;
}

// Tokenize a regex pattern
static RegexToken* tokenize_regex(const char* pattern, int* token_count_out) {
    int capacity = 128;
    int count = 0;
    RegexToken* tokens = (RegexToken*)malloc(capacity * sizeof(RegexToken));
    int len = (int)strlen(pattern);
    int idx = 0;

    while (idx < len) {
        if (count >= capacity) {
            capacity *= 2;
            tokens = (RegexToken*)realloc(tokens, capacity * sizeof(RegexToken));
        }

        char c = pattern[idx];

        if (c == '\\') {
            idx++;
            if (idx >= len) {
                // Trailing backslash, treat as literal backslash
                tokens[count].type = TOKEN_CHAR;
                memset(tokens[count].char_set, 0, 256);
                tokens[count].char_set[(unsigned char)'\\'] = true;
                count++;
                break;
            }
            char esc = pattern[idx++];
            tokens[count].type = TOKEN_CHAR_SET;
            memset(tokens[count].char_set, 0, 256);

            if (esc == 'p' && idx < len && pattern[idx] == '{') {
                idx++; // skip '{'
                char prop[256];
                int p_idx = 0;
                while (idx < len && pattern[idx] != '}') {
                    prop[p_idx++] = pattern[idx++];
                }
                prop[p_idx] = '\0';
                if (idx < len && pattern[idx] == '}') {
                    idx++; // skip '}'
                }

                if (strcmp(prop, "P") == 0) {
                    const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
                    for (int k = 0; punct[k] != '\0'; k++) {
                        tokens[count].char_set[(unsigned char)punct[k]] = true;
                    }
                } else if (strcmp(prop, "S") == 0) {
                    const char* sym = "$+<=>^`|~";
                    for (int k = 0; sym[k] != '\0'; k++) {
                        tokens[count].char_set[(unsigned char)sym[k]] = true;
                    }
                } else if (strcmp(prop, "L") == 0) {
                    for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
                    for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
                } else if (strcmp(prop, "N") == 0) {
                    for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
                }
            } else if (esc == 'n') {
                tokens[count].char_set[10] = true; // LF
            } else if (esc == 't') {
                tokens[count].char_set[9] = true;  // TAB
            } else if (esc == 'r') {
                tokens[count].char_set[13] = true; // CR
            } else if (esc == 's') {
                tokens[count].char_set[32] = true; // Space
                tokens[count].char_set[9] = true;  // TAB
                tokens[count].char_set[13] = true; // CR
                tokens[count].char_set[10] = true; // LF
            } else if (esc == 'd') {
                for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
            } else if (esc == 'w') {
                for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
                for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
                for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
                tokens[count].char_set[(unsigned char)'_'] = true;
            } else {
                // Literal escaped character
                tokens[count].type = TOKEN_CHAR;
                tokens[count].char_set[(unsigned char)esc] = true;
            }
            count++;
        } else if (c == '[') {
            idx++;
            bool negate = false;
            if (idx < len && pattern[idx] == '^') {
                negate = true;
                idx++;
            }

            tokens[count].type = TOKEN_CHAR_SET;
            memset(tokens[count].char_set, 0, 256);

            while (idx < len && pattern[idx] != ']') {
                char c1 = pattern[idx++];
                if (c1 == '\\' && idx < len) {
                    char esc = pattern[idx++];
                    if (esc == 'p' && idx < len && pattern[idx] == '{') {
                        idx++; // skip '{'
                        char prop[256];
                        int p_idx = 0;
                        while (idx < len && pattern[idx] != '}') {
                            prop[p_idx++] = pattern[idx++];
                        }
                        prop[p_idx] = '\0';
                        if (idx < len && pattern[idx] == '}') {
                            idx++; // skip '}'
                        }

                        if (strcmp(prop, "P") == 0) {
                            const char* punct = "!\"#%&'()*,-./:;?@[\\]_{}";
                            for (int k = 0; punct[k] != '\0'; k++) {
                                tokens[count].char_set[(unsigned char)punct[k]] = true;
                            }
                        } else if (strcmp(prop, "S") == 0) {
                            const char* sym = "$+<=>^`|~";
                            for (int k = 0; sym[k] != '\0'; k++) {
                                tokens[count].char_set[(unsigned char)sym[k]] = true;
                            }
                        } else if (strcmp(prop, "L") == 0) {
                            for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
                            for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
                        } else if (strcmp(prop, "N") == 0) {
                            for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
                        }
                        continue;
                    } else if (esc == 'n') c1 = '\n';
                    else if (esc == 't') c1 = '\t';
                    else if (esc == 'r') c1 = '\r';
                    else if (esc == 's') {
                        tokens[count].char_set[32] = true;
                        tokens[count].char_set[9] = true;
                        tokens[count].char_set[13] = true;
                        tokens[count].char_set[10] = true;
                        continue;
                    } else if (esc == 'd') {
                        for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
                        continue;
                    } else if (esc == 'w') {
                        for (int d = '0'; d <= '9'; d++) tokens[count].char_set[d] = true;
                        for (int d = 'a'; d <= 'z'; d++) tokens[count].char_set[d] = true;
                        for (int d = 'A'; d <= 'Z'; d++) tokens[count].char_set[d] = true;
                        tokens[count].char_set[(unsigned char)'_'] = true;
                        continue;
                    } else {
                        c1 = esc;
                    }
                }

                // Check range: c1-c2
                if (idx + 1 < len && pattern[idx] == '-' && pattern[idx + 1] != ']') {
                    idx++; // skip '-'
                    char c2 = pattern[idx++];
                    if (c2 == '\\' && idx < len) {
                        char esc = pattern[idx++];
                        if (esc == 'n') c2 = '\n';
                        else if (esc == 't') c2 = '\t';
                        else if (esc == 'r') c2 = '\r';
                        else c2 = esc;
                    }
                    for (int r = (unsigned char)c1; r <= (unsigned char)c2; r++) {
                        tokens[count].char_set[r] = true;
                    }
                } else {
                    tokens[count].char_set[(unsigned char)c1] = true;
                }
            }

            if (idx < len && pattern[idx] == ']') {
                idx++;
            }

            if (negate) {
                for (int i = 0; i < 256; i++) {
                    tokens[count].char_set[i] = !tokens[count].char_set[i];
                }
            }
            count++;
        } else if (c == '.') {
            tokens[count].type = TOKEN_CHAR_SET;
            memset(tokens[count].char_set, 0, 256);
            for (int i = 0; i < 256; i++) {
                if (i != 10) { // any character except newline
                    tokens[count].char_set[i] = true;
                }
            }
            count++;
            idx++;
        } else if (c == '*') {
            tokens[count].type = TOKEN_STAR;
            count++;
            idx++;
        } else if (c == '+') {
            tokens[count].type = TOKEN_PLUS;
            count++;
            idx++;
        } else if (c == '?') {
            tokens[count].type = TOKEN_QUESTION;
            count++;
            idx++;
        } else if (c == '|') {
            tokens[count].type = TOKEN_ALT;
            count++;
            idx++;
        } else if (c == '(') {
            tokens[count].type = TOKEN_LPAREN;
            count++;
            idx++;
        } else if (c == ')') {
            tokens[count].type = TOKEN_RPAREN;
            count++;
            idx++;
        } else {
            tokens[count].type = TOKEN_CHAR;
            memset(tokens[count].char_set, 0, 256);
            tokens[count].char_set[(unsigned char)c] = true;
            count++;
            idx++;
        }
    }

    *token_count_out = count;
    return tokens;
}

// Insert explicit concatenation operators
static RegexToken* insert_concat(RegexToken* input, int input_count, int* output_count_out) {
    int capacity = input_count * 2;
    int count = 0;
    RegexToken* output = (RegexToken*)malloc(capacity * sizeof(RegexToken));

    for (int i = 0; i < input_count; i++) {
        if (count >= capacity) {
            capacity *= 2;
            output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
        }

        output[count++] = input[i];

        if (i + 1 < input_count) {
            RegexTokenType t1 = input[i].type;
            RegexTokenType t2 = input[i + 1].type;

            bool t1_can_concat = (t1 == TOKEN_CHAR || t1 == TOKEN_CHAR_SET || t1 == TOKEN_STAR || t1 == TOKEN_PLUS || t1 == TOKEN_QUESTION || t1 == TOKEN_RPAREN);
            bool t2_can_concat = (t2 == TOKEN_CHAR || t2 == TOKEN_CHAR_SET || t2 == TOKEN_LPAREN);

            if (t1_can_concat && t2_can_concat) {
                if (count >= capacity) {
                    capacity *= 2;
                    output = (RegexToken*)realloc(output, capacity * sizeof(RegexToken));
                }
                output[count].type = TOKEN_CONCAT;
                memset(output[count].char_set, 0, 256);
                count++;
            }
        }
    }

    *output_count_out = count;
    return output;
}

// Shunting-yard algorithm to convert infix tokens to postfix tokens
static RegexToken* infix_to_postfix(RegexToken* infix, int infix_count, int* postfix_count_out) {
    int capacity = infix_count;
    int postfix_count = 0;
    RegexToken* postfix = (RegexToken*)malloc(capacity * sizeof(RegexToken));

    RegexToken stack[512];
    int stack_top = 0;

    for (int i = 0; i < infix_count; i++) {
        RegexToken t = infix[i];

        if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
            if (postfix_count >= capacity) {
                capacity *= 2;
                postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
            }
            postfix[postfix_count++] = t;
        } else if (t.type == TOKEN_LPAREN) {
            stack[stack_top++] = t;
        } else if (t.type == TOKEN_RPAREN) {
            while (stack_top > 0 && stack[stack_top - 1].type != TOKEN_LPAREN) {
                if (postfix_count >= capacity) {
                    capacity *= 2;
                    postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
                }
                postfix[postfix_count++] = stack[--stack_top];
            }
            if (stack_top > 0) {
                stack_top--; // pop LPAREN
            }
        } else if (t.type == TOKEN_STAR || t.type == TOKEN_PLUS || t.type == TOKEN_QUESTION) {
            // Unary operators have highest precedence and are postfix, output immediately
            if (postfix_count >= capacity) {
                capacity *= 2;
                postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
            }
            postfix[postfix_count++] = t;
        } else {
            // Binary operators (CONCAT, ALT)
            int p_curr = (t.type == TOKEN_ALT) ? 1 : 2;
            while (stack_top > 0) {
                RegexTokenType top_type = stack[stack_top - 1].type;
                if (top_type == TOKEN_CONCAT || top_type == TOKEN_ALT) {
                    int p_top = (top_type == TOKEN_ALT) ? 1 : 2;
                    if (p_top >= p_curr) {
                        if (postfix_count >= capacity) {
                            capacity *= 2;
                            postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
                        }
                        postfix[postfix_count++] = stack[--stack_top];
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
            stack[stack_top++] = t;
        }
    }

    while (stack_top > 0) {
        if (postfix_count >= capacity) {
            capacity *= 2;
            postfix = (RegexToken*)realloc(postfix, capacity * sizeof(RegexToken));
        }
        postfix[postfix_count++] = stack[--stack_top];
    }

    *postfix_count_out = postfix_count;
    return postfix;
}

// Build NFA from postfix tokens using Thompson's construction
static NFAFragment build_nfa(RegexToken* postfix, int postfix_count) {
    NFAFragment stack[512];
    int stack_top = 0;

    for (int i = 0; i < postfix_count; i++) {
        RegexToken t = postfix[i];

        if (t.type == TOKEN_CHAR || t.type == TOKEN_CHAR_SET) {
            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();
            start->is_epsilon = false;
            memcpy(start->char_set, t.char_set, 256);
            start->edge1 = accept;

            NFAFragment frag = {start, accept};
            stack[stack_top++] = frag;
        } else if (t.type == TOKEN_CONCAT) {
            NFAFragment f2 = stack[--stack_top];
            NFAFragment f1 = stack[--stack_top];

            f1.accept->is_epsilon = true;
            f1.accept->edge1 = f2.start;

            NFAFragment frag = {f1.start, f2.accept};
            stack[stack_top++] = frag;
        } else if (t.type == TOKEN_ALT) {
            NFAFragment f2 = stack[--stack_top];
            NFAFragment f1 = stack[--stack_top];

            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();

            start->is_epsilon = true;
            start->edge1 = f1.start;
            start->edge2 = f2.start;

            f1.accept->is_epsilon = true;
            f1.accept->edge1 = accept;

            f2.accept->is_epsilon = true;
            f2.accept->edge1 = accept;

            NFAFragment frag = {start, accept};
            stack[stack_top++] = frag;
        } else if (t.type == TOKEN_STAR) {
            NFAFragment f1 = stack[--stack_top];

            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();

            start->is_epsilon = true;
            start->edge1 = f1.start;
            start->edge2 = accept;

            f1.accept->is_epsilon = true;
            f1.accept->edge1 = f1.start;
            f1.accept->edge2 = accept;

            NFAFragment frag = {start, accept};
            stack[stack_top++] = frag;
        } else if (t.type == TOKEN_PLUS) {
            NFAFragment f1 = stack[--stack_top];

            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();

            start->is_epsilon = true;
            start->edge1 = f1.start;

            f1.accept->is_epsilon = true;
            f1.accept->edge1 = f1.start;
            f1.accept->edge2 = accept;

            NFAFragment frag = {start, accept};
            stack[stack_top++] = frag;
        } else if (t.type == TOKEN_QUESTION) {
            NFAFragment f1 = stack[--stack_top];

            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();

            start->is_epsilon = true;
            start->edge1 = f1.start;
            start->edge2 = accept;

            f1.accept->is_epsilon = true;
            f1.accept->edge1 = accept;

            NFAFragment frag = {start, accept};
            stack[stack_top++] = frag;
        }
    }

    return stack[0];
}

// Computes epsilon closure of a set of NFA states
static void get_epsilon_closure(int* input_states, int input_count, NFAState** all_nfa_states, int total_nfa_states, int** output_states, int* output_count) {
    bool* visited = (bool*)calloc(total_nfa_states, sizeof(bool));
    int* queue = (int*)malloc(total_nfa_states * sizeof(int));
    int head = 0, tail = 0;

    for (int i = 0; i < input_count; i++) {
        int id = input_states[i];
        visited[id] = true;
        queue[tail++] = id;
    }

    while (head < tail) {
        int curr_id = queue[head++];
        NFAState* s = all_nfa_states[curr_id];
        if (s->is_epsilon) {
            if (s->edge1 && !visited[s->edge1->id]) {
                visited[s->edge1->id] = true;
                queue[tail++] = s->edge1->id;
            }
            if (s->edge2 && !visited[s->edge2->id]) {
                visited[s->edge2->id] = true;
                queue[tail++] = s->edge2->id;
            }
        }
    }

    int count = 0;
    for (int i = 0; i < total_nfa_states; i++) {
        if (visited[i]) count++;
    }

    int* res = (int*)malloc(count * sizeof(int));
    int idx = 0;
    for (int i = 0; i < total_nfa_states; i++) {
        if (visited[i]) {
            res[idx++] = i;
        }
    }

    *output_states = res;
    *output_count = count;
    free(visited);
    free(queue);
}

// Compare two NFA state sets
static bool are_nfa_sets_equal(int* a, int a_count, int* b, int b_count) {
    if (a_count != b_count) return false;
    for (int i = 0; i < a_count; i++) {
        if (a[i] != b[i]) return false;
    }
    return true;
}

// Compiles a set of regular expression patterns into a complete DFA using subset construction
DFAState* slex_compile_regexes(char** patterns, int pattern_count, int* dfa_state_count_out) {
    free_all_nfa_states(); // Reset global state tracker

    // 1. Build NFA for each pattern
    NFAFragment* fragments = (NFAFragment*)malloc(pattern_count * sizeof(NFAFragment));
    for (int i = 0; i < pattern_count; i++) {
        int t_count = 0, concat_count = 0, post_count = 0;
        RegexToken* tokens = tokenize_regex(patterns[i], &t_count);
        RegexToken* tokens_concat = insert_concat(tokens, t_count, &concat_count);
        RegexToken* tokens_postfix = infix_to_postfix(tokens_concat, concat_count, &post_count);

        fragments[i] = build_nfa(tokens_postfix, post_count);
        fragments[i].accept->accept_rule_index = i;

        free(tokens);
        free(tokens_concat);
        free(tokens_postfix);
    }

    // 2. Create global start state with epsilon transitions to each pattern NFA's start state
    NFAState* global_start = create_nfa_state();
    global_start->is_epsilon = true;

    NFAState* current_hub = global_start;
    for (int i = 0; i < pattern_count; i++) {
        if (i == pattern_count - 1) {
            current_hub->edge1 = fragments[i].start;
        } else {
            NFAState* next_hub = create_nfa_state();
            next_hub->is_epsilon = true;
            current_hub->edge1 = fragments[i].start;
            current_hub->edge2 = next_hub;
            current_hub = next_hub;
        }
    }
    free(fragments);

    // 3. Subset construction
    int total_nfa_states = g_nfa_state_count;
    NFAState** all_nfa_states = g_nfa_states;

    int dfa_capacity = 1024;
    int dfa_count = 0;
    DFAState* dfa_states = (DFAState*)malloc(dfa_capacity * sizeof(DFAState));

    // Queue for subset construction
    int* work_queue = (int*)malloc(dfa_capacity * sizeof(int));
    int queue_head = 0, queue_tail = 0;

    // Start state epsilon closure
    int start_nfa_id = global_start->id;
    int* start_closure = NULL;
    int start_closure_count = 0;
    get_epsilon_closure(&start_nfa_id, 1, all_nfa_states, total_nfa_states, &start_closure, &start_closure_count);

    // Create start DFA state (0)
    dfa_states[dfa_count].id = dfa_count;
    dfa_states[dfa_count].nfa_states = start_closure;
    dfa_states[dfa_count].nfa_state_count = start_closure_count;
    memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
    dfa_states[dfa_count].accept_rule_index = -1;

    work_queue[queue_tail++] = dfa_count;
    dfa_count++;

    // Process queue
    while (queue_head < queue_tail) {
        int curr_dfa_id = work_queue[queue_head++];

        // For each possible ASCII character transition
        for (int c = 0; c < 256; c++) {
            // Find NFA states reachable on character 'c'
            int* reachable = (int*)malloc(total_nfa_states * sizeof(int));
            int reachable_count = 0;

            DFAState* curr_dfa = &dfa_states[curr_dfa_id];
            for (int i = 0; i < curr_dfa->nfa_state_count; i++) {
                NFAState* nfa_s = all_nfa_states[curr_dfa->nfa_states[i]];
                if (!nfa_s->is_epsilon && nfa_s->char_set[c]) {
                    if (nfa_s->edge1) {
                        reachable[reachable_count++] = nfa_s->edge1->id;
                    }
                }
            }

            if (reachable_count > 0) {
                // Compute epsilon closure of reachable NFA states
                int* closure = NULL;
                int closure_count = 0;
                get_epsilon_closure(reachable, reachable_count, all_nfa_states, total_nfa_states, &closure, &closure_count);
                free(reachable);

                // Check if this DFA state already exists
                int existing_id = -1;
                for (int d = 0; d < dfa_count; d++) {
                    if (are_nfa_sets_equal(dfa_states[d].nfa_states, dfa_states[d].nfa_state_count, closure, closure_count)) {
                        existing_id = d;
                        break;
                    }
                }

                if (existing_id != -1) {
                    dfa_states[curr_dfa_id].transitions[c] = existing_id;
                    free(closure);
                } else {
                    if (dfa_count >= dfa_capacity) {
                        dfa_capacity *= 2;
                        dfa_states = (DFAState*)realloc(dfa_states, dfa_capacity * sizeof(DFAState));
                        work_queue = (int*)realloc(work_queue, dfa_capacity * sizeof(int));
                    }

                    dfa_states[dfa_count].id = dfa_count;
                    dfa_states[dfa_count].nfa_states = closure;
                    dfa_states[dfa_count].nfa_state_count = closure_count;
                    memset(dfa_states[dfa_count].transitions, -1, sizeof(dfa_states[dfa_count].transitions));
                    dfa_states[dfa_count].accept_rule_index = -1;

                    dfa_states[curr_dfa_id].transitions[c] = dfa_count;
                    work_queue[queue_tail++] = dfa_count;
                    dfa_count++;
                }
            } else {
                free(reachable);
                dfa_states[curr_dfa_id].transitions[c] = -1;
            }
        }
    }

    // Determine accepting status of each DFA state based on NFA accept states
    for (int d = 0; d < dfa_count; d++) {
        int best_rule = -1;
        for (int i = 0; i < dfa_states[d].nfa_state_count; i++) {
            NFAState* nfa_s = all_nfa_states[dfa_states[d].nfa_states[i]];
            if (nfa_s->accept_rule_index != -1) {
                if (best_rule == -1 || nfa_s->accept_rule_index < best_rule) {
                    best_rule = nfa_s->accept_rule_index;
                }
            }
        }
        dfa_states[d].accept_rule_index = best_rule;
    }

    free(work_queue);
    free_all_nfa_states(); // We no longer need the NFA states

    *dfa_state_count_out = dfa_count;
    return dfa_states;
}

void slex_free_dfa(DFAState* dfa_states, int dfa_state_count) {
    if (dfa_states) {
        for (int i = 0; i < dfa_state_count; i++) {
            free(dfa_states[i].nfa_states);
        }
        free(dfa_states);
    }
}