#include "../Headers/slex_regex.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Regex compiler: pattern text -> tokens -> explicit concatenation ->
// postfix (shunting-yard) -> NFA (Thompson's construction) -> DFA (subset
// construction). Only slex_compile_regexes() and slex_free_dfa() are public.

enum {
    RE_CHARSET_SIZE = 256,           // one membership flag per byte value
    RE_TOKEN_INITIAL_CAPACITY = 128, // initial token buffer size
    RE_NFA_INITIAL_CAPACITY = 1024,  // initial size of the NFA state registry
    RE_DFA_INITIAL_CAPACITY = 1024,  // initial size of the DFA state table
    RE_PROP_NAME_MAX = 256           // buffer size for the name inside \p{...}
};

// Token representation for Regex Parsing
typedef enum {
    TOKEN_CHAR,
    TOKEN_CHAR_SET,
    TOKEN_CONCAT,
    TOKEN_ALT,
    TOKEN_STAR,
    TOKEN_PLUS,
    TOKEN_QUESTION,
    TOKEN_LPAREN,
    TOKEN_RPAREN,
} RegexTokenType;

typedef struct {
    RegexTokenType type;
    bool char_set[RE_CHARSET_SIZE]; // meaningful for TOKEN_CHAR / TOKEN_CHAR_SET only
} RegexToken;

// ---------------------------------------------------------------------------
// Allocation helpers
// ---------------------------------------------------------------------------

// Reports an allocation failure and terminates the process. Every allocation
// in this file goes through the helpers below, so no code path ever
// dereferences a NULL buffer.
static void re_out_of_memory(void) {
    fputs("slex_regex: out of memory\n", stderr);
    abort();
}

// Returns count * elem_size, aborting if the product would overflow size_t.
// A zero-byte request is rounded up to one byte so the result of the
// subsequent allocation is never a legitimately-NULL pointer.
static size_t re_array_bytes(size_t count, size_t elem_size) {
    if (elem_size != 0 && count > (size_t)-1 / elem_size) {
        re_out_of_memory();
    }
    size_t bytes = count * elem_size;
    return bytes != 0 ? bytes : 1;
}

// malloc() for an array of `count` elements; never returns NULL.
static void* re_alloc_array(size_t count, size_t elem_size) {
    void* p = malloc(re_array_bytes(count, elem_size));
    if (p == NULL) {
        re_out_of_memory();
    }
    return p;
}

// Zero-initialised variant of re_alloc_array(); never returns NULL.
static void* re_zero_array(size_t count, size_t elem_size) {
    void* p = calloc(1, re_array_bytes(count, elem_size));
    if (p == NULL) {
        re_out_of_memory();
    }
    return p;
}

// realloc() for an array of `count` elements; never returns NULL, so the
// caller may assign the result straight back to the original pointer.
static void* re_grow_array(void* ptr, size_t count, size_t elem_size) {
    void* p = realloc(ptr, re_array_bytes(count, elem_size));
    if (p == NULL) {
        re_out_of_memory();
    }
    return p;
}

// ---------------------------------------------------------------------------
// NFA state registry
// ---------------------------------------------------------------------------

// Global array to track all allocated NFA states for easy deallocation.
// A state's `id` is its index in this array.
static NFAState** g_nfa_states = NULL;
static int g_nfa_state_count = 0;
static int g_nfa_state_capacity = 0;

// Allocates a new NFA state with no edges, an empty character set and no
// accepting rule, and records it in the global registry.
static NFAState* create_nfa_state(void) {
    if (g_nfa_state_count >= g_nfa_state_capacity) {
        int new_capacity = g_nfa_state_capacity == 0 ? RE_NFA_INITIAL_CAPACITY
                                                     : g_nfa_state_capacity * 2;
        g_nfa_states = (NFAState**)re_grow_array(g_nfa_states, (size_t)new_capacity,
                                                 sizeof *g_nfa_states);
        g_nfa_state_capacity = new_capacity;
    }

    NFAState* s = (NFAState*)re_alloc_array(1, sizeof *s);
    s->id = g_nfa_state_count;
    s->is_epsilon = false;
    memset(s->char_set, 0, sizeof(s->char_set));
    s->edge1 = NULL;
    s->edge2 = NULL;
    s->accept_rule_index = -1;

    g_nfa_states[g_nfa_state_count++] = s;
    return s;
}

// Frees every registered NFA state and resets the registry to empty.
static void free_all_nfa_states(void) {
    for (int i = 0; i < g_nfa_state_count; i++) {
        free(g_nfa_states[i]);
    }
    free(g_nfa_states);
    g_nfa_states = NULL;
    g_nfa_state_count = 0;
    g_nfa_state_capacity = 0;
}

// ---------------------------------------------------------------------------
// Character-set helpers
// ---------------------------------------------------------------------------

// Adds every byte value in [lo, hi] to `set`. Nothing is added when lo > hi.
static void charset_add_range(bool* set, int lo, int hi) {
    for (int c = lo; c <= hi; c++) {
        set[c] = true;
    }
}

// Adds every character of the NUL-terminated string `chars` to `set`.
static void charset_add_string(bool* set, const char* chars) {
    for (int k = 0; chars[k] != '\0'; k++) {
        set[(unsigned char)chars[k]] = true;
    }
}

// Adds the members of the \p{...} property named `prop`. Supported names are
// "P" (punctuation), "S" (symbols), "L" (ASCII letters) and "N" (ASCII
// digits); any other name adds nothing.
static void charset_add_property(bool* set, const char* prop) {
    if (strcmp(prop, "P") == 0) {
        charset_add_string(set, "!\"#%&'()*,-./:;?@[\\]_{}");
    } else if (strcmp(prop, "S") == 0) {
        charset_add_string(set, "$+<=>^`|~");
    } else if (strcmp(prop, "L") == 0) {
        charset_add_range(set, 'a', 'z');
        charset_add_range(set, 'A', 'Z');
    } else if (strcmp(prop, "N") == 0) {
        charset_add_range(set, '0', '9');
    }
}

// Handles the multi-character class escapes \s, \d and \w.
// Returns true (and extends `set`) if `esc` is one of them, false otherwise.
static bool charset_add_class_escape(bool* set, char esc) {
    switch (esc) {
    case 's':
        set[(unsigned char)' '] = true;
        set[(unsigned char)'\t'] = true;
        set[(unsigned char)'\r'] = true;
        set[(unsigned char)'\n'] = true;
        return true;
    case 'd':
        charset_add_range(set, '0', '9');
        return true;
    case 'w':
        charset_add_range(set, '0', '9');
        charset_add_range(set, 'a', 'z');
        charset_add_range(set, 'A', 'Z');
        set[(unsigned char)'_'] = true;
        return true;
    default:
        return false;
    }
}

// Maps the single-character escapes \n, \t and \r to the character they
// denote. Returns true and stores it in *out on a match; otherwise returns
// false and leaves *out untouched.
static bool simple_escape_value(char esc, char* out) {
    switch (esc) {
    case 'n': *out = '\n'; return true;
    case 't': *out = '\t'; return true;
    case 'r': *out = '\r'; return true;
    default:  return false;
    }
}

// Reads the property name of a \p{...} escape. `idx` is the position just
// after '{'. The name is copied into `prop` (always NUL-terminated); names
// that do not fit are truncated, which can never make them equal to one of
// the one-letter supported names. Returns the index just past the closing
// '}', or `len` if the brace is never closed.
static int parse_property_name(const char* pattern, int len, int idx,
                               char* prop, size_t prop_size) {
    size_t n = 0;
    while (idx < len && pattern[idx] != '}') {
        if (n + 1 < prop_size) { // keep room for the terminator
            prop[n++] = pattern[idx];
        }
        idx++;
    }
    prop[n] = '\0';
    if (idx < len) {
        idx++; // skip '}'
    }
    return idx;
}

// ---------------------------------------------------------------------------
// Tokenizer
// ---------------------------------------------------------------------------

// Growable array of tokens.
typedef struct {
    RegexToken* items;
    int count;
    int capacity;
} TokenBuffer;

// Appends a zero-initialised token of the given type and returns a pointer
// to it. The pointer is only valid until the next push.
static RegexToken* token_buffer_push(TokenBuffer* buf, RegexTokenType type) {
    if (buf->count >= buf->capacity) {
        int new_capacity = buf->capacity == 0 ? RE_TOKEN_INITIAL_CAPACITY
                                              : buf->capacity * 2;
        buf->items = (RegexToken*)re_grow_array(buf->items, (size_t)new_capacity,
                                                sizeof *buf->items);
        buf->capacity = new_capacity;
    }
    RegexToken* tok = &buf->items[buf->count++];
    memset(tok, 0, sizeof *tok);
    tok->type = type;
    return tok;
}

// Tokenizes a backslash escape outside a bracket expression. `idx` is the
// position of the backslash. Returns the index of the next unread character.
static int tokenize_escape(const char* pattern, int len, int idx, TokenBuffer* buf) {
    idx++; // skip '\'
    if (idx >= len) {
        // Trailing backslash, treat as literal backslash
        RegexToken* lone = token_buffer_push(buf, TOKEN_CHAR);
        lone->char_set[(unsigned char)'\\'] = true;
        return idx;
    }

    char esc = pattern[idx++];
    RegexToken* tok = token_buffer_push(buf, TOKEN_CHAR_SET);
    char literal = '\0';

    if (esc == 'p' && idx < len && pattern[idx] == '{') {
        char prop[RE_PROP_NAME_MAX];
        idx = parse_property_name(pattern, len, idx + 1, prop, sizeof prop);
        charset_add_property(tok->char_set, prop);
    } else if (simple_escape_value(esc, &literal)) {
        tok->char_set[(unsigned char)literal] = true;
    } else if (!charset_add_class_escape(tok->char_set, esc)) {
        // Literal escaped character
        tok->type = TOKEN_CHAR;
        tok->char_set[(unsigned char)esc] = true;
    }
    return idx;
}

// Tokenizes a bracket expression "[...]" or "[^...]" into one TOKEN_CHAR_SET.
// `idx` is the position of '['. Supports escapes, \p{...}, \s \d \w and
// "a-z" style ranges. An unterminated expression runs to the end of the
// pattern. Returns the index of the next unread character.
static int tokenize_bracket(const char* pattern, int len, int idx, TokenBuffer* buf) {
    idx++; // skip '['
    bool negate = false;
    if (idx < len && pattern[idx] == '^') {
        negate = true;
        idx++;
    }

    RegexToken* tok = token_buffer_push(buf, TOKEN_CHAR_SET);
    bool* set = tok->char_set;

    while (idx < len && pattern[idx] != ']') {
        char lo = pattern[idx++];
        if (lo == '\\' && idx < len) {
            char esc = pattern[idx++];
            if (esc == 'p' && idx < len && pattern[idx] == '{') {
                char prop[RE_PROP_NAME_MAX];
                idx = parse_property_name(pattern, len, idx + 1, prop, sizeof prop);
                charset_add_property(set, prop);
                continue;
            }
            if (charset_add_class_escape(set, esc)) {
                continue;
            }
            if (!simple_escape_value(esc, &lo)) {
                lo = esc;
            }
        }

        // Check range: lo-hi. A '-' directly before ']' is a literal dash.
        if (idx + 1 < len && pattern[idx] == '-' && pattern[idx + 1] != ']') {
            idx++; // skip '-'
            char hi = pattern[idx++];
            if (hi == '\\' && idx < len) {
                char esc = pattern[idx++];
                if (!simple_escape_value(esc, &hi)) {
                    hi = esc;
                }
            }
            charset_add_range(set, (unsigned char)lo, (unsigned char)hi);
        } else {
            set[(unsigned char)lo] = true;
        }
    }

    if (idx < len && pattern[idx] == ']') {
        idx++;
    }
    if (negate) {
        for (int i = 0; i < RE_CHARSET_SIZE; i++) {
            set[i] = !set[i];
        }
    }
    return idx;
}

// Tokenize a regex pattern.
//
// pattern:         NUL-terminated regex text (must not be NULL).
// token_count_out: receives the number of tokens produced.
// Returns a heap-allocated token array (never NULL) that the caller frees.
static RegexToken* tokenize_regex(const char* pattern, int* token_count_out) {
    TokenBuffer buf;
    buf.capacity = RE_TOKEN_INITIAL_CAPACITY;
    buf.count = 0;
    buf.items = (RegexToken*)re_alloc_array((size_t)buf.capacity, sizeof *buf.items);

    int len = (int)strlen(pattern);
    int idx = 0;

    while (idx < len) {
        char c = pattern[idx];
        switch (c) {
        case '\\':
            idx = tokenize_escape(pattern, len, idx, &buf);
            break;
        case '[':
            idx = tokenize_bracket(pattern, len, idx, &buf);
            break;
        case '.': {
            // any character except newline
            RegexToken* any = token_buffer_push(&buf, TOKEN_CHAR_SET);
            charset_add_range(any->char_set, 0, RE_CHARSET_SIZE - 1);
            any->char_set[(unsigned char)'\n'] = false;
            idx++;
            break;
        }
        case '*': token_buffer_push(&buf, TOKEN_STAR);     idx++; break;
        case '+': token_buffer_push(&buf, TOKEN_PLUS);     idx++; break;
        case '?': token_buffer_push(&buf, TOKEN_QUESTION); idx++; break;
        case '|': token_buffer_push(&buf, TOKEN_ALT);      idx++; break;
        case '(': token_buffer_push(&buf, TOKEN_LPAREN);   idx++; break;
        case ')': token_buffer_push(&buf, TOKEN_RPAREN);   idx++; break;
        default: {
            RegexToken* lit = token_buffer_push(&buf, TOKEN_CHAR);
            lit->char_set[(unsigned char)c] = true;
            idx++;
            break;
        }
        }
    }

    *token_count_out = buf.count;
    return buf.items;
}

// ---------------------------------------------------------------------------
// Explicit concatenation + infix -> postfix
// ---------------------------------------------------------------------------

// True if a token of this type can be the left neighbour of an implicit
// concatenation (it ends an operand).
static bool token_ends_operand(RegexTokenType t) {
    return t == TOKEN_CHAR || t == TOKEN_CHAR_SET || t == TOKEN_STAR ||
           t == TOKEN_PLUS || t == TOKEN_QUESTION || t == TOKEN_RPAREN;
}

// True if a token of this type can be the right neighbour of an implicit
// concatenation (it starts an operand).
static bool token_starts_operand(RegexTokenType t) {
    return t == TOKEN_CHAR || t == TOKEN_CHAR_SET || t == TOKEN_LPAREN;
}

// Insert explicit concatenation operators.
//
// Copies `input` and inserts a TOKEN_CONCAT between every adjacent pair where
// the left token ends an operand and the right token starts one.
// Returns a heap-allocated array (never NULL) that the caller frees; its
// length is stored in *output_count_out.
static RegexToken* insert_concat(RegexToken* input, int input_count, int* output_count_out) {
    // At most one CONCAT per adjacent pair, so 2n - 1 tokens always fit in 2n.
    size_t capacity = input_count > 0 ? (size_t)input_count * 2 : 1;
    RegexToken* output = (RegexToken*)re_alloc_array(capacity, sizeof *output);
    int count = 0;

    for (int i = 0; i < input_count; i++) {
        output[count++] = input[i];
        if (i + 1 < input_count && token_ends_operand(input[i].type) &&
            token_starts_operand(input[i + 1].type)) {
            memset(&output[count], 0, sizeof output[count]);
            output[count].type = TOKEN_CONCAT;
            count++;
        }
    }

    *output_count_out = count;
    return output;
}

// Precedence of a binary operator: alternation binds looser than concatenation.
static int binary_precedence(RegexTokenType type) {
    return type == TOKEN_ALT ? 1 : 2;
}

// Appends an operator token (empty character set) to the postfix output.
static void emit_operator(RegexToken* postfix, int* postfix_count, RegexTokenType type) {
    RegexToken* tok = &postfix[(*postfix_count)++];
    memset(tok, 0, sizeof *tok);
    tok->type = type;
}

// Shunting-yard algorithm to convert infix tokens to postfix tokens.
//
// Parentheses never appear in the output. An unmatched ')' is ignored and an
// unmatched '(' is dropped when the operator stack is drained.
// Returns a heap-allocated array (never NULL) that the caller frees; its
// length is stored in *postfix_count_out.
static RegexToken* infix_to_postfix(RegexToken* infix, int infix_count, int* postfix_count_out) {
    // Each input token produces at most one output token and occupies at most
    // one operator-stack slot, so both buffers are bounded by infix_count.
    size_t slots = infix_count > 0 ? (size_t)infix_count : 1;
    RegexToken* postfix = (RegexToken*)re_alloc_array(slots, sizeof *postfix);
    RegexTokenType* ops = (RegexTokenType*)re_alloc_array(slots, sizeof *ops);
    int postfix_count = 0;
    int ops_top = 0;

    for (int i = 0; i < infix_count; i++) {
        RegexTokenType type = infix[i].type;
        switch (type) {
        case TOKEN_CHAR:
        case TOKEN_CHAR_SET:
            postfix[postfix_count++] = infix[i];
            break;
        case TOKEN_LPAREN:
            ops[ops_top++] = TOKEN_LPAREN;
            break;
        case TOKEN_RPAREN:
            while (ops_top > 0 && ops[ops_top - 1] != TOKEN_LPAREN) {
                emit_operator(postfix, &postfix_count, ops[--ops_top]);
            }
            if (ops_top > 0) {
                ops_top--; // pop LPAREN
            }
            break;
        case TOKEN_STAR:
        case TOKEN_PLUS:
        case TOKEN_QUESTION:
            // Unary operators have highest precedence and are postfix, output immediately
            emit_operator(postfix, &postfix_count, type);
            break;
        default:
            // Binary operators (CONCAT, ALT): pop everything that binds at
            // least as tightly, stopping at an open parenthesis.
            while (ops_top > 0 && ops[ops_top - 1] != TOKEN_LPAREN &&
                   binary_precedence(ops[ops_top - 1]) >= binary_precedence(type)) {
                emit_operator(postfix, &postfix_count, ops[--ops_top]);
            }
            ops[ops_top++] = type;
            break;
        }
    }

    while (ops_top > 0) {
        RegexTokenType type = ops[--ops_top];
        if (type != TOKEN_LPAREN) { // unmatched '(' contributes nothing
            emit_operator(postfix, &postfix_count, type);
        }
    }

    free(ops);
    *postfix_count_out = postfix_count;
    return postfix;
}

// ---------------------------------------------------------------------------
// Thompson's construction
// ---------------------------------------------------------------------------

// Build NFA from postfix tokens using Thompson's construction.
//
// Returns the fragment for the whole expression. If the postfix stream is
// malformed (an operator lacks operands, e.g. "*" or "a|") or produces no
// fragment at all (empty pattern, "()"), returns a fragment whose `start` and
// `accept` are both NULL. States created before the failure stay in the
// global registry and are released by free_all_nfa_states().
static NFAFragment build_nfa(RegexToken* postfix, int postfix_count) {
    NFAFragment invalid = {NULL, NULL};
    // The stack never holds more fragments than there are tokens.
    size_t slots = postfix_count > 0 ? (size_t)postfix_count : 1;
    NFAFragment* stack = (NFAFragment*)re_alloc_array(slots, sizeof *stack);
    int top = 0;
    bool ok = true;

    for (int i = 0; ok && i < postfix_count; i++) {
        const RegexToken* t = &postfix[i];
        switch (t->type) {
        case TOKEN_CHAR:
        case TOKEN_CHAR_SET: {
            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();
            memcpy(start->char_set, t->char_set, sizeof t->char_set);
            start->edge1 = accept;
            NFAFragment frag = {start, accept};
            stack[top++] = frag;
            break;
        }
        case TOKEN_CONCAT: {
            if (top < 2) {
                ok = false;
                break;
            }
            NFAFragment f2 = stack[--top];
            NFAFragment f1 = stack[--top];
            f1.accept->is_epsilon = true;
            f1.accept->edge1 = f2.start;
            NFAFragment frag = {f1.start, f2.accept};
            stack[top++] = frag;
            break;
        }
        case TOKEN_ALT: {
            if (top < 2) {
                ok = false;
                break;
            }
            NFAFragment f2 = stack[--top];
            NFAFragment f1 = stack[--top];
            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();
            start->is_epsilon = true;
            start->edge1 = f1.start;
            start->edge2 = f2.start;
            f1.accept->is_epsilon = true;
            f1.accept->edge1 = accept;
            f2.accept->is_epsilon = true;
            f2.accept->edge1 = accept;
            NFAFragment frag = {start, accept};
            stack[top++] = frag;
            break;
        }
        case TOKEN_STAR:
        case TOKEN_PLUS:
        case TOKEN_QUESTION: {
            if (top < 1) {
                ok = false;
                break;
            }
            NFAFragment inner = stack[--top];
            NFAState* start = create_nfa_state();
            NFAState* accept = create_nfa_state();
            start->is_epsilon = true;
            start->edge1 = inner.start;
            if (t->type != TOKEN_PLUS) {
                start->edge2 = accept; // '*' and '?' may skip the operand entirely
            }
            inner.accept->is_epsilon = true;
            if (t->type == TOKEN_QUESTION) {
                inner.accept->edge1 = accept; // at most one pass
            } else {
                inner.accept->edge1 = inner.start; // loop back for another pass
                inner.accept->edge2 = accept;
            }
            NFAFragment frag = {start, accept};
            stack[top++] = frag;
            break;
        }
        default:
            // Parentheses carry no meaning in postfix form.
            break;
        }
    }

    NFAFragment result = (ok && top > 0) ? stack[0] : invalid;
    free(stack);
    return result;
}

// ---------------------------------------------------------------------------
// Subset construction
// ---------------------------------------------------------------------------

// Computes epsilon closure of a set of NFA states.
//
// input_states / input_count: ids of the seed states (duplicates allowed).
// all_nfa_states / total_nfa_states: id-indexed table of every NFA state.
// output_states: receives a heap-allocated array of the closure's state ids
//                in ascending order (caller frees); output_count its length.
static void get_epsilon_closure(int* input_states, int input_count, NFAState** all_nfa_states,
                                int total_nfa_states, int** output_states, int* output_count) {
    bool* visited = (bool*)re_zero_array((size_t)total_nfa_states, sizeof *visited);
    int* queue = (int*)re_alloc_array((size_t)total_nfa_states, sizeof *queue);
    int head = 0, tail = 0;

    // Each state is enqueued at most once, so the queue can never exceed
    // total_nfa_states entries even when the seeds contain duplicates.
    for (int i = 0; i < input_count; i++) {
        int id = input_states[i];
        if (!visited[id]) {
            visited[id] = true;
            queue[tail++] = id;
        }
    }

    while (head < tail) {
        NFAState* s = all_nfa_states[queue[head++]];
        if (!s->is_epsilon) {
            continue;
        }
        NFAState* targets[2];
        targets[0] = s->edge1;
        targets[1] = s->edge2;
        for (int e = 0; e < 2; e++) {
            NFAState* next = targets[e];
            if (next != NULL && !visited[next->id]) {
                visited[next->id] = true;
                queue[tail++] = next->id;
            }
        }
    }

    // Every visited state was enqueued exactly once, so `tail` is the size of
    // the closure. Scanning `visited` yields the ids in ascending order, which
    // is what lets two closures be compared element by element.
    int* res = (int*)re_alloc_array((size_t)tail, sizeof *res);
    int n = 0;
    for (int i = 0; i < total_nfa_states; i++) {
        if (visited[i]) {
            res[n++] = i;
        }
    }

    *output_states = res;
    *output_count = n;
    free(visited);
    free(queue);
}

// Compare two NFA state sets. Both must be sorted the same way (as produced
// by get_epsilon_closure) for equal sets to compare equal.
static bool are_nfa_sets_equal(const int* a, int a_count, const int* b, int b_count) {
    if (a_count != b_count) {
        return false;
    }
    for (int i = 0; i < a_count; i++) {
        if (a[i] != b[i]) {
            return false;
        }
    }
    return true;
}

// Runs one pattern through tokenize -> concat -> postfix -> NFA and tags the
// resulting accept state with `rule_index`. Returns false if the pattern is
// NULL or malformed; *out is only written on success.
static bool compile_pattern(const char* pattern, int rule_index, NFAFragment* out) {
    if (pattern == NULL) {
        return false;
    }

    int t_count = 0, concat_count = 0, post_count = 0;
    RegexToken* tokens = tokenize_regex(pattern, &t_count);
    RegexToken* tokens_concat = insert_concat(tokens, t_count, &concat_count);
    RegexToken* tokens_postfix = infix_to_postfix(tokens_concat, concat_count, &post_count);
    NFAFragment frag = build_nfa(tokens_postfix, post_count);
    free(tokens);
    free(tokens_concat);
    free(tokens_postfix);

    if (frag.start == NULL) {
        return false;
    }
    frag.accept->accept_rule_index = rule_index;
    *out = frag;
    return true;
}

// Initialises a DFA state that owns `nfa_states` and has no transitions and
// no accepting rule yet.
static void init_dfa_state(DFAState* state, int id, int* nfa_states, int nfa_state_count) {
    state->id = id;
    state->nfa_states = nfa_states;
    state->nfa_state_count = nfa_state_count;
    memset(state->transitions, -1, sizeof(state->transitions));
    state->accept_rule_index = -1;
}

// Returns the index of the DFA state whose NFA set equals `set`, or -1.
static int find_dfa_state(const DFAState* states, int count, const int* set, int set_count) {
    for (int d = 0; d < count; d++) {
        if (are_nfa_sets_equal(states[d].nfa_states, states[d].nfa_state_count, set, set_count)) {
            return d;
        }
    }
    return -1;
}

// Compiles a set of regular expression patterns into a complete DFA using subset construction.
//
// patterns / pattern_count: the rule patterns; rule i is patterns[i].
// dfa_state_count_out:      receives the number of DFA states (0 on failure).
//
// Returns a heap-allocated array of DFA states, state 0 being the start
// state; release it with slex_free_dfa(). Each state's accept_rule_index is
// the lowest rule index among the accepting NFA states it contains, or -1.
// Returns NULL if an argument is invalid (NULL output pointer, negative
// count, NULL pattern) or a pattern is malformed or empty.
// Out-of-memory terminates the process.
//
// Uses a file-level NFA state registry, which is cleared on entry and on
// exit.
DFAState* slex_compile_regexes(char** patterns, int pattern_count, int* dfa_state_count_out) {
    if (dfa_state_count_out == NULL) {
        return NULL;
    }
    *dfa_state_count_out = 0;
    if (pattern_count < 0 || (pattern_count > 0 && patterns == NULL)) {
        return NULL;
    }

    free_all_nfa_states(); // Reset global state tracker

    // 1. Build NFA for each pattern
    NFAFragment* fragments =
        (NFAFragment*)re_alloc_array((size_t)pattern_count, sizeof *fragments);
    for (int i = 0; i < pattern_count; i++) {
        if (!compile_pattern(patterns[i], i, &fragments[i])) {
            free(fragments);
            free_all_nfa_states();
            return NULL;
        }
    }

    // 2. Create global start state with epsilon transitions to each pattern
    //    NFA's start state. States have only two edges, so the patterns are
    //    fanned out through a chain of epsilon "hub" states.
    NFAState* global_start = create_nfa_state();
    global_start->is_epsilon = true;
    NFAState* hub = global_start;
    for (int i = 0; i < pattern_count; i++) {
        hub->edge1 = fragments[i].start;
        if (i < pattern_count - 1) {
            NFAState* next_hub = create_nfa_state();
            next_hub->is_epsilon = true;
            hub->edge2 = next_hub;
            hub = next_hub;
        }
    }
    free(fragments);

    // 3. Subset construction. No NFA state is created past this point, so the
    //    registry pointer and count stay valid for the rest of the function.
    int total_nfa_states = g_nfa_state_count;
    NFAState** all_nfa_states = g_nfa_states;

    int dfa_capacity = RE_DFA_INITIAL_CAPACITY;
    int dfa_count = 0;
    DFAState* dfa_states = (DFAState*)re_alloc_array((size_t)dfa_capacity, sizeof *dfa_states);

    // Scratch buffer for the move set, reused for every (state, byte) pair.
    int* reachable = (int*)re_alloc_array((size_t)total_nfa_states, sizeof *reachable);

    // Create start DFA state (0) from the start state's epsilon closure
    int start_nfa_id = global_start->id;
    int* start_closure = NULL;
    int start_closure_count = 0;
    get_epsilon_closure(&start_nfa_id, 1, all_nfa_states, total_nfa_states,
                        &start_closure, &start_closure_count);
    init_dfa_state(&dfa_states[0], 0, start_closure, start_closure_count);
    dfa_count = 1;

    // New states are appended to dfa_states in discovery order, so walking
    // the array by index processes them breadth-first without a work queue.
    for (int curr_dfa_id = 0; curr_dfa_id < dfa_count; curr_dfa_id++) {
        // For each possible byte value
        for (int c = 0; c < RE_CHARSET_SIZE; c++) {
            // Find NFA states reachable on character 'c'
            const DFAState* source = &dfa_states[curr_dfa_id];
            int reachable_count = 0;
            for (int i = 0; i < source->nfa_state_count; i++) {
                NFAState* nfa_s = all_nfa_states[source->nfa_states[i]];
                if (!nfa_s->is_epsilon && nfa_s->char_set[c] && nfa_s->edge1) {
                    reachable[reachable_count++] = nfa_s->edge1->id;
                }
            }

            if (reachable_count == 0) {
                dfa_states[curr_dfa_id].transitions[c] = -1;
                continue;
            }

            // Compute epsilon closure of reachable NFA states
            int* closure = NULL;
            int closure_count = 0;
            get_epsilon_closure(reachable, reachable_count, all_nfa_states, total_nfa_states,
                                &closure, &closure_count);

            // Reuse the DFA state for this NFA set if it already exists
            int target = find_dfa_state(dfa_states, dfa_count, closure, closure_count);
            if (target != -1) {
                free(closure);
            } else {
                if (dfa_count >= dfa_capacity) {
                    dfa_capacity *= 2;
                    // `source` is stale after this; only indices are used below.
                    dfa_states = (DFAState*)re_grow_array(dfa_states, (size_t)dfa_capacity,
                                                          sizeof *dfa_states);
                }
                target = dfa_count++;
                init_dfa_state(&dfa_states[target], target, closure, closure_count);
            }
            dfa_states[curr_dfa_id].transitions[c] = target;
        }
    }

    // Determine accepting status of each DFA state based on NFA accept
    // states; when several rules accept, the lowest rule index wins.
    for (int d = 0; d < dfa_count; d++) {
        int best_rule = -1;
        for (int i = 0; i < dfa_states[d].nfa_state_count; i++) {
            int rule = all_nfa_states[dfa_states[d].nfa_states[i]]->accept_rule_index;
            if (rule != -1 && (best_rule == -1 || rule < best_rule)) {
                best_rule = rule;
            }
        }
        dfa_states[d].accept_rule_index = best_rule;
    }

    free(reachable);
    free_all_nfa_states(); // We no longer need the NFA states

    *dfa_state_count_out = dfa_count;
    return dfa_states;
}

// Releases a DFA returned by slex_compile_regexes(): each state's NFA id
// array, then the state array itself. A NULL `dfa_states` is a no-op.
void slex_free_dfa(DFAState* dfa_states, int dfa_state_count) {
    if (dfa_states == NULL) {
        return;
    }
    for (int i = 0; i < dfa_state_count; i++) {
        free(dfa_states[i].nfa_states);
    }
    free(dfa_states);
}