// scc/Source/scc_parser.c

#include "../Headers/scc_core.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

// Helper: Strip whitespace from both ends of a string, in place.
// Returns a pointer into `str` at its first non-whitespace character (or at
// its terminator when the string is all whitespace). Trailing whitespace is
// cut off by writing a '\0' into the caller's buffer.
static char* trim_whitespace(char* str) {
    char* first = str;
    for (; isspace((unsigned char)*first); first++) {
        // advance past leading whitespace; stops at '\0' at the latest
    }
    if (first[0] == '\0') {
        return first;
    }

    // The first character is known to be non-whitespace, so the kept length
    // never drops below one.
    size_t kept = strlen(first);
    while (kept > 1 && isspace((unsigned char)first[kept - 1])) {
        kept--;
    }
    first[kept] = '\0';
    return first;
}

// Helper: Copy the next line of a NUL-terminated buffer into line_buf.
// *cursor is the read position and is advanced past the copied text and one
// line terminator ("\r", "\n" or "\r\n"). At most max_len - 1 characters are
// copied per call; the rest of an over-long line is left in the buffer and is
// returned by the following call(s). line_buf is always NUL-terminated.
// Returns false, without touching line_buf, once *cursor is at the end.
static bool get_next_line(char** cursor, char* line_buf, int max_len) {
    char* src = *cursor;
    if (src[0] == '\0') {
        return false;
    }

    int copied = 0;
    for (; copied < max_len - 1; copied++) {
        const char ch = src[copied];
        if (ch == '\0' || ch == '\n' || ch == '\r') {
            break;
        }
        line_buf[copied] = ch;
    }
    line_buf[copied] = '\0';
    src += copied;

    // Consume a single line terminator, if one follows
    if (*src == '\r') {
        src++;
    }
    if (*src == '\n') {
        src++;
    }

    *cursor = src;
    return true;
}

// Helper: Split a string into whitespace-separated tokens, respecting quotes.
// A token that starts with '"' or '\'' runs up to and including the matching
// closing quote (or to the end of the string if the quote is never closed);
// the quote characters are kept as part of the token. Any other token runs up
// to the next whitespace character.
//
// On return *out_tokens is a malloc'd array of *out_count malloc'd strings;
// the caller owns the array and every string in it. When the input contains
// no tokens, or when an allocation fails, *out_tokens is NULL and *out_count
// is 0 (on failure, everything allocated so far has been freed).
static void split_tokens(const char* str, char*** out_tokens, uint64_t* out_count) {
    char** tokens = NULL;
    uint64_t count = 0;
    const char* p = str;

    while (*p != '\0') {
        while (isspace((unsigned char)*p)) p++;
        if (*p == '\0') break;

        const char* start = p;
        if (*p == '"' || *p == '\'') {
            const char quote = *p++;
            while (*p != '\0' && *p != quote) p++;
            if (*p == quote) p++;
        } else {
            while (*p != '\0' && !isspace((unsigned char)*p)) p++;
        }

        size_t len = (size_t)(p - start);
        char* token = (char*)malloc(len + 1);
        if (!token) goto fail;
        memcpy(token, start, len);
        token[len] = '\0';

        // Grow through a temporary so the existing array is not lost if
        // realloc fails.
        char** grown = (char**)realloc(tokens, (count + 1) * sizeof(char*));
        if (!grown) {
            free(token);
            goto fail;
        }
        tokens = grown;
        tokens[count++] = token;
    }

    *out_tokens = tokens;
    *out_count = count;
    return;

fail:
    for (uint64_t i = 0; i < count; i++) {
        free(tokens[i]);
    }
    free(tokens);
    *out_tokens = NULL;
    *out_count = 0;
}

bool scc_read_rule_from_cstr(char *content, scc_rules *output_rule) {
    if (!content || !output_rule) return false;

    // Initialize output structure
    output_rule->rules = NULL;
    output_rule->rule_count = 0;
    output_rule->syntax_ids = NULL;
    output_rule->syntax_id_count = 0;

    typedef enum {
        STATE_NONE,
        STATE_SYNTAX_IDS,
        STATE_RULES
    } ParserState;

    ParserState state = STATE_NONE;
    scc_rule* current_rule = NULL;

    char* cursor = content;
    char line[4096];

    while (get_next_line(&cursor, line, sizeof(line))) {
        char* trimmed = trim_whitespace(line);

        // Skip comments and empty lines
        if (trimmed[0] == '\0' || trimmed[0] == '#' || (trimmed[0] == '/' && trimmed[1] == '/')) {
            continue;
        }

        // Section switches
        if (strcmp(trimmed, "syntax_ids:") == 0) {
            state = STATE_SYNTAX_IDS;
            continue;
        } else if (strcmp(trimmed, "rules:") == 0) {
            state = STATE_RULES;
            continue;
        }

        if (state == STATE_SYNTAX_IDS) {
            // Trimmed line is a syntax ID
            output_rule->syntax_id_count++;
            output_rule->syntax_ids = (char**)realloc(output_rule->syntax_ids, output_rule->syntax_id_count * sizeof(char*));
            output_rule->syntax_ids[output_rule->syntax_id_count - 1] = strdup(trimmed);
        } else if (state == STATE_RULES) {
            if (strcmp(trimmed, ";") == 0) {
                current_rule = NULL;
                continue;
            }

            if (trimmed[0] == ':' || trimmed[0] == '|') {
                if (current_rule) {
                    char* match_part = trimmed + 1;
                    char** match_tokens = NULL;
                    uint64_t match_token_count = 0;
                    split_tokens(match_part, &match_tokens, &match_token_count);

                    scc_matching* matching = (scc_matching*)malloc(sizeof(scc_matching));
                    matching->match_ids = match_tokens;
                    matching->match_id_count = match_token_count;
                    matching->target_syntax_id = NULL;
                    matching->using_match_id = NULL;
                    matching->using_match_id_count = 0;

                    current_rule->matching_count++;
                    current_rule->matchings = (scc_matching**)realloc(current_rule->matchings, current_rule->matching_count * sizeof(scc_matching*));
                    current_rule->matchings[current_rule->matching_count - 1] = matching;
                }
            } else if (strncmp(trimmed, "=>", 2) == 0) {
                if (current_rule && current_rule->matching_count > 0) {
                    scc_matching* matching = current_rule->matchings[current_rule->matching_count - 1];
                    char* op_part = trimmed + 2;
                    char** op_tokens = NULL;
                    uint64_t op_token_count = 0;
                    split_tokens(op_part, &op_tokens, &op_token_count);

                    if (op_token_count > 0) {
                        if (strcmp(op_tokens[0], "new_node") == 0) {
                            if (op_token_count > 1) {
                                matching->target_syntax_id = strdup(op_tokens[1]);
                                if (op_token_count > 2) {
                                    matching->using_match_id_count = op_token_count - 2;
                                    matching->using_match_id = (char**)malloc(matching->using_match_id_count * sizeof(char*));
                                    for (uint64_t i = 0; i < matching->using_match_id_count; i++) {
                                        matching->using_match_id[i] = strdup(op_tokens[2 + i]);
                                    }
                                }
                            }
                        } else if (strcmp(op_tokens[0], "append_as_child") == 0) {
                            matching->target_syntax_id = strdup("append_as_child");
                            if (op_token_count > 1) {
                                matching->using_match_id_count = op_token_count - 1;
                                matching->using_match_id = (char**)malloc(matching->using_match_id_count * sizeof(char*));
                                for (uint64_t i = 0; i < matching->using_match_id_count; i++) {
                                    matching->using_match_id[i] = strdup(op_tokens[1 + i]);
                                }
                            }
                        } else if (strcmp(op_tokens[0], "skip") == 0) {
                            matching->target_syntax_id = strdup("skip");
                        }
                    }

                    // Free op_tokens
                    for (uint64_t i = 0; i < op_token_count; i++) {
                        free(op_tokens[i]);
                    }
                    free(op_tokens);
                }
            } else {
                // Defines a new rule (node_type_name)
                output_rule->rule_count++;
                output_rule->rules = (scc_rule*)realloc(output_rule->rules, output_rule->rule_count * sizeof(scc_rule));

                current_rule = &output_rule->rules[output_rule->rule_count - 1];
                current_rule->node_type_name = strdup(trimmed);
                current_rule->matchings = NULL;
                current_rule->matching_count = 0;
            }
        }
    }

    return true;
}

// Read the whole of an open stream into memory and parse it with
// scc_read_rule_from_cstr.
//
// The stream must be seekable: its size is found by seeking to the end, and
// it is then rewound to the start before reading. The stream is not closed.
//
// Returns false if f or output_rule is NULL, if seeking or sizing the stream
// fails, if the buffer cannot be allocated, or if a read error occurs; in
// those cases output_rule is not touched. Otherwise returns the result of
// scc_read_rule_from_cstr.
bool scc_read_rule_from_file(FILE *f, scc_rules *output_rule) {
    if (!f || !output_rule) return false;

    // Determine file size
    if (fseek(f, 0, SEEK_END) != 0) return false;
    long size = ftell(f);
    if (size < 0) return false;  // ftell reports failure as -1
    if (fseek(f, 0, SEEK_SET) != 0) return false;

    char* content = (char*)malloc((size_t)size + 1);
    if (!content) return false;

    // A short read without an error is accepted (e.g. text-mode newline
    // translation can yield fewer bytes than the reported size).
    size_t read_bytes = fread(content, 1, (size_t)size, f);
    if (read_bytes < (size_t)size && ferror(f)) {
        free(content);
        return false;
    }
    content[read_bytes] = '\0';

    bool success = scc_read_rule_from_cstr(content, output_rule);
    free(content);
    return success;
}