Load configuration for in-language syntax replacements

Add testbench
Reintroducing meta tokens
2025-08-26 22:43:59 +02:00 · 2025-08-25 12:09:54 +02:00 · 2025-08-25 07:12:22 +02:00 · 2025-08-24 20:54:20 +02:00 · 2025-08-12 19:04:09 +02:00 · 2025-08-11 12:49:44 +02:00
12 changed files with 1051 additions and 9 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,3 +4,6 @@ version = "0.1.0"
 edition = "2024"
 [dependencies]
 gtoml = "0.1.2"
 toml = "0.9.3"
 regex = "1.11.1"
--- a/example.mlc
+++ b/example.mlc
@@ -0,0 +1,3 @@
 variable:=-3; c := (a+b- 3) * 23 + variable; d := c - a;Natural : Number (n) := {n >= 0};faculty : Natural (n) -> Natural := if n = 0 then 1 else faculty (n-1) * n end;
 String Natural (n) := {Character * n};hello_word -> String := "Hello World!";
 first_letter -> Character := 'a';
--- a/language.toml
+++ b/language.toml
@@ -0,0 +1,79 @@
 # Meta rules are separate rules with priority over all other rules.
 # They can be compared to preprocessor directives, but are more powerful.
 # Preprocessor-style pattern matching runs at the highest priority, before anything else.
 [meta.replacements]
 comments = ["^--.*", ""]
 # Interpolation through a shell: the text matched by the meta pattern is replaced by the command's output.
 # Arguments are passed via capture groups, referenced as $<group number> in the shell command.
 [meta.interpolation]
 with = ["^#with ([\\w./]+)", "cat $1"]
 date = ["#date_now", "date"]
 user = ["#user", "user"]
 test = ["#test", "cat ./mathlib.mlc"]
 # Describes tokens to be replaced by identifiers and then later swapped back in after the tokenizer.
 # All special tokens are treated as constants
 [meta.token]
 string_constant = "\".*?\""
 char_constant = "'.'"
 # Every key below is used as a type in an enumeration that classifies the tokens
 #  -> Replacement in order
 #  -> Every amount of other symbols is saved as some kind of value
 #  -> Those are using the default type "identifier"
 [token]
 separator = [" ", ",", "\n"]
 operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":"]
 terminator = [";"]
 [semantics]
 keywords = ["if", "then", "else", "end"]
 [constants]
 number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"
 character = "'.'"
 logic = "(true|false)"
 [types]
 Number = "number"
 Character = "character"
 Type = ""
 Array = "{character * number}"
 Logic = "logic"
 # List of rules
 # Rules can be found in traces
 # use better names than rule_1, rule_2, ...
 # The compiler will run through all rules trying to match exactly one.
 # Uses the following generic types:
 # - OPERAND
 # - IDENTIFIER
 # - KEYWORD
 # - TERMINATOR
 # - OTHER (Use this type for ambiguous parts. Same as lazy .+ in regular expressions)
 # Custom types can be defined by creating a rule with the same name.
 # IMPORTANT: Rules are always top priority and can overwrite other types.
 # Named placeholders: The character # is reserved for named placeholders. They are only valid inside a rule.
 [syntax]
 definition = "IDENTIFIER#1 -> IDENTIFIER#2 := OTHER#3 TERMINATOR"
 definition_with_parameter = "IDENTIFIER#1 : parameter#2 -> IDENTIFIER#3 := OTHER#4 TERMINATOR"
 recursion = "#basename OTHER := OTHER #basename OTHER TERMINATOR"
 replace_predef = [ "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1", "#1 -> OTHER := #2 TERMINATOR OTHER (#2)" ]
 replace_postdef = [ "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR", "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR" ]
 unfold_parameter = [ ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->", ": OTHER #1 #2 #1 ( #3 ) OTHER ->" ]
 unfold_parameter_remove_brackets = [ ": OTHER IDENTIFIER ( ) OTHER ->", ": OTHER OTHER ->" ]
 parameter = ": OTHER ->"
 # The following sections are used to build different output formats
 # [interpreter] refers to the builtin interpreter using a minimal subset of C syntax
 # The name of each section is only used to specify the actual output.
 [clang]
 definition = "#2 #1 () {return (#3);}"
 Logic = "int"
 Number = "long int"
 Character = "char"
 Type = "struct"
 [interpreter]
--- a/mathlib.mlc
+++ b/mathlib.mlc
@@ -0,0 +1,4 @@
 Sigma -> Array := {0, 1, 2};
 N -> Array := {3};
 P -> Array := {3 -> 012};
 S -> Number := 3;
--- a/src/identification.rs
+++ b/src/identification.rs
@@ -0,0 +1,75 @@
 use crate::Token;
 use regex::Regex;
 use toml::{Table, Value};
 // Identifier
 // Holds a token stream together with the `types` and `constants`
 // configuration sections used to analyze its identifier tokens.
 pub struct Identifier {
    // Identities recorded so far; created empty by `new` and not filled by any code visible here.
    identities: Vec<Identity>,
    // Tokens to be analyzed.
    pub tokens: Vec<Token>,
    // Copy of the `types` section, set by `load_criteria_from_configuration`.
    type_configuration: Table,
    // Copy of the `constants` section; its values are compiled as regular expressions
    // by `identify_identifiers`.
    constant_configuration: Table,
 }
 // Identity
 // The identity of an identifier token.
 struct Identity {
    // Kind of identity (type, constant or definition).
    class: IdentityClass,
    // NOTE(review): presumably the identifier's text - nothing visible here writes it; confirm.
    name: String,
    // NOTE(review): presumably the name of the matching type/constant entry - confirm.
    sub_type: String,
 }
 // IdentityClass
 // The kinds of identity an identifier token can be given.
 // NOTE(review): the variants presumably correspond to the `types` and `constants`
 // configuration sections and to definitions found in the source - confirm.
 pub enum IdentityClass {
    TYPE,
    CONSTANT,
    DEFINITION,
 }
 impl Identifier {
    // Creates an Identifier that owns `token`, with no recorded identities and
    // empty type/constant configuration tables.
    pub fn new(token: Vec<Token>) -> Identifier {
        Identifier {
            identities: Vec::new(),
            tokens: token,
            type_configuration: Table::new(),
            constant_configuration: Table::new(),
        }
    }
    // Copies the `types` and `constants` sections out of the complete configuration.
    // Panics when a section is missing or cannot be converted into a table.
    pub fn load_criteria_from_configuration(&mut self, complete_configuration: Table) {
        // Both sections are looked up before either one is converted.
        let Some(types_value) = complete_configuration.get("types") else {
            panic!("Missing section types in configuration.");
        };
        let Some(constants_value) = complete_configuration.get("constants") else {
            panic!("Missing section constants in configuration.");
        };
        let (types_table, constants_table) = (
            Table::try_from(types_value).expect("Can't read type configuration from Value."),
            Table::try_from(constants_value)
                .expect("Can't read constant configuration from Value."),
        );
        // Only store the result once both conversions have succeeded.
        self.type_configuration = types_table;
        self.constant_configuration = constants_table;
    }
    pub fn identify_identifiers(&mut self) {
        let tokens: &Vec<Token> = &self.tokens;
        let constant_patterns: Table = self.constant_configuration.clone();
        let type_names: Table = self.type_configuration.clone();
        let mut identity_found = false;
        for token in tokens.iter() {
            if token.token_type == crate::TokenType::IDENTIFIER {
                for raw_pattern in constant_patterns.iter() {
                    let pattern: &str = raw_pattern.1.as_str().unwrap();
                    let expression: Regex = Regex::new(pattern).unwrap();
                    // Check for constant
                    if expression.is_match(token.token.as_str()) {
                        println!("Matching! Found {:?} {:?}.", raw_pattern.0, token.token);
                    }
                }
            }
        }
    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,16 +1,52 @@
-mod collector;
+mod preprocessor;
 mod syntax;
 mod testcases;
 mod tokenizer;
-use collector::Collector;
+use tokenizer::*;
 // main
 // Entry point: concatenates every file named on the command line, runs the
 // preprocessor and the tokenizer on the result, then swaps the meta token
 // placeholders back for their original tokens.
 // The -t/-l options and stdin input described in the comment below are not
 // implemented by the code shown; the configuration path is fixed to ./language.toml.
 fn main() {
-    let mut _collector: Collector = Collector {
+    // CL-Wrapper
-        definitions: vec![(String::from(""), String::from(""))],
+    let args: Vec<String> = std::env::args().collect();
        arguments: vec![(String::from(""), String::from(""))],
    };
-    let test_string: String = String::from("(1 + 2) * 3");
+    // Adjust to following principle:
    //  micro [-t <target>] [-l <language.toml>] [<list of source files>]
    //  -t default: first found
    //  -l default: language.toml
    //
    //  Either loads all source files or takes stdin input by piping code into the program
    let mut raw_source_code: String = String::from("");
    // Every argument after the program name is read as a source file and appended in order.
    for i in 1..args.len() {
        raw_source_code = raw_source_code
            + std::fs::read_to_string(args[i].clone())
                .expect("Source file not found!")
                .as_str();
    }
-    let echo_string: String = _collector.eval(test_string);
+    // Load language toml
    let mut meta_rules: crate::preprocessor::MetaRules =
        crate::preprocessor::MetaRules::new("./language.toml");
    let mut tokenizer_configuration: Tokenizer = Tokenizer::new();
    tokenizer_configuration.read_configuration_from_file("./language.toml");
-    println!("Result: {}", echo_string);
+    // Run preprocessor
    let preprocessed_source_code: String = meta_rules.process(raw_source_code);
    // Tokenizing
    tokenizer_configuration.eat(preprocessed_source_code.as_str());
    tokenizer_configuration.identify_tokens();
    //   Reintroducing meta_tokens
    for meta_token in meta_rules.special_tokens.iter() {
        // Go through all tokens
        for i in 0..tokenizer_configuration.tokens.len() {
            if meta_token.0 == tokenizer_configuration.tokens[i].token {
                tokenizer_configuration.tokens[i] = meta_token.1.clone();
                // Only the first token whose text equals this placeholder id is swapped back.
                break;
            }
        }
    }
    // Syntax resolving
    // Apply translation
 }
--- a/src/preprocessor.rs
+++ b/src/preprocessor.rs
@@ -0,0 +1,188 @@
 use crate::tokenizer::Token;
 use regex::{Captures, Match, Regex};
 use toml::{Table, Value};
 // MetaRules
 // Struct containing all meta rules read from the `meta` section of the
 // configuration, plus the meta tokens collected while processing code.
 pub struct MetaRules {
    // (rule name, (regex pattern, replacement text)) from `meta.replacements`.
    replacement_rules: Vec<(String, (String, String))>,
    // (rule name, (regex pattern, shell command template)) from `meta.interpolation`.
    interpolation_rules: Vec<(String, (String, String))>,
    // (token style name, regex pattern) from `meta.token`.
    token_rules: Vec<(String, String)>,
    // (placeholder id, original token) pairs appended by `process`.
    pub special_tokens: Vec<(String, Token)>,
 }
 // Implementation of MetaRules
 // Trait implementation
 impl MetaRules {
    // @name new
    // @return MetaRules
    // @brief Build the rule set from the `meta` section of a TOML configuration file.
    //        Panics when the file cannot be read or parsed, or when an entry
    //        does not have the expected shape.
    // @param configuration_filename: &str
    pub fn new(configuration_filename: &str) -> MetaRules {
        // Reads a section whose entries are arrays of (at least) two strings.
        fn pair_rules(section: &Value) -> Vec<(String, (String, String))> {
            section
                .as_table()
                .unwrap()
                .iter()
                .map(|(name, entry)| {
                    let pair: &Vec<Value> = entry.as_array().unwrap();
                    let first: String = pair[0].as_str().unwrap().to_owned();
                    let second: String = pair[1].as_str().unwrap().to_owned();
                    (name.clone(), (first, second))
                })
                .collect()
        }
        let raw_text: String = std::fs::read_to_string(configuration_filename)
            .expect("[ERROR] Could not open configuration file!");
        let parsed = gtoml::parse(raw_text.as_str())
            .expect("[ERROR] TOML invalid in preprocessor!");
        let root: Table = Table::try_from(parsed).unwrap();
        // A missing `meta` section is treated like an empty one.
        let meta_section: Table = root
            .get("meta")
            .map_or_else(Table::new, |section| section.as_table().unwrap().clone());
        let mut replacements: Vec<(String, (String, String))> = Vec::new();
        let mut interpolation: Vec<(String, (String, String))> = Vec::new();
        let mut meta_token_rules: Vec<(String, String)> = Vec::new();
        if meta_section.is_empty() {
            println!("[WARNING] No meta configuration, skipping preprocessor.");
        } else {
            if let Some(section) = meta_section.get("replacements") {
                println!("[INFO] Found replacement rules.");
                replacements = pair_rules(section);
            }
            if let Some(section) = meta_section.get("interpolation") {
                println!("[INFO] Found interpolation rules.");
                interpolation = pair_rules(section);
            }
            if let Some(section) = meta_section.get("token") {
                println!("[INFO] Found token rules.");
                meta_token_rules = section
                    .as_table()
                    .unwrap()
                    .iter()
                    .map(|(style, pattern)| (style.clone(), pattern.as_str().unwrap().to_owned()))
                    .collect();
            }
        }
        MetaRules {
            replacement_rules: replacements,
            interpolation_rules: interpolation,
            token_rules: meta_token_rules,
            special_tokens: Vec::new(),
        }
    }
    // @name process
    // @return String
    // @brief Run preprocessor on raw code: apply the replacement rules, then the
    //        interpolation rules, then swap every meta token for a placeholder id.
    //        Each swapped token is appended to `special_tokens` together with its id.
    //        Panics when a configured pattern is not a valid regular expression
    //        or when the shell cannot be started.
    // @param &mut self, raw_code: String
    pub fn process(&mut self, raw_code: String) -> String {
        let mut processed_code: String = raw_code;
        // replacement rules
        for (name, (pattern, replacement)) in self.replacement_rules.iter() {
            println!("[INFO] Applying rule {}", name);
            let base_pattern: Regex = Regex::new(pattern.as_str()).unwrap();
            processed_code = base_pattern
                .replace_all(processed_code.as_str(), replacement.as_str())
                .into_owned();
        }
        // interpolation rules
        for (name, (pattern, command_template)) in self.interpolation_rules.iter() {
            println!("[INFO] Applying rule {}", name);
            let base_pattern: Regex = Regex::new(pattern.as_str()).unwrap();
            // Only the first directive matching a rule is interpolated.
            let captures: Captures = match base_pattern.captures(processed_code.as_str()) {
                Some(found) => found,
                None => continue,
            };
            // Fill $<group> references in the command template from the directive.
            // NOTE: the command is built from text captured out of the source code and
            // handed to a shell; patterns must restrict their groups to safe characters.
            let mut command: String = String::new();
            captures.expand(command_template.as_str(), &mut command);
            let directive_range = captures
                .get(0)
                .expect("group 0 is always part of a match")
                .range();
            let subprocess = std::process::Command::new("/bin/bash")
                .arg("-c")
                .arg(format!("echo \"$({})\"", command))
                .output()
                .unwrap_or_else(|error| {
                    panic!("Failed to run command {}!: {:?}", command, error)
                });
            // Insert the output literally: a `$` in it must not be read as a
            // capture group reference.
            processed_code.replace_range(
                directive_range,
                String::from_utf8_lossy(&subprocess.stdout).as_ref(),
            );
        }
        for (style, pattern) in self.token_rules.iter() {
            println!("[INFO] Searching meta tokens of style {}", style);
            let token_pattern: Regex =
                Regex::new(pattern.as_str()).expect("Could not assign pattern.");
            // Search all occurrences, resuming behind the placeholder that was just
            // inserted so that a placeholder is never matched again.
            let mut search_from: usize = 0;
            while search_from <= processed_code.len() {
                let found: Match =
                    match token_pattern.find_at(processed_code.as_str(), search_from) {
                        Some(occurrence) => occurrence,
                        None => break,
                    };
                let (start, end) = (found.start(), found.end());
                if start == end {
                    // An empty match carries no token; step over one character.
                    search_from =
                        end + processed_code[end..].chars().next().map_or(1, char::len_utf8);
                    continue;
                }
                let meta_value: String = found.as_str().to_string();
                // Create id for each occurrence; extend it until it is unused so
                // that every placeholder maps to exactly one token.
                let mut meta_id: String = format!("meta_token_{}__{}", start, end);
                while self
                    .special_tokens
                    .iter()
                    .any(|(known_id, _)| *known_id == meta_id)
                {
                    meta_id.push('_');
                }
                // Replace token by id. The matched range is replaced directly; the
                // token text is never used as a pattern, so characters such as
                // `(` or `*` inside the token are safe.
                processed_code.replace_range(start..end, meta_id.as_str());
                search_from = start + meta_id.len();
                // Save id and token
                self.special_tokens.push((
                    meta_id,
                    Token {
                        token: meta_value,
                        token_type: crate::TokenType::IDENTIFIER,
                    },
                ));
            }
        }
        processed_code
    }
 }
--- a/src/structure.rs
+++ b/src/structure.rs
@@ -0,0 +1,31 @@
 // HeadStructure
 // Top level of structure: one configuration struct per field.
 // NOTE(review): presumably meant to mirror the sections of language.toml - confirm;
 // the sample configuration has no `hdl` or `compiled` section, and has `meta`,
 // `constants` and `clang` sections that have no field here.
 pub struct HeadStructure {
    token: TokenConfiguration,
    syntax: SyntaxConfiguration,
    semantics: SemanticsConfiguration,
    types: TypesConfiguration,
    hdl: HdlConfiguration,
    compiled: CompiledConfiguration,
    interpreter: InterpreterConfiguration,
 }
 // TokenConfiguration
 // Same keys as the `token` section of language.toml: separator, operands, terminator.
 pub struct TokenConfiguration {
    separator: Vec<String>,
    operands: Vec<String>,
    terminator: Vec<String>,
 }
 // SyntaxConfiguration
 // NOTE(review): language.toml lists `keywords` under `semantics`, not under
 // `syntax` - confirm which struct this field belongs to.
 pub struct SyntaxConfiguration {
    keywords: Vec<String>,
 }
 // Placeholders without fields.
 pub struct SemanticsConfiguration {}
 pub struct TypesConfiguration {}
 pub struct HdlConfiguration {}
 pub struct CompiledConfiguration {}
 pub struct InterpreterConfiguration {}
--- a/src/syntax.rs
+++ b/src/syntax.rs
@@ -0,0 +1,76 @@
 use toml::{Table, Value};
 // SyntaxRule
 // Implementation of a syntax rule that can be applied.
 #[derive(Debug)]
 pub struct SyntaxRule {
    // Key of the rule inside the `syntax` section.
    pub name: String,
    // First element of the rule's array (the pattern side in language.toml).
    pub left: String,
    // Second element of the rule's array (the replacement side in language.toml).
    pub right: String,
 }
 // Implementation of SyntaxRule
 // Load and Resolve from outside
 impl SyntaxRule {
    // @name new
    // @return SyntaxRule
    // @brief Create a new syntax rule from its name and its two sides.
    // @param name_: String, left_: String, right_: String
    fn new(name_: String, left_: String, right_: String) -> SyntaxRule {
        // Store the arguments; previously they were dropped and every field was empty.
        SyntaxRule {
            name: name_,
            left: left_,
            right: right_,
        }
    }
    // @name load
    // @return Vec<SyntaxRule>
    // @brief Load configuration and retrieve transformation rules. Only entries of
    //        the `syntax` section whose value is an array are transformation rules;
    //        the first two elements become the left and right side. A missing
    //        `syntax` section yields an empty list.
    //        Panics when the file cannot be read or parsed.
    // @param configuration_filename: &str
    pub fn load(configuration_filename: &str) -> Vec<SyntaxRule> {
        let configuration_content: String = std::fs::read_to_string(configuration_filename)
            .expect("[ERROR] Could not open configuration file!");
        let configuration = gtoml::parse(configuration_content.as_str())
            .expect("[ERROR] TOML invalid in syntax rules!");
        let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
        let syntax_definitions: &Table = match configuration_unpacked.get("syntax") {
            Some(config) => config
                .as_table()
                .expect("[ERROR] Section syntax is not a table!"),
            None => return Vec::new(),
        };
        // String elements are taken verbatim. `Value::to_string` renders the TOML
        // representation, which wraps a string in quotes, so it is only used as a
        // fallback for elements that are not strings.
        let side_text = |side: &Value| -> String {
            side.as_str()
                .map_or_else(|| side.to_string(), str::to_owned)
        };
        let mut rules: Vec<SyntaxRule> = Vec::new();
        for (name, rule) in syntax_definitions.iter() {
            // Entries with a plain string value are not transformation rules.
            let Some(sides) = rule.as_array() else {
                continue;
            };
            match sides.as_slice() {
                [left, right, ..] => rules.push(SyntaxRule {
                    name: name.clone(),
                    left: side_text(left),
                    right: side_text(right),
                }),
                _ => println!(
                    "[WARNING] Syntax rule {} needs a left and a right side, skipping.",
                    name
                ),
            }
        }
        rules
    }
    // @name resolve
    // @return String
    // @brief Intended to apply all rules until none of them can be applied again.
    //        Not implemented yet: both arguments are ignored and an empty string
    //        is returned.
    // @param rules: Vec<SyntaxRule>, unsolved: String
    pub fn resolve(rules: Vec<SyntaxRule>, unsolved: String) -> String {
        String::new()
    }
    // @name transform
    // @return String
    // @brief Intended to apply a single rule.
    //        Not implemented yet: the argument is ignored and an empty string
    //        is returned.
    // @param &mut self, unformed: String
    fn transform(&mut self, unformed: String) -> String {
        String::new()
    }
 }
--- a/src/testcases.rs
+++ b/src/testcases.rs
@@ -0,0 +1,138 @@
 #[cfg(test)]
 mod tests {
    // preprocessor
    // A line that starts with a comment marker is removed completely, while a
    // marker in the middle of a line leaves the text untouched.
    #[test]
    fn test_replacements() {
        use crate::preprocessor::MetaRules;
        let mut rules = MetaRules::new("./testspecs.toml");
        let stripped = rules.process("-- Comment to remove".to_string());
        let untouched = rules.process("This -- comment is not removed.".to_string());
        assert_eq!(stripped, "");
        assert_eq!(untouched, "This -- comment is not removed.");
    }
    // Processing the `#test` directive must produce the content of ./mathlib.mlc.
    #[test]
    fn test_interpolation() {
        let mut rules = crate::preprocessor::MetaRules::new("./testspecs.toml");
        let interpolated = rules.process("#test".to_owned());
        let library_text = std::fs::read_to_string("./mathlib.mlc").unwrap();
        assert_eq!(interpolated, library_text);
    }
    // A string constant is swapped for a placeholder id and remembered as a
    // special token.
    #[test]
    fn test_meta_token() {
        use crate::tokenizer::{Token, TokenType};
        let mut rules = crate::preprocessor::MetaRules::new("./testspecs.toml");
        let processed = rules.process("\"sample\"".to_string());
        let expected_tokens = vec![Token {
            token: "\"sample\"".to_string(),
            token_type: TokenType::IDENTIFIER,
        }];
        assert_eq!(expected_tokens.len(), rules.special_tokens.len());
        assert_eq!(expected_tokens[0].token, rules.special_tokens[0].1.token);
        assert_eq!(processed, "meta_token_0__8");
    }
    // Tokenizer
    // Eating a definition must split it into the expected raw token strings.
    #[test]
    fn test_eat() {
        use crate::tokenizer::Tokenizer;
        let mut tokenizer = Tokenizer::new();
        tokenizer.read_configuration_from_file("./testspecs.toml");
        tokenizer.eat("faculty : Natural n := if n = 0 then 1 else n * faculty (n - 1);");
        let expected = [
            "faculty", ":", "Natural", "n", ":=", "if", "n", "=", "0", "then", "1", "else", "n",
            "*", "faculty", "(", "n", "-", "1", ")", ";",
        ];
        assert_eq!(tokenizer.token_list, expected);
    }
    // Compares an identified tokenizer with a hand-built one: same configuration,
    // same number of tokens and same number of raw token strings.
    #[test]
    fn test_identify_tokens() {
        use crate::tokenizer::{Token, TokenType, Tokenizer};
        // Both tokenizers are prepared in the same way.
        let prepare = || {
            let mut tokenizer = Tokenizer::new();
            tokenizer.read_configuration_from_file("./testspecs.toml");
            tokenizer.eat("id : -> 125;");
            tokenizer
        };
        let mut token_sample = prepare();
        token_sample.identify_tokens();
        let mut token_verify = prepare();
        token_verify.tokens = [
            ("id", TokenType::IDENTIFIER),
            (":", TokenType::OPERAND),
            ("->", TokenType::OPERAND),
            ("125", TokenType::IDENTIFIER),
            (";", TokenType::TERMINATOR),
        ]
        .into_iter()
        .map(|(text, token_type)| Token {
            token: String::from(text),
            token_type,
        })
        .collect();
        assert_eq!(token_sample.configuration, token_verify.configuration);
        assert_eq!(token_sample.tokens.len(), token_verify.tokens.len());
        assert_eq!(token_sample.token_list.len(), token_verify.token_list.len());
    }
    // @name test_syntax_load
    // @return
    // @brief Loading ./testspecs.toml must yield as many rules as the expected list.
    // @param
    #[test]
    fn test_syntax_load() {
        use crate::syntax::SyntaxRule;
        let test = SyntaxRule::load("./testspecs.toml");
        // (name, left, right) of every expected rule.
        let verify: Vec<SyntaxRule> = [
            (
                "replace_predef",
                "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1",
                "#1 -> OTHER := #2 TERMINATOR OTHER (#2)",
            ),
            (
                "replace_postdef",
                "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR",
                "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR",
            ),
            (
                "unfold_parameter",
                ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->",
                ": OTHER #1 #2 #1 ( #3 ) OTHER ->",
            ),
            (
                "unfold_parameter_remove_brackets",
                ": OTHER IDENTIFIER ( ) OTHER ->",
                ": OTHER OTHER ->",
            ),
        ]
        .into_iter()
        .map(|(name, left, right)| SyntaxRule {
            name: String::from(name),
            left: String::from(left),
            right: String::from(right),
        })
        .collect();
        assert_eq!(test.len(), verify.len());
    }
 }
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -0,0 +1,330 @@
 use std::fs;
 use toml::{Table, Value};
 // TokenType
 // Category assigned to a token by Tokenizer::identify_tokens.
 // The type is a plain tag without payload, so it can be copied and compared freely.
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum TokenType {
    // Token listed under `operands` in the `token` section of the configuration
    OPERAND,
    // Token listed under `terminator` in the `token` section of the configuration
    TERMINATOR,
    // Fallback for every token that matches none of the configured lists
    IDENTIFIER,
    // Token listed under `keywords` in the `semantics` section of the configuration
    KEYWORD,
 }
 // Tokenizer
 // Holds the configuration and the state needed to turn source text into tokens.
 // eat fills token_list from input text, identify_tokens derives tokens from it;
 // both read the `token` section of configuration.
 pub struct Tokenizer {
    // Raw token strings collected by eat, in input order and without empty entries
    pub token_list: Vec<String>,
    // NOTE(review): the original carried an empty "BUG:" marker at this position; the
    // issue it refers to is not recorded anywhere in this file - TODO confirm or remove.
    // Classified tokens; replaced as a whole by every identify_tokens call
    pub tokens: Vec<Token>,
    // Grammar options from toml file; read_configuration_from_file requires the
    // `token` and `semantics` sections to be present
    pub configuration: Table,
 }
 // Token
 // This is a token with a token type: the token text paired with the category
 // it was assigned during identification.
 #[derive(Debug)]
 pub struct Token {
    // Text of the token as it was split from the input
    pub token: String,
    // Category of the token (operand, terminator, keyword or identifier)
    pub token_type: TokenType,
 }
 // Clone for Token
 // The token type is rebuilt variant by variant, so this implementation does not
 // require TokenType itself to be Clone.
 impl Clone for Token {
    // @name clone
    // @return Token
    // @brief Create an independent copy with the same text and the same token type
    // @param &self
    fn clone(&self) -> Token {
        let token = self.token.clone();
        let token_type = match &self.token_type {
            TokenType::IDENTIFIER => TokenType::IDENTIFIER,
            TokenType::TERMINATOR => TokenType::TERMINATOR,
            TokenType::KEYWORD => TokenType::KEYWORD,
            TokenType::OPERAND => TokenType::OPERAND,
        };
        Token { token, token_type }
    }
 }
 // Implementation of Tokenizer
 // Functions associated with the tokenizer struct and module.
 impl Tokenizer {
    // @name read_configuration_from_file
    // @return
    // @brief Try to read configuration from an external file
    // @param &mut self, configuration_filename: &str
    pub fn read_configuration_from_file(&mut self, configuration_filename: &str) {
        let configuration_string: String = fs::read_to_string(configuration_filename).expect(
            (String::from("Could not open configuration file at: ") + configuration_filename)
                .as_str(),
        );
        let configuration = gtoml::parse(configuration_string.as_str()).expect("TOML invalid!");
        self.configuration = Table::try_from(configuration).unwrap();
        // Check for token section in config, panic if not present
        if !self.configuration.contains_key("token") {
            panic!("Token section is not present!");
        }
        // Check for semantics section in config, panic if not present
        if !self.configuration.contains_key("semantics") {
            panic!("Section semantics is not present!");
        }
    }
    // @name new
    // @return Tokenizer
    // @brief Create a new Tokenizer
    // @param
    pub fn new() -> Tokenizer {
        let empty_tokens: Vec<Token> = vec![];
        let empty_value: toml::map::Map<String, Value> = toml::map::Map::new();
        let empty_token_list: Vec<String> = vec![];
        Tokenizer {
            tokens: empty_tokens,
            token_list: empty_token_list,
            configuration: empty_value,
        }
    }
    // @name eat
    // @return
    // @brief Consumes a string and safes the tokens
    // @param line: &str
    pub fn eat(&mut self, line: &str) {
        // Get token vectors from configuration
        let token_table_value: &Value = self.configuration.get("token").unwrap();
        let token_table: Table = Table::try_from(token_table_value).unwrap();
        let mut tokens: Vec<String> = vec![line.to_string()];
        let mut new_tokens: Vec<String> = vec![];
        let mut token_buffer: String = String::from("");
        // Iterate over tokens in token table and split tokens.
        if token_table.contains_key("separator") {
            let separator: Vec<Value> = token_table
                .get_key_value("separator")
                .unwrap()
                .1
                .as_array()
                .unwrap()
                .clone();
            if separator.len() > 0 {
                for token in tokens.iter() {
                    let mut token_feed = token.clone();
                    while !token_feed.is_empty() {
                        let mut no_match: bool = true;
                        for sep in separator.iter() {
                            if token_feed.starts_with(sep.as_str().unwrap()) {
                                // Reset and add token
                                no_match = false;
                                if token_buffer.len() > 0 {
                                    new_tokens.push(token_buffer.clone());
                                    token_buffer = String::from("");
                                }
                                let new_feed: String =
                                    token_feed.split_off(sep.as_str().unwrap().len());
                                token_feed = new_feed;
                            }
                        }
                        if no_match {
                            let new_feed: String = token_feed.split_off(1);
                            token_buffer = token_buffer
                                + String::from(token_feed.chars().next().unwrap()).as_str();
                            token_feed = new_feed;
                        }
                    }
                    // empty token
                    new_tokens.push(token_buffer.clone());
                    token_buffer = String::from("");
                }
                // empty token
                new_tokens.push(token_buffer.clone());
                token_buffer = String::from("");
            }
        }
        tokens = new_tokens.clone();
        new_tokens = vec![];
        if token_table.contains_key("operands") {
            let operands: Vec<Value> = token_table
                .get_key_value("operands")
                .unwrap()
                .1
                .as_array()
                .unwrap()
                .clone();
            if operands.len() > 0 {
                for token in tokens.iter() {
                    let mut token_feed = token.clone();
                    while !token_feed.is_empty() {
                        let mut no_match: bool = true;
                        for op in operands.iter() {
                            if token_feed.starts_with(op.as_str().unwrap()) {
                                // Reset and add token
                                no_match = false;
                                if token_buffer.len() > 0 {
                                    new_tokens.push(token_buffer.clone());
                                }
                                token_buffer = String::from("");
                                new_tokens.push(op.as_str().unwrap().to_string());
                                let new_feed: String =
                                    token_feed.split_off(op.as_str().unwrap().len());
                                token_feed = new_feed;
                            }
                        }
                        if no_match {
                            let new_feed: String = token_feed.split_off(1);
                            token_buffer = token_buffer
                                + String::from(token_feed.chars().next().unwrap()).as_str();
                            token_feed = new_feed;
                        }
                    }
                    // empty token
                    new_tokens.push(token_buffer.clone());
                    token_buffer = String::from("");
                }
                // empty token
                new_tokens.push(token_buffer.clone());
                token_buffer = String::from("");
            }
        }
        tokens = new_tokens.clone();
        new_tokens = vec![];
        if token_table.contains_key("terminator") {
            let terminator: Vec<Value> = token_table
                .get_key_value("terminator")
                .unwrap()
                .1
                .as_array()
                .unwrap()
                .clone();
            if terminator.len() > 0 {
                for token in tokens.iter() {
                    let mut token_feed = token.clone();
                    while !token_feed.is_empty() {
                        let mut no_match: bool = true;
                        for term in terminator.iter() {
                            if token_feed.starts_with(term.as_str().unwrap()) {
                                // Reset and add token
                                no_match = false;
                                if token_buffer.len() > 0 {
                                    new_tokens.push(token_buffer.clone());
                                }
                                token_buffer = String::from("");
                                new_tokens.push(term.as_str().unwrap().to_string());
                                let new_feed: String =
                                    token_feed.split_off(term.as_str().unwrap().len());
                                token_feed = new_feed;
                            }
                        }
                        if no_match {
                            let new_feed: String = token_feed.split_off(1);
                            token_buffer = token_buffer
                                + String::from(token_feed.chars().next().unwrap()).as_str();
                            token_feed = new_feed;
                        }
                    }
                    // empty token as token ended
                    new_tokens.push(token_buffer.clone());
                    token_buffer = String::from("");
                }
                // empty token
                new_tokens.push(token_buffer.clone());
            }
        }
        self.token_list.append(&mut new_tokens);
        // Clean up token list
        let mut cleaned_token_list: Vec<String> = vec![];
        for token in self.token_list.iter() {
            if token.as_str() != "" {
                cleaned_token_list.push(token.to_string());
            }
        }
        self.token_list = cleaned_token_list;
    }
    // @name identify_tokens
    // @return
    // @brief Go through all tokens and try to find them.
    // @param &mut self
    pub fn identify_tokens(&mut self) {
        // Go through token list
        let mut token_identities: Vec<Token> = vec![];
        let mut found_token: bool;
        let token_section: Table =
            Table::try_from(self.configuration.get("token").unwrap()).unwrap();
        let semantics_section: Table =
            Table::try_from(self.configuration.get("semantics").unwrap()).unwrap();
        for token in self.token_list.iter() {
            found_token = false;
            if token.as_str() == "" {
                continue;
            }
            // Check if token is an operand
            if token_section.contains_key("operands") {
                let operands: Vec<Value> = token_section
                    .get_key_value("operands")
                    .unwrap()
                    .1
                    .as_array()
                    .unwrap()
                    .clone();
                for operand in operands.iter() {
                    if operand.as_str().unwrap() == token.as_str() {
                        token_identities.push(Token {
                            token: token.clone(),
                            token_type: TokenType::OPERAND,
                        });
                        found_token = true;
                    }
                }
            }
            if token_section.contains_key("terminator") && !found_token {
                let terminator: Vec<Value> = token_section
                    .get_key_value("terminator")
                    .unwrap()
                    .1
                    .as_array()
                    .unwrap()
                    .clone();
                for term in terminator.iter() {
                    if term.as_str().unwrap() == token.as_str() {
                        token_identities.push(Token {
                            token: token.clone(),
                            token_type: TokenType::TERMINATOR,
                        });
                        found_token = true;
                    }
                }
            }
            if semantics_section.contains_key("keywords") && !found_token {
                let keywords: Vec<Value> = semantics_section
                    .get_key_value("keywords")
                    .unwrap()
                    .1
                    .as_array()
                    .unwrap()
                    .clone();
                for keyword in keywords.iter() {
                    if keyword.as_str().unwrap() == token.as_str() {
                        token_identities.push(Token {
                            token: token.clone(),
                            token_type: TokenType::KEYWORD,
                        });
                        found_token = true;
                    }
                }
            }
            if !found_token {
                token_identities.push(Token {
                    token: token.clone(),
                    token_type: TokenType::IDENTIFIER,
                });
            }
        }
        self.tokens = token_identities;
    }
 }
--- a/testspecs.toml
+++ b/testspecs.toml
@@ -0,0 +1,79 @@
 # Meta rules are separate rules with priority over all other rules.
 # They can be compared to preprocessor directives, but are more powerful.
 # Preprocessor-style pattern matching runs at the highest priority, before anything else.
 [meta.replacements]
 comments = ["^--.*", ""]
 # Interpolation with a shell, replaces the meta pattern by the interpolation result.
 # Passing arguments is supported through groups and #<parameter number> in the shell command.
 [meta.interpolation]
 with = ["^#with ([\\w./]+)", "cat $1"]
 date = ["#date_now", "date"]
 user = ["#user", "user"]
 test = ["#test", "cat ./mathlib.mlc"]
 # Describes tokens to be replaced by identifiers and then later swapped back in after the tokenizer.
 # All special tokens are treated as constants
 [meta.token]
 string_constant = "\".*?\""
 char_constant = "'.'"
 # Every key below is used as a type in an enumeration to classify the tokens
 #  -> Replacement happens in order
 #  -> Any other run of symbols is saved as a value
 #  -> Those values use the default type "identifier"
 [token]
 separator = [" ", ",", "\n"]
 operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":"]
 terminator = [";"]
 [semantics]
 keywords = ["if", "then", "else", "end"]
 [constants]
 number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"
 character = "'.'"
 logic = "(true|false)"
 [types]
 Number = "number"
 Character = "character"
 Type = ""
 Array = "{character * number}"
 Logic = "logic"
 # List of rules
 # Rules can be found in traces
 # use better names than rule_1, rule_2, ...
 # The compiler will run through all rules trying to match exactly one.
 # Uses the following generic types:
 # - OPERAND
 # - IDENTIFIER
 # - KEYWORD
 # - TERMINATOR
 # - OTHER (Use this type for ambiguous parts. Same as lazy .+ in regular expressions)
 # Custom types can be defined by creating a rule with the same name.
 # IMPORTANT: Rules are always top priority and can overwrite other types.
 # Named placeholders: The character # is reserved for named placeholders. They are only valid inside a rule.
 [syntax]
 definition = "IDENTIFIER#1 -> IDENTIFIER#2 := OTHER#3 TERMINATOR"
 definition_with_parameter = "IDENTIFIER#1 : parameter#2 -> IDENTIFIER#3 := OTHER#4 TERMINATOR"
 recursion = "#basename OTHER := OTHER #basename OTHER TERMINATOR"
 replace_predef = [ "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1", "#1 -> OTHER := #2 TERMINATOR OTHER (#2)" ]
 replace_postdef = [ "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR", "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR" ]
 unfold_parameter = [ ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->", ": OTHER #1 #2 #1 ( #3 ) OTHER ->" ]
 unfold_parameter_remove_brackets = [ ": OTHER IDENTIFIER ( ) OTHER ->", ": OTHER OTHER ->" ]
 parameter = ": OTHER ->"
 # The following sections are used to build different output formats
 # [interpreter] refers to the builtin interpreter using a minimal subset of C syntax
 # The name of each section is only used to specify the actual output.
 [clang]
 definition = "#2 #1 () {return (#3);}"
 Logic = "int"
 Number = "long int"
 Character = "char"
 Type = "struct"
 [interpreter]
Author	SHA1	Message	Date
yannickreiss	2a846a5f53	Load configuration for in language syntax replacements	2025-08-26 22:43:59 +02:00
yannickreiss	f67c79c65b	Add testbench	2025-08-25 12:09:54 +02:00
yannickreiss	42fa5affb5	Reintroducing meta tokens	2025-08-25 07:12:22 +02:00
yannickreiss	015de5dc0a	implement meta token replacement	2025-08-24 20:54:20 +02:00
yannickreiss	ddba3423df	Transofrmation	2025-08-12 19:04:09 +02:00
yannickreiss	9e4141fc96	Implement sample with working preprocessor (Stage 1 + 2)	2025-08-11 12:49:44 +02:00
yannickreiss	0b6073b5bb	Add types and semantics to micro language description	2025-08-07 09:33:47 +02:00