diff --git a/Cargo.toml b/Cargo.toml
index 8aa2b9b..596f90a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,3 +4,6 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
+gtoml = "0.1.2"
+toml = "0.9.3"
+regex = "1.11.1"
diff --git a/example.mlc b/example.mlc
new file mode 100644
index 0000000..1e15215
--- /dev/null
+++ b/example.mlc
@@ -0,0 +1,5 @@
+#with language.toml
+variable:=-3; c := (a+b- 3) * 23 + variable; d := c - a;Natural : Number (n) := {n >= 0};faculty : Natural (n) -> Natural := if n = 0 then 1 else faculty (n-1) * n end;
+String Natural (n) := {Character * n};hello_world -> String := "Hello World!";
+first_letter -> Character := 'a';
+wrong -> Logic := false;date -> String := "#date_now";
diff --git a/language.toml b/language.toml
index 1727e64..137ffca 100644
--- a/language.toml
+++ b/language.toml
@@ -1,31 +1,77 @@
+# Meta rules are separate rules with priority over all other rules.
+# They can be compared to preprocessor directives, but are more powerful.
+
+# Pattern matching in preprocessor style; it runs at the highest priority, before anything else.
+[meta.replacements]
+comments = ["^--.*", ""]
+
+# Interpolation with a shell replaces the meta pattern with the interpolation result.
+# Passing arguments is supported through capture groups referenced in the shell command (e.g. $1).
+[meta.interpolation]
+with = ["^#with ([\\w./]+)", "cat $1"]
+date = ["#date_now", "date"]
+
+# Describes tokens that are replaced by identifiers and swapped back in after the tokenizer has run.
+# All special tokens are treated as constants.
+[meta.token]
+string_constant = "\".*?\""
+char_constant = "'.'"
+
 # Every key below is used as type in an enumerate to sort the tokens
 # -> Replacement in order
 # -> Every amount of other symbols is saved as some kind of value
 # -> Those are using the default type "identifier"
 [token]
-separator = [" "]
-operands = [":=", "->", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":", "'", "\""]
+separator = [" ", ",", "\n"]
+operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":"]
 terminator = [";"]
 
 [semantics]
-keywords = ["if", "then", "else", "with"]
+keywords = ["if", "then", "else", "end", "with"]
 
-# constant descriptions
+[constants]
 number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"
-string = "\"[\w\W]*?\""
-character = "'[\w\W]'"
+character = "'.'"
 logic = "(true|false)"
 
 [types]
-Number = ""
-Character = ""
+Number = "number"
+Character = "character"
 Type = ""
-Logic = ""
+Array = "{character * number}"
+Logic = "logic"
 
+# List of rules.
+# Rules can be found in traces, so use better names than rule_1, rule_2, ...
+# The compiler will run through all rules, trying to match exactly one.
+# Rules use the following generic types:
+# - OPERAND
+# - IDENTIFIER
+# - KEYWORD
+# - TERMINATOR
+# - OTHER (use this type for ambiguous parts; same as a lazy .+ in regular expressions)
+# Definitions of custom types are possible by creating a rule with the same name.
+# IMPORTANT: Rules always have top priority and can overwrite other types.
+# Named placeholders: The character # is reserved for named placeholders. They are only valid inside a rule.
 [syntax]
+definition = "IDENTIFIER#1 -> IDENTIFIER#2 := OTHER#3 TERMINATOR"
+definition_with_parameter = "IDENTIFIER#1 : parameter#2 -> IDENTIFIER#3 := OTHER#4 TERMINATOR"
+recursion = "#basename OTHER := OTHER #basename OTHER TERMINATOR"
+replace_predef = [ "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1", "#1 -> OTHER := #2 TERMINATOR OTHER (#2)" ]
+replace_postdef = [ "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR", "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR" ]
+unfold_parameter = [ ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->", ": OTHER #1 #2 #1 ( #3 ) OTHER ->" ]
+unfold_parameter_remove_brackets = [ ": OTHER IDENTIFIER ( ) OTHER ->", ": OTHER OTHER ->" ]
+parameter = ": OTHER ->"
 
-[hdl]
-
-[compiled]
+# The following sections are used to build different output formats.
+# [interpreter] refers to the built-in interpreter using a minimal subset of C syntax.
+# The name of each section is only used to specify the actual output.
+[clang]
+definition = "#2 #1 () {return (#3);}"
+Logic = "int"
+Number = "long int"
+Character = "char"
+Type = "struct"
 
 [interpreter]
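As an illustration of the [meta.interpolation] rules above, the `with` entry boils down to the following stand-alone sketch (assumptions: bash is available and language.toml is in the working directory; the snippet is an illustration, not part of the patch):

// Stand-alone sketch: one interpolation rule applied by hand.
// Pattern and command are the `with` entry from [meta.interpolation] above.
use regex::Regex;
use std::process::Command;

fn main() {
    let first_line = "#with language.toml";
    let pattern = Regex::new(r"^#with ([\w./]+)").unwrap();
    // The capture group is substituted into the shell command, here "cat language.toml".
    let command = pattern.replace(first_line, "cat $1");
    let output = Command::new("/bin/bash")
        .arg("-c")
        .arg(command.as_ref())
        .output()
        .expect("failed to run interpolation command");
    // The preprocessor would splice this output back in place of the matched line.
    println!("{}", String::from_utf8_lossy(&output.stdout));
}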
diff --git a/src/identification.rs b/src/identification.rs
new file mode 100644
index 0000000..e02f9dc
--- /dev/null
+++ b/src/identification.rs
@@ -0,0 +1,75 @@
+use crate::Token;
+use regex::Regex;
+use toml::{Table, Value};
+
+// Identifier
+// Analyzes identifier tokens using the configured type and constant criteria.
+pub struct Identifier {
+    identities: Vec<Identity>,
+    pub tokens: Vec<Token>,
+    type_configuration: Table,
+    constant_configuration: Table,
+}
+
+// Identity
+// The identity of an identifier token.
+struct Identity {
+    class: IdentityClass,
+    name: String,
+    sub_type: String,
+}
+
+pub enum IdentityClass {
+    TYPE,
+    CONSTANT,
+    DEFINITION,
+}
+
+impl Identifier {
+    pub fn new(token: Vec<Token>) -> Identifier {
+        let identities: Vec<Identity> = vec![];
+        let new_config_type: Table = Table::new();
+        let new_config_constant: Table = Table::new();
+        Identifier {
+            identities: identities,
+            tokens: token,
+            type_configuration: new_config_type,
+            constant_configuration: new_config_constant,
+        }
+    }
+
+    pub fn load_criteria_from_configuration(&mut self, complete_configuration: Table) {
+        let type_configuration_wrapped: &Value = complete_configuration
+            .get("types")
+            .expect("Missing section types in configuration.");
+        let constant_configuration_wrapped: &Value = complete_configuration
+            .get("constants")
+            .expect("Missing section constants in configuration.");
+        let type_configuration: Table = Table::try_from(type_configuration_wrapped)
+            .expect("Can't read type configuration from Value.");
+        let constant_configuration: Table = Table::try_from(constant_configuration_wrapped)
+            .expect("Can't read constant configuration from Value.");
+        self.type_configuration = type_configuration;
+        self.constant_configuration = constant_configuration;
+    }
+
+    pub fn identify_identifiers(&mut self) {
+        let tokens: &Vec<Token> = &self.tokens;
+        let constant_patterns: Table = self.constant_configuration.clone();
+        let type_names: Table = self.type_configuration.clone();
+        let mut identity_found = false;
+
+        for token in tokens.iter() {
+            if token.token_type == crate::TokenType::IDENTIFIER {
+                for raw_pattern in constant_patterns.iter() {
+                    let pattern: &str = raw_pattern.1.as_str().unwrap();
+                    let expression: Regex = Regex::new(pattern).unwrap();
+                    // Check for constant
+                    if expression.is_match(token.token.as_str()) {
+                        println!("Matching! Found {:?} {:?}.", raw_pattern.0, token.token);
+                    }
+                }
+            }
+        }
+    }
+}
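For a feel of what identify_identifiers checks, the `number` pattern from [constants] can be tried directly; note that is_match is unanchored, so identifiers that merely contain digits also match (stand-alone sketch, not part of the patch):

use regex::Regex;

fn main() {
    // The `number` pattern from the [constants] section of language.toml.
    let number = Regex::new("(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)").unwrap();
    assert!(number.is_match("23"));
    assert!(number.is_match("0x1F"));
    assert!(!number.is_match("variable"));
    // Unanchored matching also fires on mixed identifiers such as "value23".
    assert!(number.is_match("value23"));
}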
diff --git a/src/main.rs b/src/main.rs
index 685a566..a9fa082 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,16 +1,26 @@
-mod collector;
+mod identification;
+mod preprocessor;
+mod testcases;
+mod tokenizer;
 
-use collector::Collector;
+use tokenizer::*;
 
 fn main() {
-    let mut _collector: Collector = Collector {
-        definitions: vec![(String::from(""), String::from(""))],
-        arguments: vec![(String::from(""), String::from(""))],
-    };
+    let sample_code: String = std::fs::read_to_string("example.mlc").unwrap();
+    let mut example_tokenizer: Tokenizer = Tokenizer::new();
+    let mut meta_rules: crate::preprocessor::MetaRules =
+        crate::preprocessor::MetaRules::new("./language.toml");
+    let processed_sample_code: String = meta_rules.process(sample_code.to_owned());
+    example_tokenizer.read_configuration_from_file("./language.toml");
+    example_tokenizer.eat(processed_sample_code.as_str());
+    example_tokenizer.identify_tokens();
 
-    let test_string: String = String::from("(1 + 2) * 3");
+    let mut example_identifier: identification::Identifier =
+        identification::Identifier::new(example_tokenizer.tokens);
+    example_identifier.load_criteria_from_configuration(example_tokenizer.configuration);
+    example_identifier.identify_identifiers();
 
-    let echo_string: String = _collector.eval(test_string);
-
-    println!("Result: {}", echo_string);
+    for token in example_identifier.tokens.iter() {
+        print!("{}", token.token);
+    }
 }
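The final loop in main() only echoes the token text. A hypothetical variant that also prints each token's class could look like the following; it assumes TokenType stays in scope through `use tokenizer::*;` and is a sketch, not part of the patch:

    // Hypothetical replacement for the final loop in main(): also show each token's class.
    for token in example_identifier.tokens.iter() {
        let class = match token.token_type {
            TokenType::OPERAND => "operand",
            TokenType::TERMINATOR => "terminator",
            TokenType::KEYWORD => "keyword",
            TokenType::IDENTIFIER => "identifier",
        };
        println!("{:12} {}", class, token.token);
    }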
diff --git a/src/preprocessor.rs b/src/preprocessor.rs
new file mode 100644
index 0000000..168a7f9
--- /dev/null
+++ b/src/preprocessor.rs
@@ -0,0 +1,148 @@
+use crate::tokenizer::Token;
+use regex::{Captures, Regex};
+use toml::{Table, Value};
+
+// MetaRules
+// Struct containing all meta rules.
+pub struct MetaRules {
+    replacement_rules: Vec<(String, (String, String))>,
+    interpolation_rules: Vec<(String, (String, String))>,
+    token_rules: Vec<(String, String)>,
+    special_tokens: Vec<Token>,
+}
+
+// Implementation of MetaRules
+// Functions associated with the MetaRules struct.
+impl MetaRules {
+    // @name new
+    // @return MetaRules
+    // @brief Create a new rule struct by reading from a configuration file.
+    // @param configuration_filename: &str
+    pub fn new(configuration_filename: &str) -> MetaRules {
+        let configuration_content: String = std::fs::read_to_string(configuration_filename)
+            .expect("[ERROR] Could not open configuration file!");
+        let mut replacements: Vec<(String, (String, String))> = vec![];
+        let mut interpolation: Vec<(String, (String, String))> = vec![];
+        let mut meta_token_rules: Vec<(String, String)> = vec![];
+        let meta_tokens: Vec<Token> = vec![];
+        let configuration = gtoml::parse(configuration_content.as_str())
+            .expect("[ERROR] TOML invalid in preprocessor!");
+        let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
+        let meta_configuration: Table = match configuration_unpacked.get("meta") {
+            Some(config) => config.as_table().unwrap().clone(),
+            None => Table::new(),
+        };
+
+        if !meta_configuration.is_empty() {
+            if meta_configuration.contains_key("replacements") {
+                println!("[INFO] Found replacement rules.");
+                let replacement_rules: Table = meta_configuration
+                    .get("replacements")
+                    .unwrap()
+                    .as_table()
+                    .unwrap()
+                    .clone();
+                for key in replacement_rules.keys() {
+                    let value: Vec<Value> = replacement_rules
+                        .get(key)
+                        .unwrap()
+                        .as_array()
+                        .unwrap()
+                        .clone();
+                    let name: String = key.clone();
+                    let pattern: String = value[0].as_str().unwrap().to_owned();
+                    let replacement: String = value[1].as_str().unwrap().to_owned();
+                    replacements.push((name, (pattern, replacement)));
+                }
+            }
+
+            if meta_configuration.contains_key("interpolation") {
+                println!("[INFO] Found interpolation rules.");
+                let interpolation_rules: Table = meta_configuration
+                    .get("interpolation")
+                    .unwrap()
+                    .as_table()
+                    .unwrap()
+                    .clone();
+                for key in interpolation_rules.keys() {
+                    let value: Vec<Value> = interpolation_rules
+                        .get(key)
+                        .unwrap()
+                        .as_array()
+                        .unwrap()
+                        .clone();
+                    let name: String = key.clone();
+                    let pattern: String = value[0].as_str().unwrap().to_owned();
+                    let cmd: &str = value[1].as_str().unwrap();
+                    interpolation.push((name, (pattern, String::from(cmd))));
+                }
+            }
+
+            if meta_configuration.contains_key("token") {
+                println!("[INFO] Found token rules.");
+                let token_rules: Table = meta_configuration
+                    .get("token")
+                    .unwrap()
+                    .as_table()
+                    .unwrap()
+                    .clone();
+                for rule in token_rules.keys() {
+                    let pattern: String =
+                        token_rules.get(rule).unwrap().as_str().unwrap().to_owned();
+                    meta_token_rules.push((rule.clone(), pattern));
+                }
+            }
+        } else {
+            println!("[WARNING] No meta configuration, skipping preprocessor.");
+        }
+
+        MetaRules {
+            replacement_rules: replacements,
+            interpolation_rules: interpolation,
+            token_rules: meta_token_rules,
+            special_tokens: meta_tokens,
+        }
+    }
+
+    // @name process
+    // @return String
+    // @brief Run the preprocessor on raw code.
+    // @param &mut self, raw_code: String
+    pub fn process(&mut self, raw_code: String) -> String {
+        let mut processed_code: String = raw_code.clone();
+
+        // replacement rules
+        for rule in self.replacement_rules.iter() {
+            println!("[INFO] Applying rule {}", rule.0);
+            let base_pattern: Regex = Regex::new((rule.1 .0).as_str()).unwrap();
+            processed_code = base_pattern
+                .replace_all(processed_code.as_str(), rule.1 .1.as_str())
+                .to_string();
+        }
+
+        // interpolation rules
+        for rule in self.interpolation_rules.iter() {
+            println!("[INFO] Applying rule {}", rule.0);
+            let base_pattern: Regex = Regex::new((rule.1 .0).as_str()).unwrap();
+            let processed_code_replacement = processed_code.clone();
+            let captures = match base_pattern.captures(processed_code_replacement.as_str()) {
+                Some(found) => found,
+                // Skip interpolation rules whose pattern does not occur in the code.
+                None => continue,
+            };
+            let parameter = &captures[0];
+            let command: &str = &base_pattern.replace(parameter, rule.1 .1.as_str());
+            println!("{:?}", &command);
+            let subprocess = std::process::Command::new("/bin/bash")
+                .arg("-c")
+                .arg(String::from("echo \"$(") + command + ")\"")
+                .output()
+                .expect((String::from("") + "Failed to run command " + command + "!").as_str());
+            processed_code = base_pattern
+                .replace(
+                    processed_code.as_str(),
+                    String::from_utf8(subprocess.stdout).unwrap(),
+                )
+                .to_string();
+        }
+
+        return processed_code;
+    }
+}
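A minimal test sketch for the replacement pass of the preprocessor; it assumes language.toml is reachable from the test working directory and could live next to the dummy test in src/testcases.rs:

#[cfg(test)]
mod preprocessor_tests {
    use crate::preprocessor::MetaRules;

    #[test]
    fn strips_line_comments() {
        // The `comments` rule in [meta.replacements] deletes lines starting with "--".
        let mut rules = MetaRules::new("./language.toml");
        let processed = rules.process(String::from("-- a comment\nvariable:=-3;"));
        assert!(!processed.contains("a comment"));
        assert!(processed.contains("variable:=-3;"));
    }
}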
diff --git a/src/structure.rs b/src/structure.rs
new file mode 100644
index 0000000..b28dfb4
--- /dev/null
+++ b/src/structure.rs
@@ -0,0 +1,31 @@
+// HeadStructure
+// Top level of the configuration structure.
+pub struct HeadStructure {
+    token: TokenConfiguration,
+    syntax: SyntaxConfiguration,
+    semantics: SemanticsConfiguration,
+    types: TypesConfiguration,
+    hdl: HdlConfiguration,
+    compiled: CompiledConfiguration,
+    interpreter: InterpreterConfiguration,
+}
+
+pub struct TokenConfiguration {
+    separator: Vec<String>,
+    operands: Vec<String>,
+    terminator: Vec<String>,
+}
+
+pub struct SyntaxConfiguration {
+    keywords: Vec<String>,
+}
+
+pub struct SemanticsConfiguration {}
+
+pub struct TypesConfiguration {}
+
+pub struct HdlConfiguration {}
+
+pub struct CompiledConfiguration {}
+
+pub struct InterpreterConfiguration {}
diff --git a/src/testcases.rs b/src/testcases.rs
new file mode 100644
index 0000000..092d231
--- /dev/null
+++ b/src/testcases.rs
@@ -0,0 +1,9 @@
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn dummy_test() {
+        assert_eq!(2, 2);
+    }
+}
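dummy_test is only a placeholder; a sketch of a more meaningful test inside the same tests module, exercising the tokenizer added below (again assuming language.toml is reachable from the test working directory):

    #[test]
    fn splits_on_separators_operands_and_terminators() {
        let mut tokenizer = crate::tokenizer::Tokenizer::new();
        tokenizer.read_configuration_from_file("./language.toml");
        tokenizer.eat("c := a + b;");
        tokenizer.identify_tokens();
        let tokens: Vec<&str> = tokenizer.tokens.iter().map(|t| t.token.as_str()).collect();
        assert_eq!(tokens, vec!["c", ":=", "a", "+", "b", ";"]);
    }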
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 0000000..598e8be
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,304 @@
+use std::fs;
+use toml::{Table, Value};
+
+#[derive(PartialEq)]
+pub enum TokenType {
+    OPERAND,
+    TERMINATOR,
+    IDENTIFIER,
+    KEYWORD,
+}
+
+// Tokenizer
+// Tokenizer and underlying functions to turn code into tokens.
+pub struct Tokenizer {
+    pub token_list: Vec<String>,
+    // BUG:
+    pub tokens: Vec<Token>,
+    // Grammar options from the toml file
+    pub configuration: Table,
+}
+
+// Token
+// This is a token with a token type.
+pub struct Token {
+    pub token: String,
+    pub token_type: TokenType,
+}
+
+// Implementation of Tokenizer
+// Functions associated with the tokenizer struct and module.
+impl Tokenizer {
+    // @name read_configuration_from_file
+    // @return
+    // @brief Try to read the configuration from an external file.
+    // @param &mut self, configuration_filename: &str
+    pub fn read_configuration_from_file(&mut self, configuration_filename: &str) {
+        let configuration_string: String = fs::read_to_string(configuration_filename).expect(
+            (String::from("Could not open configuration file at: ") + configuration_filename)
+                .as_str(),
+        );
+
+        let configuration = gtoml::parse(configuration_string.as_str()).expect("TOML invalid!");
+        self.configuration = Table::try_from(configuration).unwrap();
+
+        // Check for token section in config, panic if not present
+        if !self.configuration.contains_key("token") {
+            panic!("Token section is not present!");
+        }
+
+        // Check for semantics section in config, panic if not present
+        if !self.configuration.contains_key("semantics") {
+            panic!("Section semantics is not present!");
+        }
+    }
+
+    // @name new
+    // @return Tokenizer
+    // @brief Create a new Tokenizer.
+    // @param
+    pub fn new() -> Tokenizer {
+        let empty_tokens: Vec<Token> = vec![];
+        let empty_value: toml::map::Map<String, Value> = toml::map::Map::new();
+        let empty_token_list: Vec<String> = vec![];
+        Tokenizer {
+            tokens: empty_tokens,
+            token_list: empty_token_list,
+            configuration: empty_value,
+        }
+    }
+    // @name eat
+    // @return
+    // @brief Consumes a string and saves the tokens.
+    // @param &mut self, line: &str
+    pub fn eat(&mut self, line: &str) {
+        // Get token vectors from configuration
+        let token_table_value: &Value = self.configuration.get("token").unwrap();
+        let token_table: Table = Table::try_from(token_table_value).unwrap();
+        let mut tokens: Vec<String> = vec![line.to_string()];
+        let mut new_tokens: Vec<String> = vec![];
+        let mut token_buffer: String = String::from("");
+
+        // Iterate over tokens in the token table and split tokens.
+        if token_table.contains_key("separator") {
+            let separator: Vec<Value> = token_table
+                .get_key_value("separator")
+                .unwrap()
+                .1
+                .as_array()
+                .unwrap()
+                .clone();
+            if separator.len() > 0 {
+                for token in tokens.iter() {
+                    let mut token_feed = token.clone();
+                    while !token_feed.is_empty() {
+                        let mut no_match: bool = true;
+                        for sep in separator.iter() {
+                            if token_feed.starts_with(sep.as_str().unwrap()) {
+                                // Reset and add token
+                                no_match = false;
+                                if token_buffer.len() > 0 {
+                                    new_tokens.push(token_buffer.clone());
+                                    token_buffer = String::from("");
+                                }
+                                let new_feed: String =
+                                    token_feed.split_off(sep.as_str().unwrap().len());
+                                token_feed = new_feed;
+                            }
+                        }
+                        if no_match {
+                            let new_feed: String = token_feed.split_off(1);
+                            token_buffer = token_buffer
+                                + String::from(token_feed.chars().next().unwrap()).as_str();
+                            token_feed = new_feed;
+                        }
+                    }
+                    // empty token
+                    new_tokens.push(token_buffer.clone());
+                    token_buffer = String::from("");
+                }
+                // empty token
+                new_tokens.push(token_buffer.clone());
+                token_buffer = String::from("");
+            }
+        }
+        tokens = new_tokens.clone();
+        new_tokens = vec![];
+
+        if token_table.contains_key("operands") {
+            let operands: Vec<Value> = token_table
+                .get_key_value("operands")
+                .unwrap()
+                .1
+                .as_array()
+                .unwrap()
+                .clone();
+            if operands.len() > 0 {
+                for token in tokens.iter() {
+                    let mut token_feed = token.clone();
+                    while !token_feed.is_empty() {
+                        let mut no_match: bool = true;
+                        for op in operands.iter() {
+                            if token_feed.starts_with(op.as_str().unwrap()) {
+                                // Reset and add token
+                                no_match = false;
+                                if token_buffer.len() > 0 {
+                                    new_tokens.push(token_buffer.clone());
+                                }
+                                token_buffer = String::from("");
+                                new_tokens.push(op.as_str().unwrap().to_string());
+                                let new_feed: String =
+                                    token_feed.split_off(op.as_str().unwrap().len());
+                                token_feed = new_feed;
+                            }
+                        }
+                        if no_match {
+                            let new_feed: String = token_feed.split_off(1);
+                            token_buffer = token_buffer
+                                + String::from(token_feed.chars().next().unwrap()).as_str();
+                            token_feed = new_feed;
+                        }
+                    }
+                    // empty token
+                    new_tokens.push(token_buffer.clone());
+                    token_buffer = String::from("");
+                }
+                // empty token
+                new_tokens.push(token_buffer.clone());
+                token_buffer = String::from("");
+            }
+        }
+        tokens = new_tokens.clone();
+        new_tokens = vec![];
+
+        if token_table.contains_key("terminator") {
+            let terminator: Vec<Value> = token_table
+                .get_key_value("terminator")
+                .unwrap()
+                .1
+                .as_array()
+                .unwrap()
+                .clone();
+            if terminator.len() > 0 {
+                for token in tokens.iter() {
+                    let mut token_feed = token.clone();
+                    while !token_feed.is_empty() {
+                        let mut no_match: bool = true;
+                        for term in terminator.iter() {
+                            if token_feed.starts_with(term.as_str().unwrap()) {
+                                // Reset and add token
+                                no_match = false;
+                                if token_buffer.len() > 0 {
+                                    new_tokens.push(token_buffer.clone());
+                                }
+                                token_buffer = String::from("");
+                                new_tokens.push(term.as_str().unwrap().to_string());
+                                let new_feed: String =
+                                    token_feed.split_off(term.as_str().unwrap().len());
+                                token_feed = new_feed;
+                            }
+                        }
+                        if no_match {
+                            let new_feed: String = token_feed.split_off(1);
+                            token_buffer = token_buffer
+                                + String::from(token_feed.chars().next().unwrap()).as_str();
+                            token_feed = new_feed;
+                        }
+                    }
+                    // empty token as token ended
+                    new_tokens.push(token_buffer.clone());
+                    token_buffer = String::from("");
+                }
+                // empty token
+                new_tokens.push(token_buffer.clone());
+            }
+        }
+        self.token_list.append(&mut new_tokens);
+    }
+    // @name identify_tokens
+    // @return
+    // @brief Go through all tokens and assign each one a token type.
+    // @param &mut self
+    pub fn identify_tokens(&mut self) {
+        // Go through token list
+        let mut token_identities: Vec<Token> = vec![];
+        let mut found_token: bool;
+        let token_section: Table =
+            Table::try_from(self.configuration.get("token").unwrap()).unwrap();
+        let semantics_section: Table =
+            Table::try_from(self.configuration.get("semantics").unwrap()).unwrap();
+
+        for token in self.token_list.iter() {
+            found_token = false;
+
+            if token.as_str() == "" {
+                continue;
+            }
+
+            // Check if token is an operand
+            if token_section.contains_key("operands") {
+                let operands: Vec<Value> = token_section
+                    .get_key_value("operands")
+                    .unwrap()
+                    .1
+                    .as_array()
+                    .unwrap()
+                    .clone();
+                for operand in operands.iter() {
+                    if operand.as_str().unwrap() == token.as_str() {
+                        token_identities.push(Token {
+                            token: token.clone(),
+                            token_type: TokenType::OPERAND,
+                        });
+                        found_token = true;
+                    }
+                }
+            }
+
+            // Check if token is a terminator
+            if token_section.contains_key("terminator") && !found_token {
+                let terminator: Vec<Value> = token_section
+                    .get_key_value("terminator")
+                    .unwrap()
+                    .1
+                    .as_array()
+                    .unwrap()
+                    .clone();
+                for term in terminator.iter() {
+                    if term.as_str().unwrap() == token.as_str() {
+                        token_identities.push(Token {
+                            token: token.clone(),
+                            token_type: TokenType::TERMINATOR,
+                        });
+                        found_token = true;
+                    }
+                }
+            }
+
+            // Check if token is a keyword
+            if semantics_section.contains_key("keywords") && !found_token {
+                let keywords: Vec<Value> = semantics_section
+                    .get_key_value("keywords")
+                    .unwrap()
+                    .1
+                    .as_array()
+                    .unwrap()
+                    .clone();
+                for keyword in keywords.iter() {
+                    if keyword.as_str().unwrap() == token.as_str() {
+                        token_identities.push(Token {
+                            token: token.clone(),
+                            token_type: TokenType::KEYWORD,
+                        });
+                        found_token = true;
+                    }
+                }
+            }
+
+            // Everything that matched nothing above is treated as an identifier
+            if !found_token {
+                token_identities.push(Token {
+                    token: token.clone(),
+                    token_type: TokenType::IDENTIFIER,
+                });
+            }
+        }
+        self.tokens = token_identities;
+    }
+}
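To make the rule placeholders concrete: for `wrong -> Logic := false;` from example.mlc, the [syntax] definition rule presumably binds #1 = wrong, #2 = Logic and #3 = false, and the [clang] section maps Logic to int. A stand-alone sketch of that substitution with the template from [clang] (assumed semantics; the code in this patch stops at token identification):

// Sketch of the template substitution the [clang] section implies (assumed semantics).
fn main() {
    let template = "#2 #1 () {return (#3);}";
    let bindings = [("#1", "wrong"), ("#2", "int"), ("#3", "false")]; // Logic -> int via [clang]
    let mut output = String::from(template);
    for (placeholder, value) in bindings {
        output = output.replace(placeholder, value);
    }
    assert_eq!(output, "int wrong () {return (false);}");
    println!("{}", output);
}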