Skip to content

Commit 961aed8

Browse files
added readme
1 parent 5c30a7e commit 961aed8

File tree

5 files changed

+262
-5
lines changed

5 files changed

+262
-5
lines changed

EsotericDevZone.RuleBasedParser.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
<DefineConstants>DEBUG;TRACE</DefineConstants>
2222
<ErrorReport>prompt</ErrorReport>
2323
<WarningLevel>4</WarningLevel>
24+
<DocumentationFile>bin\Debug\EsotericDevZone.RuleBasedParser.xml</DocumentationFile>
2425
</PropertyGroup>
2526
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
2627
<DebugType>pdbonly</DebugType>
@@ -29,6 +30,7 @@
2930
<DefineConstants>TRACE</DefineConstants>
3031
<ErrorReport>prompt</ErrorReport>
3132
<WarningLevel>4</WarningLevel>
33+
<DocumentationFile>bin\Release\EsotericDevZone.RuleBasedParser.xml</DocumentationFile>
3234
</PropertyGroup>
3335
<PropertyGroup>
3436
<ApplicationIcon>

ParseRulePatterns/LiteralPatternItem.cs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
using System;
22
using System.Collections.Generic;
3-
using System.Linq;
4-
using System.Text;
5-
using System.Threading.Tasks;
63

74
namespace EsotericDevZone.RuleBasedParser.ParseRulePatterns
85
{

Parser.cs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77

88
namespace EsotericDevZone.RuleBasedParser
99
{
10+
/// <summary>
11+
/// Rule based parser tool
12+
/// </summary>
1013
public class Parser
1114
{
1215
public string RootRuleKey { get; set; }
@@ -27,14 +30,23 @@ public Parser(TokensSplitOptions tokensSplitOptions, CommentStyle commentStyle)
2730
CommentStyle = commentStyle;
2831
}
2932

33+
/// <summary>
34+
/// Builder function to call when no tokens are provided (default: throws NoTokensProvidedException)
35+
/// </summary>
36+
public Func<ParseResult> EmptyBodyResult { get; set; } = () => throw new NoTokensProvidedException("No tokens provided");
37+
38+
/// <summary>
39+
/// Parses an input sequence and returns the evaluated value according to the parsing rules
40+
/// </summary>
41+
/// <exception cref="ParseException">A parse exception is thrown when a syntax error is encountered</exception>
3042
public object Parse(string input)
3143
{
3244
ParseCache.Clear();
3345
var tokens = input.SplitToTokens(TokensSplitOptions, CommentStyle);
3446

3547
if(tokens.Count==0)
3648
{
37-
throw new NoTokensProvidedException("No tokens provided");
49+
return EmptyBodyResult();
3850
}
3951

4052
var result = LookFor(RootRuleKey, tokens, 0);
@@ -54,6 +66,16 @@ public object Parse(string input)
5466
return result.Result.Value;
5567
}
5668

69+
public T Parse<T>(string input)
70+
{
71+
var result = Parse(input);
72+
73+
if (!(result is T))
74+
throw new InvalidCastException();
75+
76+
return (T)result;
77+
}
78+
5779
private ParseRecord LookFor(ParseRule rule, List<Token> tokens, int pos)
5880
{
5981
var pattern = rule.ParsePattern;

Presets/Parsers/ArithmeticsParser.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public ArithmeticsParser(Func<string, AtomResult> numberBuilder) : base()
1616
this.TokensSplitOptions = new TokensSplitOptions(
1717
Lists.Empty<string>(),
1818
Lists.Of(@"\+", @"\-", @"\*", @"\/", @"\(", @"\)")
19-
);
19+
);
2020
base.CommentStyle = CommentStyles.NoCommentsStyle;
2121
NumberBuilder = numberBuilder;
2222
Initialize();
@@ -49,6 +49,8 @@ private void Initialize()
4949
ParseRules.RegisterRule("@T", "NUMBER", ParseResultBuilders.Self);
5050
ParseRules.RegisterRule("@T", "SYMBOL", ParseResultBuilders.Self);
5151
ParseRules.RegisterRule("@T", "( @E )", ParseResultBuilders.Self);
52+
ParseRules.RegisterRule("@T", "+ @T", (int x) => x);
53+
ParseRules.RegisterRule("@T", "- @T", (int x) => -x);
5254

5355
RootRuleKey = "@E";
5456
}

README.md

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,235 @@
11
# EsotericDevZone.RuleBasedParser
2+
3+
This library provides a simple token-oriented text parser featuring a set of user defined rules.
4+
5+
## Parse rules
6+
7+
A parse rule is a structure described by a rule key (the rule's name which starts with an `@` character), a pattern and an object build method.
8+
9+
The rule key represents the type of the object which results after parsing the input. For example:
10+
```
11+
Rule key = "@ADDITION"
12+
Pattern = "@TERM + @TERM" # so expressions of type 1+2, 15+67 etc are considered "@ADDITION"s
13+
Builder = (term1, term2) -> term1+term2
14+
```
15+
16+
The pattern tells the parser how to identify the input tokens. A pattern is a series of individual matchers separated by a single space `" "`.
17+
18+
The types of matchers are:
19+
20+
- **Atoms**
21+
22+
An atom corresponds to a single token from the input interpreted in some specific way. Atom types are user defined. They are recommended to be named with capital letters for better distinction.
23+
24+
For example, the `NUMBER` atom matcher identifies any numeric value such as `7`, `13`, `12.3`, while `STRING` may match values like `"this string"`.
25+
- **Rule keys**
26+
27+
A rule key matcher tells the parser that it should look for an entity which corresponds to that rule key.
28+
29+
For example, in the case of the pattern `TERM = @ADDITION`, the parser expects values like `1+4` to the right of the equals sign.
30+
31+
- **Repeatable Tail (`??`)**
32+
33+
Anything following the repeatable tail matcher (`??`) is optional and may match as many times as it can.
34+
35+
For example, the pattern `@TERM ?? + @TERM` matches sequences of summed numbers of any length, like `1+7`, `5`, `2+3+20` etc.
36+
37+
*The repeatable tail must appear exactly once in a rule pattern!*
38+
39+
- **Literals**
40+
41+
A literal match defines a verbatim token which must appear in the input.
42+
43+
The `+` sign in the pattern `@TERM + @TERM` is an example of literal matcher.
44+
Similarly, the `function` keyword in pattern `function NAME ( )`, as well as the parenthesis `(`, `)` are also literals.
45+
46+
- **Wildcards**
47+
48+
A wildcard is a list of possible literals separated by the `|` delimiter. The parser considers that the input matches a wildcard if the parsed token is
49+
one of the items in the literals list.
50+
51+
For example, to match both addition and subtractions, one can do `@TERM ?? +|- @TERM`. The parser matches expressions like `1+2`, `5-3`, `1+2-5`.
52+
53+
**No spaces between the literals and delimiter!** Pattern `+ |-` is completely different from `+|-` and is a potential error source. The pattern `+ |-` describes a "rule" that matches the `+` sign followed by a wildcard consisting of a ghost token (the "one" before the delimiter `|`) or the `-` sign.
54+
55+
## Tokens
56+
57+
A token is an indivisible substring of the input. It can be imagined as a single word in a paragraph. By default, tokens are isolated from each other by any kind of whitespace.
58+
In some contexts, that would be inconvenient, as the input `text that "contains strings"` is separated into 4 tokens (`text`, `that`, `"contains`, `strings"`) while the user might expect 3. Moreover, inputs like `1+2+3` would generate a single token while the human mind would usually identify 5 of them. There is a solution to overcome this problem.
59+
60+
There are `TokensSplitOptions` to tell in which situations tokens must not be split (therefore preserving spaces between them), or must be split, even if there is no explicit whitespace between them.
61+
62+
The `TokensSplitOptions` contains two sets of rules:
63+
64+
- **Split Breaking Rules**
65+
66+
A split breaking rule consists of two characters, meaning that anything found between them represents a single token. For example, the split breaking rule (`""`) identifies `"this token"`, `"and this \" escaped one \""`, `"and even 'this'"`, but not `'not`, ..., `this_one'`. Similarly, the rule (`{}`) isolates `{anything between brackets}`. When the two characters are identical, one can be skipped, such that the rule (`""`) is equivalent to (`"`).
67+
68+
- **Atoms**
69+
70+
An atom (_in tokens terms_) is a regex that describes a single atom and isolates it from the adjacent context. For example, the atom rule (`\+`) isolates every _plus_ sign from the input `1+2+3`, leading to 5 tokens: `1`, `+`, `2`, `+`, `3`.
71+
72+
**The atom rules are Regular Expressions, please be aware of the regex escaping rules when needed (A rule that isolates the `=` sign token must be written as `\=`)!**
73+
74+
Some more complex rules can be defined, like `\d\.\d`, which matches any of these numbers (`1.2`, `3.5` etc.) *even when the atom rule `\.` has been defined*.
75+
76+
## Comments
77+
78+
If portions of the input should be ignored, the user can define a `CommentStyle`, which is a way to identify and remove inline and block comments from the original input.
79+
80+
- **Inline comment rules** are described by a single string marking the start of the comment, like the slashes (`//`) in C/C++. An inline comment starts from the specified marker and keeps going until the end of the line.
81+
- **Block comment rules** are described by two string markers, one for the beginning (e.g. `/*`) and one for the end (e.g. `*/`) of the comment.
82+
83+
CommentStyles are optional.
84+
85+
## Creating a Parser (Demo)
86+
87+
Let's see how to create a parser that evaluates integer arithmetic expressions.
88+
89+
First, we need to define our tokens. A token is a continuous sequence of digits, or an operator (let's say the four basic operations `+ - * /`, or a parenthesis `()`).
90+
We don't have split breaking rules, as no strings are involved.
91+
92+
Therefore, we define our token split options:
93+
```C#
94+
using EsotericDevZone.Core.Collections;
95+
//...
96+
var tokensSplitOptions = new TokensSplitOptions(
97+
Lists.Empty<string>(), // no split breaking rules
98+
Lists.Of(@"\+", @"\-", @"\*", @"\/", @"\(", @"\)") // atom rules
99+
);
100+
```
101+
102+
Let's create our parser:
103+
104+
```C#
105+
106+
var parser = new Parser();
107+
parser.TokensSplitOptions = tokensSplitOptions;
108+
parser.CommentStyle = CommentStyles.NoCommentsStyle; // No comment :))
109+
```
110+
111+
We need to tell the parser how to evaluate numbers. For that, we need to define a parse atom:
112+
113+
```C#
114+
parser.RegisterAtom("NUMBER", AtomBuilders.Integer);
115+
```
116+
117+
Or, if you're feeling like you want to do it yourself:
118+
119+
```C#
120+
parser.RegisterAtom("NUMBER", (token)
121+
=> int.TryParse(token, out int value)
122+
? AtomResult.Atom(value)
123+
: AtomResult.Error("Input is not an integer")
124+
);
125+
```
126+
127+
Now let's see how to create the parse rules. We need to think for a while and consider the following situations:
128+
129+
- `12` - single number
130+
- `6+3` - simple addition
131+
- `1+2+3+4` - multiple addition
132+
- `3+4-2` - combined addition/subtraction
133+
- `5*6` - multiplication
134+
- `1+5*6-3+8/3` - combined operations
135+
- `6+(4+5)*3` - parenthesis
136+
137+
From these examples, we can extract a couple of helpful ideas:
138+
139+
1. A single number is a valid expression
140+
1. A series of additions/subtractions is a valid expression
141+
1. A series of multiplications/divisions is a valid expression
142+
1. In a combined operations input, multiplication and division come before addition/subtraction
143+
1. Anything between parenthesis must be solved first
144+
145+
These hints point to the following rule system:
146+
147+
```
148+
key: @EXPR pattern: @ADD
149+
key: @ADD pattern: @MUL ?? +|- @MUL
150+
key: @MUL pattern: @TERM ?? *|/ @TERM
151+
key: @TERM pattern: NUMBER
152+
key: @TERM pattern: ( @EXPR )
153+
```
154+
155+
Let's see how we translate them into code:
156+
157+
```C#
158+
// Make the @EXPR evaluate as the result of its contained pattern rule @ADD :
159+
parser.ParseRules.RegisterRule("@EXPR", "@ADD", ParseResultBuilders.Self);
160+
161+
// Write the @ADD rule and how to evaluate it
162+
parser.ParseRules.RegisterRule("@ADD", "@MUL ?? +|- @MUL", ParseResultBuilders.LeftAssociate((a, b, sign) =>
163+
{
164+
// LeftAssociate(oper) takes a list of N parsed (@MUL) entities and N-1 tokens/operators
165+
// and sequentially calls result = oper(result, value(i), token(i-1))
166+
// consuming each of the tokens from left to right
167+
if (sign.Value == "+")
168+
return new ParseResult(sign, (int)a.Value + (int)b.Value);
169+
else // if (sign.Value == "-")
170+
return new ParseResult(sign, (int)a.Value - (int)b.Value);
171+
}));
172+
173+
// Do the same for @MUL rule
174+
parser.ParseRules.RegisterRule("@MUL", "@TERM ?? *|/ @TERM", ParseResultBuilders.LeftAssociate((a, b, sign) =>
175+
{
176+
if (sign.Value == "*")
177+
return new ParseResult(sign, (int)a.Value * (int)b.Value);
178+
else // if (sign.Value == "/")
179+
return new ParseResult(sign, (int)a.Value / (int)b.Value);
180+
}));
181+
182+
// Define @TERM rulekey. Multiple rule definitions with the same key are allowed.
183+
// If one rule fails to parse, the others are tried in the order they have been declared
184+
parser.ParseRules.RegisterRule("@TERM", "NUMBER", ParseResultBuilders.Self);
185+
// Self builder works even when there are also literals around the target pattern :
186+
parser.ParseRules.RegisterRule("@TERM", "( @EXPR )", ParseResultBuilders.Self);
187+
```
188+
189+
Finally, we have to tell the parser what the input is as a whole (what it should look for when it parses the input).
190+
We do that by specifying something that's called the `RootRuleKey` - the "global" parse rule pattern:
191+
192+
```C#
193+
parser.RootRuleKey = "@EXPR"; // we are generally looking for "expressions"
194+
```
195+
196+
Now we are ready to go:
197+
198+
```C#
199+
int result1 = parser.Parse<int>("(1+2)*4"); // output: 12
200+
int result2 = parser.Parse<int>("3 - 2-1"); // output: 0
201+
```
202+
203+
Or, make it interactive:
204+
205+
```C#
206+
while (true)
207+
{
208+
try
209+
{
210+
Console.Write(">> ");
211+
string input = Console.ReadLine();
212+
Console.WriteLine(parser.Parse(input));
213+
}
214+
catch(Exception e)
215+
{
216+
Console.WriteLine(e.Message);
217+
}
218+
}
219+
220+
/* Output
221+
---------------------------------------------
222+
>> 1+2
223+
3
224+
>> 165*887
225+
146355
226+
>> 5+6*(3-5*(7-3))+4
227+
-93
228+
>> 1+
229+
1:2 Parse error: Insuficient tokens
230+
>> 45 6
231+
1:4 Parse error: Insuficient tokens
232+
>> 10
233+
10
234+
---------------------------------------------*/
235+
```

0 commit comments

Comments
 (0)