Skip to content

Commit 57f4551

Browse files
karlkallmanswmalswmal
authored
Feature/regexfunctions (#2373)
* WIP * WIP * WIP * WIP * WIP * Fix REGEXEXTRACT/REGEXREPLACE scalar bugs: invalid return_mode validation, no-match now returns #N/A, empty pattern inserts replacement. Split regex tests into one file per function, verified against Excel desktop. * Refactored unit tests --------- Co-authored-by: swmal <897655+swmal@users.noreply.github.com> Co-authored-by: swmal <{ID}+username}@users.noreply.github.com>
1 parent c04fb81 commit 57f4551

9 files changed

Lines changed: 2010 additions & 0 deletions

File tree

src/EPPlus/FormulaParsing/Excel/Functions/BuiltInFunctions.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ public BuiltInFunctions()
6868
Functions["usdollar"] = new UsDollar();
6969
Functions["encodeurl"] = new EncodeUrl();
7070
Functions["code"] = new CodeFunction();
71+
Functions["regextest"] = new RegexTest();
72+
Functions["regexextract"] = new RegexExtract();
73+
Functions["regexreplace"] = new RegexReplace();
7174
Functions["textsplit"] = new TextSplit();
7275
Functions["textbefore"] = new TextBefore(DelimiterFunction.TextBefore);
7376
Functions["textafter"] = new TextAfter(DelimiterFunction.TextAfter);
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
using OfficeOpenXml.FormulaParsing.Excel.Functions.MathFunctions;
2+
using OfficeOpenXml.FormulaParsing.Excel.Functions.Metadata;
3+
using OfficeOpenXml.FormulaParsing.Excel.Functions.RefAndLookup;
4+
using OfficeOpenXml.FormulaParsing.FormulaExpressions;
5+
using OfficeOpenXml.FormulaParsing.Ranges;
6+
using OfficeOpenXml.RichData.IndexRelations;
7+
using System;
8+
using System.Collections.Generic;
9+
using System.Linq;
10+
using System.Runtime.CompilerServices;
11+
using System.Text;
12+
using System.Text.RegularExpressions;
13+
14+
namespace OfficeOpenXml.FormulaParsing.Excel.Functions.Text
15+
{
16+
/// <summary>
/// Implements the REGEXEXTRACT worksheet function: extracts text matching a regular
/// expression pattern from one or more input strings.
/// </summary>
/// <remarks>
/// Arguments, as read by <see cref="Execute"/>:
/// 0 = text (scalar or range), 1 = pattern (scalar or range),
/// 2 = return mode (optional, default 0), 3 = case sensitivity (optional, default 0).
/// Return mode 0 yields the first match, 1 yields all matches, 2 yields the capturing
/// groups of the first match. Case sensitivity is cast directly to
/// <see cref="RegexOptions"/>, so 0 maps to <see cref="RegexOptions.None"/> and 1 to
/// <see cref="RegexOptions.IgnoreCase"/>.
/// </remarks>
[FunctionMetadata(
    Category = ExcelFunctionCategory.Text,
    EPPlusVersion = "8.6",
    Description = "Extracts text matching a regular expression pattern from input string values.",
    SupportsArrays = true)]
internal class RegexExtract : RegexFunctionBase
{
    /// <summary>Text and pattern are mandatory; return mode and case sensitivity are optional.</summary>
    public override int ArgumentMinLength => 2;

    /// <summary>Prefix used for this function name.</summary>
    public override string NamespacePrefix => "_xlfn.";

    /// <summary>
    /// Evaluates the function. When neither text nor pattern is a range a single result
    /// (scalar, or a one-row array for return modes 1 and 2) is produced; otherwise the
    /// two arguments are expanded against each other and one value is produced per cell.
    /// </summary>
    /// <param name="arguments">The function arguments (see the class remarks for positions).</param>
    /// <param name="context">The parsing context; not used by this implementation.</param>
    /// <returns>
    /// The extracted value(s), or an error result: #N/A for missing input or no match,
    /// #VALUE! for out-of-range options, a pattern without capturing groups in mode 2,
    /// or an invalid regular expression.
    /// </returns>
    public override CompileResult Execute(IList<FunctionArgument> arguments, ParsingContext context)
    {
        bool textIsRange = arguments[0].IsExcelRange;
        bool patternIsRange = arguments[1].IsExcelRange;
        int returnMode = arguments.Count > 2 ? ArgToInt(arguments, 2, 0) : 0;
        int caseSensitivity = arguments.Count > 3 ? ArgToInt(arguments, 3, 0) : 0;

        // The option arguments are the same for every cell, so validate them once.
        bool optionsInvalid = caseSensitivity > 1 || caseSensitivity < 0 || returnMode < 0 || returnMode > 2;

        if (!textIsRange && !patternIsRange)
        {
            return ExecuteScalar(
                arguments[0].Value?.ToString(),
                arguments[1].Value?.ToString(),
                returnMode,
                caseSensitivity,
                optionsInvalid);
        }

        var texts = textIsRange ? arguments[0].ValueAsRangeInfo : null;
        var patterns = patternIsRange ? arguments[1].ValueAsRangeInfo : null;

        int textRows = texts != null ? texts.Size.NumberOfRows : 1;
        int textCols = texts != null ? texts.Size.NumberOfCols : 1;
        int patternRows = patterns != null ? patterns.Size.NumberOfRows : 1;
        int patternCols = patterns != null ? patterns.Size.NumberOfCols : 1;

        var nRows = ExpandedSize(textRows, patternRows);
        var nCols = ExpandedSize(textCols, patternCols);

        var result = new InMemoryRange(nRows, nCols);

        for (int row = 0; row < nRows; row++)
        {
            for (int col = 0; col < nCols; col++)
            {
                var textValue = GetValue(texts, arguments[0], textRows, textCols, row, col);
                var patternValue = GetValue(patterns, arguments[1], patternRows, patternCols, row, col);

                result.SetValue(row, col, EvaluateCell(textValue, patternValue, returnMode, caseSensitivity, optionsInvalid));
            }
        }

        return CreateDynamicArrayResult(result, DataType.ExcelRange);
    }

    /// <summary>
    /// Evaluates the function for a single text/pattern pair (no range arguments).
    /// </summary>
    /// <remarks>
    /// An invalid pattern is reported as #VALUE!, matching what the range branch does
    /// for a single bad cell, instead of letting the <see cref="ArgumentException"/>
    /// thrown by the regex engine escape from <see cref="Execute"/>.
    /// </remarks>
    private CompileResult ExecuteScalar(string text, string pattern, int returnMode, int caseSensitivity, bool optionsInvalid)
    {
        // Missing input is checked before option validation, so it wins with #N/A.
        if (text == null || pattern == null)
        {
            return ErrorResult(eErrorType.NA);
        }
        if (optionsInvalid)
        {
            return ErrorResult(eErrorType.Value);
        }

        string[] rowValues;
        // Only the regex work is guarded, so that an ArgumentException raised while
        // building the result array is not mistaken for an invalid pattern.
        try
        {
            if (returnMode == 1)
            {
                rowValues = GetMatches(text, pattern, caseSensitivity);
                if (rowValues.Length == 0)
                {
                    return ErrorResult(eErrorType.NA);
                }
            }
            else if (returnMode == 2)
            {
                // Read the number of capturing groups from the pattern (GetGroupNumbers
                // includes group 0). A failed match reports Groups.Count == 1, so this must
                // not be read from the match. No groups -> #VALUE!; groups but no match -> #N/A.
                var regex = new Regex(pattern, (RegexOptions)caseSensitivity);
                if (regex.GetGroupNumbers().Length <= 1)
                {
                    return ErrorResult(eErrorType.Value);
                }

                var match = regex.Match(text);
                if (!match.Success)
                {
                    return ErrorResult(eErrorType.NA);
                }

                rowValues = match.Groups
                    .Cast<Group>()
                    .Skip(1)
                    .Select(g => g.Value)
                    .ToArray();
            }
            else
            {
                var firstMatch = Regex.Match(text, pattern, (RegexOptions)caseSensitivity);
                if (!firstMatch.Success)
                {
                    return ErrorResult(eErrorType.NA);
                }
                return CreateResult(firstMatch.Value, DataType.String);
            }
        }
        catch (ArgumentException)
        {
            // Invalid regex pattern -> #VALUE!.
            return ErrorResult(eErrorType.Value);
        }

        return CreateRowResult(rowValues);
    }

    /// <summary>
    /// Computes the value for one cell of a range evaluation. Returns either the
    /// extracted string or an <see cref="ExcelErrorValue"/>.
    /// </summary>
    /// <remarks>
    /// Invalid-pattern exceptions are caught per cell so that a single bad cell becomes
    /// #VALUE! in place while the other cells are still calculated (the original code
    /// notes this was verified against Excel).
    /// </remarks>
    private object EvaluateCell(string text, string pattern, int returnMode, int caseSensitivity, bool optionsInvalid)
    {
        if (text == null || pattern == null)
        {
            return ExcelErrorValue.Create(eErrorType.NA);
        }
        // Negative or too large option values give #VALUE! for every cell.
        if (optionsInvalid)
        {
            return ExcelErrorValue.Create(eErrorType.Value);
        }

        try
        {
            var options = (RegexOptions)caseSensitivity;
            if (returnMode == 2)
            {
                // A failed match reports Groups.Count == 1, so the number of capturing
                // groups must be read from the pattern itself (via GetGroupNumbers, which
                // includes group 0) rather than from the match.
                // No groups -> #VALUE!; groups but no match -> #N/A.
                var regex = new Regex(pattern, options);
                if (regex.GetGroupNumbers().Length <= 1)
                {
                    return ExcelErrorValue.Create(eErrorType.Value);
                }

                var groupMatch = regex.Match(text);
                if (!groupMatch.Success)
                {
                    return ExcelErrorValue.Create(eErrorType.NA);
                }

                // In range mode only the first group is returned per cell.
                return groupMatch.Groups[1].Value;
            }

            if (returnMode == 1)
            {
                var matches = GetMatches(text, pattern, caseSensitivity);
                if (matches.Length == 0)
                {
                    return ExcelErrorValue.Create(eErrorType.NA);
                }

                // In range mode only the first match is returned per cell.
                return matches[0];
            }

            var match = Regex.Match(text, pattern, options);
            if (!match.Success)
            {
                return ExcelErrorValue.Create(eErrorType.NA);
            }
            return match.Value;
        }
        catch (ArgumentException)
        {
            // Invalid regex pattern in this cell -> #VALUE! for this cell only.
            return ExcelErrorValue.Create(eErrorType.Value);
        }
    }

    /// <summary>Wraps the supplied values in a single-row dynamic array result.</summary>
    private CompileResult CreateRowResult(string[] values)
    {
        var arr = new InMemoryRange((short)1, (short)values.Length);
        for (int i = 0; i < values.Length; i++)
        {
            arr.SetValue(0, i, values[i]);
        }

        return CreateDynamicArrayResult(arr, DataType.ExcelRange);
    }

    /// <summary>Creates an error result of the given type.</summary>
    private CompileResult ErrorResult(eErrorType errorType)
    {
        return CreateResult(ExcelErrorValue.Create(errorType), DataType.ExcelError);
    }

    /// <summary>
    /// Returns the text of every match of <paramref name="pattern"/> in
    /// <paramref name="text"/>, in match order.
    /// </summary>
    /// <param name="text">The input string to search.</param>
    /// <param name="pattern">The regular expression pattern.</param>
    /// <param name="caseSensitive">Value cast directly to <see cref="RegexOptions"/>.</param>
    private string[] GetMatches(string text, string pattern, int caseSensitive)
    {
        return Regex.Matches(text, pattern, (RegexOptions)caseSensitive)
            .Cast<System.Text.RegularExpressions.Match>()
            .Select(m => m.Value)
            .ToArray();
    }
}
197+
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
6+
namespace OfficeOpenXml.FormulaParsing.Excel.Functions.Text
7+
{
8+
/// <summary>
/// Shared helpers for the regex worksheet functions: reading a value from an argument
/// that may be a scalar or a range, and computing the size of the combined result.
/// </summary>
internal abstract class RegexFunctionBase : ExcelFunction
{
    /// <summary>
    /// Reads the string value to use at position (<paramref name="row"/>,
    /// <paramref name="col"/>) of the result.
    /// </summary>
    /// <param name="range">The range to read from, or null when the argument is a scalar.</param>
    /// <param name="scalar">The argument whose value is used when <paramref name="range"/> is null.</param>
    /// <param name="nRows">Number of rows in <paramref name="range"/>.</param>
    /// <param name="nCols">Number of columns in <paramref name="range"/>.</param>
    /// <param name="row">Zero-based row in the result.</param>
    /// <param name="col">Zero-based column in the result.</param>
    /// <returns>
    /// The value converted with ToString(), or null when the value is null or the
    /// requested position lies outside the range.
    /// </returns>
    protected static string GetValue(
        IRangeInfo range,
        FunctionArgument scalar,
        int nRows, int nCols,
        int row, int col)
    {
        if (range != null)
        {
            // A dimension of size 1 is repeated along that axis, so always read index 0.
            int rowOffset = nRows == 1 ? 0 : row;
            int colOffset = nCols == 1 ? 0 : col;

            bool insideRange = rowOffset < nRows && colOffset < nCols;
            if (!insideRange)
            {
                return null;
            }

            var cellValue = range.GetOffset(rowOffset, colOffset);
            return cellValue == null ? null : cellValue.ToString();
        }

        var scalarValue = scalar.Value;
        return scalarValue == null ? null : scalarValue.ToString();
    }

    /// <summary>
    /// Combines the sizes of two arguments along one axis: a size of 1 gives way to the
    /// other size, otherwise the larger of the two is used.
    /// </summary>
    /// <param name="a">Size of the first argument along the axis.</param>
    /// <param name="b">Size of the second argument along the axis.</param>
    /// <returns>The combined size, narrowed to <see cref="short"/>.</returns>
    protected static short ExpandedSize(int a, int b)
    {
        int combined;
        if (a == 1)
        {
            combined = b;
        }
        else if (b == 1)
        {
            combined = a;
        }
        else
        {
            combined = a > b ? a : b;
        }

        // NOTE(review): the narrowing cast does not range-check; sizes above
        // short.MaxValue would not survive it - confirm callers never pass such sizes.
        return (short)combined;
    }
}
35+
}

0 commit comments

Comments
 (0)