Skip to content

Commit 68e8771

Browse files
committed
* Added extension methods that use OpenAI GPT4 for repairing HTML text.
1 parent ec20bf6 commit 68e8771

5 files changed

Lines changed: 171 additions & 2 deletions

File tree

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
using System;
2+
3+
using OnixData.Standard.Legacy;
4+
using OnixData.Standard.Services;
5+
using OnixData.Standard.Version3.Text;
6+
7+
namespace OnixData.Standard.Extensions
8+
{
9+
/// <summary>
/// Extension methods that send the Text of an ONIX text element to
/// <see cref="OnixTextRepairChatService"/> and store the repaired result back on the element.
/// </summary>
public static class OnixRepairTextExtensions
{
    /// <summary>
    /// Repairs any malformed HTML in the Text field of an OnixLegacyOtherText object.
    /// </summary>
    /// <param name="legacyOtherText">The element whose Text is repaired in place.</param>
    /// <param name="chatServiceSettings">Settings used to construct the repair service.</param>
    /// <returns>
    /// The request duration reported by the repair service, or a zero TimeSpan when Text is
    /// null or empty (in which case no service is created and nothing is modified).
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="legacyOtherText"/> is null, or <paramref name="chatServiceSettings"/> is null
    /// while there is text to repair.
    /// </exception>
    public static TimeSpan RepairHtmlText(this OnixLegacyOtherText legacyOtherText, OnixChatServiceSettings chatServiceSettings)
    {
        if (legacyOtherText == null)
        {
            throw new ArgumentNullException(nameof(legacyOtherText));
        }

        // Nothing to repair: leave the element untouched and report no elapsed time.
        if (String.IsNullOrEmpty(legacyOtherText.Text))
        {
            return TimeSpan.Zero;
        }

        TimeSpan elapsed = TimeSpan.Zero;
        OnixTextRepairChatService repairService = CreateRepairService(chatServiceSettings);

        legacyOtherText.Text = repairService.RetrieveRepairedCommHtml(legacyOtherText.Text, ref elapsed);

        return elapsed;
    }

    /// <summary>
    /// Repairs any malformed HTML in the Text field of an OnixTextContent object.
    /// </summary>
    /// <param name="onixTextContent">The element whose Text is repaired in place.</param>
    /// <param name="chatServiceSettings">Settings used to construct the repair service.</param>
    /// <returns>
    /// The request duration reported by the repair service, or a zero TimeSpan when Text is
    /// null or empty (in which case no service is created and nothing is modified).
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="onixTextContent"/> is null, or <paramref name="chatServiceSettings"/> is null
    /// while there is text to repair.
    /// </exception>
    public static TimeSpan RepairHtmlText(this OnixTextContent onixTextContent, OnixChatServiceSettings chatServiceSettings)
    {
        if (onixTextContent == null)
        {
            throw new ArgumentNullException(nameof(onixTextContent));
        }

        // Nothing to repair: leave the element untouched and report no elapsed time.
        if (String.IsNullOrEmpty(onixTextContent.Text))
        {
            return TimeSpan.Zero;
        }

        TimeSpan elapsed = TimeSpan.Zero;
        OnixTextRepairChatService repairService = CreateRepairService(chatServiceSettings);

        onixTextContent.Text = repairService.RetrieveRepairedCommHtml(onixTextContent.Text, ref elapsed);

        return elapsed;
    }

    // Builds the repair service, rejecting null settings with a descriptive exception
    // instead of letting the service constructor dereference null.
    private static OnixTextRepairChatService CreateRepairService(OnixChatServiceSettings chatServiceSettings)
    {
        if (chatServiceSettings == null)
        {
            throw new ArgumentNullException(nameof(chatServiceSettings));
        }

        return new OnixTextRepairChatService(chatServiceSettings);
    }
}
49+
}

OnixData.Standard/OnixData.Standard.csproj

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<Title>ONIX-Data</Title>
88
<Authors>jaerith</Authors>
99
<Description>This C# library serves to provide .NET data structures (and an accompanying set of helpful parsers) for the ONIX XML format, which is the international standard for representing the electronic data regarding books (along with other media). This format has been established by the international book trade body known as EDITEUR. Within this solution, you will find two collections of classes for serialization/deserialization: one that represents the legacy format (i.e., 2.1 and earlier) and another that represents the current format (i.e., 3.0). In addition, two helpful parser classes have been included in order to assist with the population of those collections.</Description>
10-
<Copyright>Copyright © 2021</Copyright>
10+
<Copyright>Copyright © 2025</Copyright>
1111
<PackageProjectUrl>https://github.com/jaerith/ONIX-Data</PackageProjectUrl>
1212
<PackageLicenseUrl>https://github.com/jaerith/ONIX-Data/blob/master/LICENSE</PackageLicenseUrl>
1313
<RepositoryUrl>https://github.com/jaerith/ONIX-Data</RepositoryUrl>
@@ -17,4 +17,8 @@
1717
<LangVersion>default</LangVersion>
1818
</PropertyGroup>
1919

20+
<ItemGroup>
21+
<PackageReference Include="OpenAI" Version="2.5.0" />
22+
</ItemGroup>
23+
2024
</Project>
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
namespace OnixData.Standard.Services
2+
{
3+
/// <summary>
/// Configuration bag handed to OnixTextRepairChatService.
/// Every property is a plain read/write value; the defaults are set by the initializers below.
/// </summary>
public class OnixChatServiceSettings
{
    /// <summary>API key used to build the credential for the chat client. No default.</summary>
    public string ApiKey { get; set; }

    /// <summary>Name of the chat model passed to the chat client.</summary>
    public string Model { get; set; } = "gpt-4o-mini";

    /// <summary>
    /// Optional replacement for the built-in repair prompt. When non-empty, the repair service
    /// uses it as a String.Format template whose {0} placeholder receives the text to repair.
    /// </summary>
    public string Prompt { get; set; }

    /// <summary>Suffix value; defaults to null.</summary>
    public string Suffix { get; set; }

    /// <summary>Temperature value; defaults to 0.3.</summary>
    public decimal Temperature { get; set; } = 0.3M;

    /// <summary>Maximum token count; defaults to 2000.</summary>
    public int MaxTokens { get; set; } = 2000;

    /// <summary>Top-p value; defaults to 1.0.</summary>
    public decimal TopP { get; set; } = 1.0m;

    /// <summary>Stop marker; defaults to "[END]".</summary>
    public string Stop { get; set; } = "[END]";

    /// <summary>Network timeout, in seconds, applied to the default chat client; defaults to 180.</summary>
    public int TimeoutInSeconds { get; set; } = 180;
}
27+
}
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
using System;
2+
3+
using OpenAI.Chat;
4+
5+
namespace OnixData.Standard.Services
6+
{
7+
/// <summary>
/// Sends text to an OpenAI chat model with a "repair this HTML" prompt and returns the
/// model's answer with any Markdown code fence removed.
/// </summary>
/// <remarks>
/// NOTE(review): only ApiKey, Model, Prompt and TimeoutInSeconds are read from the settings
/// here; Temperature, MaxTokens, TopP, Stop and Suffix are not passed to the chat client.
/// Confirm whether that is intended before wiring them in.
/// </remarks>
public class OnixTextRepairChatService
{
    /// <summary>Input length (in characters) at or above which the long-timeout client is used.</summary>
    public const int CONST_LARGE_REQUEST_SIZE_THRESHOLD = 25000;

    /// <summary>Error message prefix. Not referenced within this class.</summary>
    public const string RETURN_ERROR_MSG_PREFIX = @"ERROR! ";

    // The two prompt templates are regular (non-verbatim) literals on purpose: in a verbatim
    // string "\n" is a backslash followed by 'n', which would be sent to the model literally
    // instead of separating the instruction from the payload with a newline.

    /// <summary>Prompt template for XML repair. Not referenced within this class.</summary>
    public const string CONST_REQUEST_TAG_FIX_MESSAGE_FORMAT =
        "Repair the following XML:\n{0}";

    /// <summary>Default prompt template for HTML repair; {0} receives the text to repair.</summary>
    public const string CONST_REQUEST_HTML_FIX_MESSAGE_FORMAT =
        "Repair the following HTML so that it is valid and all tags are paired, without any reordering or omitting text and without converting the text into a HTML document that starts with <!DOCTYPE html> and without adding more text than necessary:\n{0}";

    /// <summary>Leading padding stripped from the final result.</summary>
    public const string CONST_GPT4_RESPONSE_PADDED_START = "\n";

    /// <summary>Opening marker of a fenced HTML block in the model's reply.</summary>
    public const string CONST_GPT4_RESPONSE_HTML_START = "```html";

    /// <summary>Closing marker of a fenced block in the model's reply.</summary>
    public const string CONST_GPT4_RESPONSE_HTML_END = "```";

    private readonly OnixChatServiceSettings _settings;

    // Two clients that differ only in network timeout: the configured one for ordinary
    // requests and a fixed ten-minute one for inputs at or above the size threshold.
    private readonly ChatClient _defaultOpenAIChatClient;
    private readonly ChatClient _largeRequestOpenAIChatClient;

    /// <summary>
    /// Creates the service and its two chat clients from the supplied settings.
    /// </summary>
    /// <param name="settings">Provides ApiKey, Model, TimeoutInSeconds and the optional Prompt.</param>
    /// <exception cref="ArgumentNullException"><paramref name="settings"/> is null.</exception>
    public OnixTextRepairChatService(OnixChatServiceSettings settings)
    {
        if (settings == null)
        {
            throw new ArgumentNullException(nameof(settings));
        }

        _settings = settings;

        var keyCreds = new System.ClientModel.ApiKeyCredential(settings.ApiKey);

        var defaultOptions =
            new OpenAI.OpenAIClientOptions() { NetworkTimeout = new TimeSpan(0, 0, settings.TimeoutInSeconds) };

        _defaultOpenAIChatClient =
            new ChatClient(settings.Model, keyCreds, defaultOptions);

        var largeRequestOptions =
            new OpenAI.OpenAIClientOptions() { NetworkTimeout = new TimeSpan(0, 10, 0) };

        _largeRequestOpenAIChatClient =
            new ChatClient(settings.Model, keyCreds, largeRequestOptions);
    }

    /// <summary>
    /// Asks the chat model to repair the supplied HTML and returns the repaired text.
    /// </summary>
    /// <param name="commTextTag">The text to repair.</param>
    /// <param name="requestLength">
    /// Receives the time spent in the chat request (zero when no request is made).
    /// It is assigned even when the request throws.
    /// </param>
    /// <returns>
    /// The model's reply, reduced to the contents of its ```html fence when a complete fence is
    /// present, with one leading newline removed. The original text is returned instead when it
    /// is null or empty (no request is sent) or when the reply is missing or blank.
    /// </returns>
    public string RetrieveRepairedCommHtml(string commTextTag, ref TimeSpan requestLength)
    {
        // Nothing to repair: skip the request rather than dereferencing null or
        // sending an empty payload.
        if (String.IsNullOrEmpty(commTextTag))
        {
            requestLength = TimeSpan.Zero;
            return commTextTag;
        }

        string promptFormat =
            String.IsNullOrEmpty(_settings.Prompt) ? CONST_REQUEST_HTML_FIX_MESSAGE_FORMAT : _settings.Prompt;

        string promptRequest = String.Format(promptFormat, commTextTag);

        ChatClient chatClient =
            commTextTag.Length < CONST_LARGE_REQUEST_SIZE_THRESHOLD ?
                _defaultOpenAIChatClient :
                _largeRequestOpenAIChatClient;

        // Stopwatch is monotonic; DateTime.Now differences can be skewed by clock or DST changes.
        System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

        ChatCompletion completion;
        try
        {
            completion = chatClient.CompleteChat(promptRequest);
        }
        finally
        {
            stopwatch.Stop();
            requestLength = stopwatch.Elapsed;
        }

        string repairedCommTextTag = commTextTag;

        if ((completion != null) && (completion.Content.Count > 0))
        {
            string extracted = ExtractHtmlFromResponse(completion.Content[0].Text);

            // A missing or blank reply must not wipe out the caller's original text.
            if (!String.IsNullOrWhiteSpace(extracted))
            {
                repairedCommTextTag = extracted;
            }
        }

        if (repairedCommTextTag.StartsWith(CONST_GPT4_RESPONSE_PADDED_START, StringComparison.Ordinal))
        {
            repairedCommTextTag = repairedCommTextTag.Remove(0, CONST_GPT4_RESPONSE_PADDED_START.Length);
        }

        return repairedCommTextTag;
    }

    // Returns the trimmed contents of the first complete ```html ... ``` fence in the reply,
    // or the reply unchanged when there is no such fence (including an unterminated one).
    private static string ExtractHtmlFromResponse(string responseText)
    {
        if (String.IsNullOrEmpty(responseText))
        {
            return responseText;
        }

        // Ordinal search: the fence markers are protocol tokens, not linguistic text.
        int fenceStart = responseText.IndexOf(CONST_GPT4_RESPONSE_HTML_START, StringComparison.Ordinal);
        if (fenceStart < 0)
        {
            return responseText;
        }

        int htmlStart = fenceStart + CONST_GPT4_RESPONSE_HTML_START.Length;
        int htmlEnd = responseText.IndexOf(CONST_GPT4_RESPONSE_HTML_END, htmlStart, StringComparison.Ordinal);

        if (htmlEnd > htmlStart)
        {
            return responseText.Substring(htmlStart, htmlEnd - htmlStart).Trim();
        }

        return responseText;
    }
}
89+
}

OnixData/Extensions/OnixRepairTextExtensions.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public static TimeSpan RepairHtmlText(this OnixLegacyOtherText legacyOtherText,
2929

3030
/// <summary>
3131
///
32-
/// Repairs any malformed HTML in the Text field of an OnixLegacyOtherText object
32+
/// Repairs any malformed HTML in the Text field of an OnixTextContent object
3333
///
3434
/// </summary>
3535
public static TimeSpan RepairHtmlText(this OnixTextContent onixTextContent, OnixChatServiceSettings chatServiceSettings)

0 commit comments

Comments
 (0)