Skip to content

Commit 4a45b5d

Browse files
committed
Improved markdown conversion and added ability to extract notes
1 parent 4dbd92a commit 4a45b5d

2 files changed

Lines changed: 28 additions & 51 deletions

File tree

src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,15 @@ public function getJSONSchema(): array
4848
'type' => 'object',
4949
'properties' => [
5050
'name' => ['type' => 'string', 'description' => 'Product name'],
51-
'description' => ['type' => 'string', 'description' => 'Product description'],
51+
'description' => ['type' => 'string', 'description' => 'A short description of the product, maybe containing the most important things. Onnly One line.'],
5252
'manufacturer' => ['type' => ['string', 'null'], 'description' => 'Manufacturer name'],
5353
'mpn' => ['type' => ['string', 'null'], 'description' => 'Manufacturer Part Number'],
54-
'category' => ['type' => ['string', 'null'], 'description' => 'Product category'],
54+
'category' => ['type' => ['string', 'null'], 'description' => 'Product category, e.g. "Passive components -> Resistors"'],
5555
'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'],
56-
'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'],
56+
'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type, like "SOT-23", "DIP-8", "QFN-32" etc.'],
5757
'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'],
58-
'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'],
58+
'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code for barcodes'],
59+
'notes' => ['type' => ['string', 'null'], 'description' => 'Optional long description of the part with more details than description. Can be markdown formatted.'],
5960
'parameters' => [
6061
'type' => 'array',
6162
'items' => [
@@ -98,6 +99,7 @@ public function getJSONSchema(): array
9899
'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'],
99100
'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'],
100101
'product_url' => ['type' => 'string'],
102+
'prices_include_vat' => ['type' => ['boolean', 'null'], 'description' => 'Whether the prices include VAT or not. Null if unknown.'],
101103
'prices' => [
102104
'type' => 'array',
103105
'items' => [
@@ -194,8 +196,8 @@ public function jsonToDTO(array $data, string $providerKey, string $providerId,
194196
$prices[] = new PriceDTO(
195197
minimum_discount_amount: (int) ($p['minimum_quantity'] ?? 1),
196198
price: (string) ($p['price'] ?? 0),
197-
currency_iso_code: $p['currency'] ?? 'USD',
198-
price_related_quantity: (int) ($p['minimum_quantity'] ?? 1),
199+
currency_iso_code: $p['currency'] ?? null,
200+
price_related_quantity: 1,
199201
);
200202
}
201203
}
@@ -205,6 +207,7 @@ public function jsonToDTO(array $data, string $providerKey, string $providerId,
205207
order_number: $v['order_number'] ?? 'Unknown',
206208
prices: $prices,
207209
product_url: $v['product_url'] ?? $productUrl,
210+
prices_include_vat: $v['prices_include_vat'] ?? null,
208211
);
209212
}
210213
}
@@ -228,7 +231,7 @@ public function jsonToDTO(array $data, string $providerKey, string $providerId,
228231
provider_url: $productUrl,
229232
footprint: $data['footprint'] ?? null,
230233
gtin: $data['gtin'] ?? null,
231-
notes: null,
234+
notes: $data['notes'],
232235
datasheets: $datasheets,
233236
images: $images,
234237
parameters: $parameters,

src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php

Lines changed: 18 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
use Symfony\Component\DomCrawler\Crawler;
3838
use Symfony\Contracts\HttpClient\HttpClientInterface;
3939

40+
use function Symfony\Component\String\u;
41+
4042

4143
final class AIInfoExtractor implements InfoProviderInterface
4244
{
@@ -105,7 +107,10 @@ public function getDetails(string $id): PartDetailDTO
105107
// Truncate to max content length
106108
$truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/
107109

110+
//Convert html to markdown, to provide a cleaner input to the LLM.
108111
$markdown = $this->htmlToMarkdown($html);
112+
//Truncate markdown to max content length, if needed
113+
$markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString();
109114

110115
//Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats.
111116
$structuredData = $this->extractStructuredData($html, $url);
@@ -137,10 +142,21 @@ private function htmlToMarkdown(string $html): string
137142
{
138143
//Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information.
139144
$crawler = new Crawler($html);
140-
$mainContent = $crawler->filter('main, article, #content')->first();
145+
$mainContent = $crawler->filter('main, article, #content');
141146

142147
// If we found a specific content area, get its HTML; otherwise, use the whole body.
143-
$htmlToConvert = $mainContent->count() ? $mainContent->html() : $html;
148+
//Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info.
149+
if ($mainContent->count() > 0) {
150+
$htmlToConvert = '';
151+
foreach ($mainContent as $node) {
152+
$htmlToConvert .= $node->ownerDocument->saveHTML($node);
153+
$htmlToConvert .= "\n\n"; // Add some spacing between sections
154+
}
155+
} else {
156+
//Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section.
157+
$htmlToConvert = $html;
158+
}
159+
144160

145161
//Convert to markdown
146162
$converter = new HtmlConverter([
@@ -163,48 +179,6 @@ public function getCapabilities(): array
163179
];
164180
}
165181

166-
private function cleanHTML(string $html): string
167-
{
168-
// Remove script tags
169-
$html = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $html);
170-
171-
// Remove style tags
172-
$html = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $html);
173-
174-
// Remove nav tags
175-
$html = preg_replace('/<nav\b[^>]*>(.*?)<\/nav>/is', '', $html);
176-
177-
// Remove footer tags
178-
$html = preg_replace('/<footer\b[^>]*>(.*?)<\/footer>/is', '', $html);
179-
180-
// Remove header tags
181-
$html = preg_replace('/<header\b[^>]*>(.*?)<\/header>/is', '', $html);
182-
183-
// Remove HTML comments
184-
$html = preg_replace('/<!--(.*?)-->/is', '', $html);
185-
186-
return $html;
187-
}
188-
189-
private function truncateHTML(string $html, int $maxLength): string
190-
{
191-
if (strlen($html) <= $maxLength) {
192-
return $html;
193-
}
194-
195-
// Truncate and find the last > or space to avoid cutting tags
196-
$truncated = substr($html, 0, $maxLength);
197-
198-
// Find the last occurrence of > or space
199-
$lastPos = max(strrpos($truncated, '>'), strrpos($truncated, ' '));
200-
201-
if ($lastPos !== false && $lastPos > $maxLength * 0.9) {
202-
$truncated = substr($truncated, 0, $lastPos + 1);
203-
}
204-
205-
return $truncated;
206-
}
207-
208182
private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
209183
{
210184
$input = new MessageBag(

0 commit comments

Comments
 (0)