Skip to content

Commit 94ad1b4

Browse files
committed
Include extraction runtime modules required by webhook bot
1 parent 1ec24f7 commit 94ad1b4

4 files changed

Lines changed: 960 additions & 2 deletions

File tree

telegram/src/intent-routing.js

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
const { classifyMessageInput, inferLikelyWebUrlFromText } = require('./message-utils');
2+
3+
/**
 * Source-type labels that identify an input as something other than plain
 * text. Membership is checked against a trimmed, lower-cased label, so all
 * entries are lower-case.
 */
const NON_TEXT_SOURCE_TYPES = new Set([
  // Web pages
  'html_url',
  // PDF documents
  'pdf_url', 'pdf_file',
  // Images
  'image_url', 'image_file',
  // Audio
  'audio_url', 'audio_file'
]);
12+
13+
/**
 * Decide which processing route an incoming message should take.
 *
 * Routes, in priority order:
 *  - `source_extraction`: `sourceType` is one of NON_TEXT_SOURCE_TYPES.
 *  - `url`: classifyMessageInput reports kind `url`, or the text is short
 *    and inferLikelyWebUrlFromText returns a URL for it.
 *  - `invent`: the text is short and no URL was found.
 *  - `text`: everything else (the default).
 *
 * @param {object} [params]
 * @param {*} [params.incomingKind] Kind label of the incoming message; only
 *   `text` (trimmed, case-insensitive) is eligible for short-text handling.
 * @param {*} [params.text] Message text; coerced to a string and trimmed.
 * @param {*} [params.sourceType] Source-type label; trimmed and lower-cased
 *   before being looked up in NON_TEXT_SOURCE_TYPES.
 * @param {*} [params.shortPromptMaxChars] Maximum length (in UTF-16 units) of
 *   a "short" text. Falsy or non-numeric values fall back to 120; the
 *   effective limit is never below 1.
 * @returns {{route: string, reason: string, parsedKind: *, parsedValue: string,
 *   isShortText: boolean, isNonTextSource: boolean, inferredUrl: string}}
 */
function decideInputIntent({ incomingKind, text, sourceType, shortPromptMaxChars } = {}) {
  const DEFAULT_SHORT_PROMPT_MAX_CHARS = 120;

  const normalizedText = String(text || '').trim();
  const normalizedSource = String(sourceType || '').trim().toLowerCase();
  const normalizedKind = String(incomingKind || '').trim().toLowerCase();
  const isNonTextSource = NON_TEXT_SOURCE_TYPES.has(normalizedSource);
  const parsed = classifyMessageInput(normalizedText);

  // A non-numeric limit (e.g. a malformed env/config string) converts to NaN,
  // and any `length <= NaN` comparison is false, which would silently turn
  // short-text routing off. Fall back to the default instead.
  const requestedLimit = Number(shortPromptMaxChars || DEFAULT_SHORT_PROMPT_MAX_CHARS);
  const shortLimit = Math.max(
    1,
    Number.isNaN(requestedLimit) ? DEFAULT_SHORT_PROMPT_MAX_CHARS : requestedLimit
  );

  const isShortText = !isNonTextSource
    && normalizedKind === 'text'
    && normalizedText.length > 0
    && normalizedText.length <= shortLimit;

  // URL inference is only attempted for short texts that were not already
  // classified as a URL.
  let inferredUrl = '';
  if (isShortText && parsed.kind !== 'url') {
    inferredUrl = String(inferLikelyWebUrlFromText(normalizedText) || '').trim();
  }

  let route = 'text';
  let reason = 'default_text';
  if (isNonTextSource) {
    route = 'source_extraction';
    reason = `non_text_source:${normalizedSource}`;
  } else if (parsed.kind === 'url' || inferredUrl) {
    route = 'url';
    reason = parsed.kind === 'url' ? 'input_contains_url' : 'short_text_url_inferred';
  } else if (isShortText) {
    route = 'invent';
    reason = 'short_text_needs_story_expansion';
  }

  return {
    route,
    reason,
    parsedKind: parsed.kind,
    parsedValue: String(parsed.value || ''),
    isShortText,
    isNonTextSource,
    inferredUrl
  };
}
51+
52+
module.exports = {
53+
NON_TEXT_SOURCE_TYPES,
54+
decideInputIntent
55+
};
56+

telegram/src/message-utils.js

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,33 @@ function isLongStoryText(value) {
5656
/**
 * Heuristically report whether a URL points at an image, video, or audio
 * file, judged only by a known file extension at the end of the URL.
 *
 * The extension may be followed by a `?query` and/or a `#fragment`, so media
 * fragment URLs such as `clip.mp4#t=10` are recognised.
 *
 * @param {*} value Candidate URL; coerced to a string, trimmed, lower-cased.
 * @returns {boolean} True when the URL ends in a known media extension.
 */
function isLikelyMediaUrl(value) {
  const url = String(value || '').trim().toLowerCase();
  if (!url) return false;
  return /\.(png|jpe?g|gif|webp|bmp|svg|mp4|mov|avi|mkv|webm|m4v|wmv|flv|mp3|wav|ogg|m4a|aac|flac|opus)(?:[?#].*)?$/.test(url);
}
61+
62+
/**
 * Heuristically report whether a URL points at an image file, judged only by
 * a known image extension at the end of the URL.
 *
 * The extension may be followed by a `?query` and/or a `#fragment`, so URLs
 * such as `sprite.svg#icon-home` are recognised.
 *
 * @param {*} value Candidate URL; coerced to a string, trimmed, lower-cased.
 * @returns {boolean} True when the URL ends in a known image extension.
 */
function isLikelyImageUrl(value) {
  const url = String(value || '').trim().toLowerCase();
  if (!url) return false;
  return /\.(png|jpe?g|gif|webp|bmp|svg)(?:[?#].*)?$/.test(url);
}
67+
68+
/**
 * Heuristically report whether a URL points at an audio file, judged only by
 * a known audio extension at the end of the URL.
 *
 * The extension may be followed by a `?query` and/or a `#fragment`, so media
 * fragment URLs such as `track.mp3#t=30` are recognised.
 *
 * @param {*} value Candidate URL; coerced to a string, trimmed, lower-cased.
 * @returns {boolean} True when the URL ends in a known audio extension.
 */
function isLikelyAudioUrl(value) {
  const url = String(value || '').trim().toLowerCase();
  if (!url) return false;
  return /\.(mp3|wav|ogg|m4a|aac|flac|opus)(?:[?#].*)?$/.test(url);
}
73+
74+
/**
 * Heuristically report whether a value is a URL that points at a PDF.
 *
 * The value is first passed through normalizeUrlCandidate and rejected unless
 * looksLikeUrl accepts the result. A parseable URL counts as a PDF when its
 * path or query string contains `.pdf`, or when its `format` query parameter
 * is `pdf` / `application/pdf` (case-insensitive). If the candidate cannot be
 * parsed by `URL`, a plain suffix check is used instead.
 *
 * @param {*} value Candidate URL text.
 * @returns {boolean} True when the candidate looks like a PDF URL.
 */
function isLikelyPdfUrl(value) {
  const candidate = normalizeUrlCandidate(value);
  if (!looksLikeUrl(candidate)) return false;
  try {
    const parsed = new URL(candidate);
    // The fragment is deliberately excluded: only path + query are inspected.
    const joined = `${parsed.pathname}${parsed.search}`.toLowerCase();
    if (joined.includes('.pdf')) return true;
    const mimeHint = String(parsed.searchParams.get('format') || '').toLowerCase();
    return mimeHint === 'pdf' || mimeHint === 'application/pdf';
  } catch (_) {
    // Unparseable candidate: fall back to a suffix check. A `#fragment`
    // (e.g. `#page=2`) is tolerated here too, matching the parsed path above,
    // which ignores fragments.
    return /\.pdf(?:[?#].*)?$/i.test(candidate);
  }
}
6187

6288
function isLikelyWebPageUrl(value) {
@@ -106,6 +132,60 @@ function extractTextFallbackFromUrlMessage(value) {
106132
return candidate;
107133
}
108134

135+
/**
 * Find the first PDF-looking URL mentioned in a piece of text.
 *
 * The URL returned by extractFirstUrl is tried first. If that is missing or
 * not a PDF, each whitespace-separated token is normalised with
 * normalizeUrlCandidate, run through inferLikelyWebUrlFromText, and the first
 * inferred URL accepted by isLikelyPdfUrl is returned.
 *
 * @param {*} value Text to scan; coerced to a string and trimmed.
 * @returns {string} The matching URL, or '' when none is found.
 */
function extractFirstPdfUrlLikeToken(value) {
  const input = String(value || '').trim();
  if (input === '') {
    return '';
  }

  const directUrl = extractFirstUrl(input);
  if (directUrl && isLikelyPdfUrl(directUrl)) {
    return directUrl;
  }

  // All tokens are normalised up front; empty results are dropped.
  const pieces = input
    .split(/\s+/)
    .map((piece) => normalizeUrlCandidate(piece))
    .filter(Boolean);

  for (const piece of pieces) {
    const guess = inferLikelyWebUrlFromText(piece);
    if (!guess) continue;
    if (isLikelyPdfUrl(guess)) {
      return guess;
    }
  }

  return '';
}
147+
148+
/**
 * Find the first image-looking URL mentioned in a piece of text.
 *
 * The URL returned by extractFirstUrl is tried first. If that is missing or
 * not an image, each whitespace-separated token is normalised with
 * normalizeUrlCandidate, run through inferLikelyWebUrlFromText, and the first
 * inferred URL accepted by isLikelyImageUrl is returned.
 *
 * @param {*} value Text to scan; coerced to a string and trimmed.
 * @returns {string} The matching URL, or '' when none is found.
 */
function extractFirstImageUrlLikeToken(value) {
  const input = String(value || '').trim();
  if (input === '') {
    return '';
  }

  const directUrl = extractFirstUrl(input);
  if (directUrl && isLikelyImageUrl(directUrl)) {
    return directUrl;
  }

  // All tokens are normalised up front; empty results are dropped.
  const pieces = input
    .split(/\s+/)
    .map((piece) => normalizeUrlCandidate(piece))
    .filter(Boolean);

  for (const piece of pieces) {
    const guess = inferLikelyWebUrlFromText(piece);
    if (!guess) continue;
    if (isLikelyImageUrl(guess)) {
      return guess;
    }
  }

  return '';
}
160+
161+
/**
 * Find the first audio-looking URL mentioned in a piece of text.
 *
 * The URL returned by extractFirstUrl is tried first. Otherwise each
 * whitespace-separated token (normalised with normalizeUrlCandidate) is
 * examined in order:
 *  - if inferLikelyWebUrlFromText yields a URL, it is returned when
 *    isLikelyAudioUrl accepts it, and the token is skipped otherwise;
 *  - if nothing is inferred, the token itself is tried: tokens without an
 *    http(s) scheme get `https://` prepended when they start with `www.` or
 *    look like `host.tld[...]`, and the result is returned in the form
 *    produced by `URL#toString()` when isLikelyAudioUrl accepts it.
 *
 * @param {*} value Text to scan; coerced to a string and trimmed.
 * @returns {string} The matching URL, or '' when none is found.
 */
function extractFirstAudioUrlLikeToken(value) {
  const input = String(value || '').trim();
  if (input === '') {
    return '';
  }

  const directUrl = extractFirstUrl(input);
  if (directUrl && isLikelyAudioUrl(directUrl)) {
    return directUrl;
  }

  // All tokens are normalised up front; empty results are dropped.
  const pieces = input
    .split(/\s+/)
    .map((piece) => normalizeUrlCandidate(piece))
    .filter(Boolean);

  for (const piece of pieces) {
    const guess = inferLikelyWebUrlFromText(piece);
    if (guess) {
      if (isLikelyAudioUrl(guess)) {
        return guess;
      }
      // An inferred non-audio URL ends handling of this token.
      continue;
    }

    // Nothing inferred: try to treat the raw token as a URL.
    let absolute;
    if (/^https?:\/\//i.test(piece)) {
      absolute = piece;
    } else if (/^www\./i.test(piece) || /^[a-z0-9.-]+\.[a-z]{2,}(?:[/:?#].*)?$/i.test(piece)) {
      absolute = `https://${piece}`;
    } else {
      continue;
    }

    try {
      const href = new URL(absolute).toString();
      if (isLikelyAudioUrl(href)) {
        return href;
      }
    } catch (_) {
      // Best effort: a token that fails here is skipped and the scan moves on.
    }
  }

  return '';
}
188+
109189
function extractLinksFromEntities(baseText, entities) {
110190
const text = String(baseText || '');
111191
const list = Array.isArray(entities) ? entities : [];
@@ -189,9 +269,15 @@ module.exports = {
189269
extractFirstUrl,
190270
classifyMessageInput,
191271
isLikelyWebPageUrl,
272+
isLikelyPdfUrl,
273+
isLikelyImageUrl,
274+
isLikelyAudioUrl,
192275
extractTextFallbackFromUrlMessage,
193276
inferLikelyWebUrlFromText,
194277
extractMessageInputText,
195-
extractFirstUrlLikeToken
278+
extractFirstUrlLikeToken,
279+
extractFirstPdfUrlLikeToken,
280+
extractFirstImageUrlLikeToken,
281+
extractFirstAudioUrlLikeToken
196282
};
197283
const { LONG_STORY_TEXT_MIN_CHARS } = require('./data/thresholds');

0 commit comments

Comments
 (0)