Skip to content

Commit 1b9c4c5

Browse files
committed
Refactor website fetching logic into a dedicated utility and add automatic refetching when cached website files are missing
1 parent 9e1d3d9 commit 1b9c4c5

3 files changed

Lines changed: 164 additions & 131 deletions

File tree

apps/editor/src/views/panel/backend/message-handlers/handle-paste-url.ts

Lines changed: 8 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -1,151 +1,28 @@
11
import { PasteUrlMessage } from '../../types/messages'
22
import { PanelProvider } from '../panel-provider'
3-
import axios from 'axios'
4-
import { JSDOM } from 'jsdom'
5-
import { Readability, isProbablyReaderable } from '@mozilla/readability'
6-
import createDOMPurify from 'dompurify'
7-
import TurndownServiceJoplin from '@joplin/turndown'
8-
import * as turndownPluginGfm from '@joplin/turndown-plugin-gfm'
9-
import type TurndownService from 'turndown'
103
import * as fs from 'fs'
11-
import * as os from 'os'
12-
import * as path from 'path'
13-
import * as crypto from 'crypto'
14-
15-
const remove_markdown_images = (text: string) => {
16-
const without_images = text.replace(/!\[([^\]]*)\]\(([^)]*)\)/g, '')
17-
return without_images.replace(/\n{3,}/g, '\n\n')
18-
}
19-
20-
const create_turndown_service = () => {
21-
const turndown_service: TurndownService = new TurndownServiceJoplin({
22-
codeBlockStyle: 'fenced'
23-
})
24-
turndown_service.use(turndownPluginGfm.gfm)
25-
turndown_service.addRule('fencedCodeBlock', {
26-
filter: (node: any, options: any) => {
27-
return (
28-
options.codeBlockStyle == 'fenced' &&
29-
node.nodeName == 'PRE' &&
30-
node.querySelector('code')
31-
)
32-
},
33-
replacement: (_: any, node: any, options: any) => {
34-
const element = node as HTMLElement
35-
const language = (element
36-
.querySelector('code')
37-
?.className.match(/language-(\S+)/) || [null, ''])[1]
38-
39-
return (
40-
'\n\n' +
41-
options.fence +
42-
language +
43-
'\n' +
44-
element.textContent +
45-
'\n' +
46-
options.fence +
47-
'\n\n'
48-
)
49-
}
50-
})
51-
// Convert math blocks to markdown
52-
turndown_service.addRule('multiplemath', {
53-
filter(node) {
54-
return (
55-
node.nodeName == 'SPAN' &&
56-
(node as HTMLElement).classList.contains('katex-display')
57-
) // Check if it's a display math block that centers equation
58-
},
59-
replacement(_, node) {
60-
// "<annotation>" element holds expression string, right for markdown
61-
const annotation = (node as HTMLElement).querySelector(
62-
'annotation'
63-
)?.textContent
64-
if (!annotation) return ''
65-
return `$$\n${annotation}\n$$`
66-
}
67-
})
68-
turndown_service.addRule('multiplemath', {
69-
filter(node) {
70-
return (
71-
node.nodeName == 'SPAN' &&
72-
(node as HTMLElement).classList.contains('katex')
73-
)
74-
},
75-
replacement(_, node) {
76-
const is_block =
77-
node.parentNode?.nodeName == 'P' &&
78-
node.parentNode.childNodes.length == 1
79-
const annotation = (node as HTMLElement).querySelector(
80-
'annotation'
81-
)?.textContent
82-
if (!annotation) return ''
83-
return is_block ? `$$ ${annotation} $$` : `$${annotation}$`
84-
}
85-
})
86-
turndown_service.addRule('stripElements', {
87-
filter: ['figure', 'picture', 'sup'],
88-
replacement: () => ''
89-
})
90-
return turndown_service
91-
}
4+
import {
5+
fetch_and_save_website,
6+
get_website_file_path
7+
} from '../utils/website-fetcher'
928

939
export const handle_paste_url = async (
9410
panel_provider: PanelProvider,
9511
message: PasteUrlMessage
9612
) => {
9713
try {
9814
const url = message.url
99-
const hash = crypto.createHash('md5').update(url).digest('hex')
100-
const filename = `cwc-website-${hash}.txt`
101-
const file_path = path.join(os.tmpdir(), filename)
15+
const file_path = get_website_file_path(url)
10216

10317
if (fs.existsSync(file_path)) {
10418
panel_provider.add_text_at_cursor_position(`#Website(${url})`)
10519
return
10620
}
10721

108-
const response = await axios.get(url, {
109-
headers: {
110-
'User-Agent':
111-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'
112-
},
113-
timeout: 5000
114-
})
22+
const content = await fetch_and_save_website(url)
11523

116-
if (response.status == 200 && typeof response.data == 'string') {
117-
const html = response.data
118-
const window = new JSDOM('').window
119-
const DOMPurify = createDOMPurify(window as any)
120-
const clean_html = DOMPurify.sanitize(html)
121-
const dom = new JSDOM(clean_html, { url })
122-
const doc = dom.window.document
123-
124-
if (isProbablyReaderable(doc)) {
125-
const reader = new Readability(doc, { keepClasses: true })
126-
const article = reader.parse()
127-
128-
if (article && article.content) {
129-
const turndown_service = create_turndown_service()
130-
const article_dom = new JSDOM(article.content)
131-
let content = turndown_service.turndown(
132-
article_dom.window.document.body
133-
)
134-
content = remove_markdown_images(content)
135-
136-
if (content && content.trim().length > 0) {
137-
if (article.title) {
138-
content = `# ${article.title}\n\n${content}`
139-
}
140-
141-
await fs.promises.writeFile(file_path, content, 'utf-8')
142-
panel_provider.add_text_at_cursor_position(`#Website(${url})`)
143-
return
144-
}
145-
}
146-
}
147-
148-
panel_provider.add_text_at_cursor_position(url)
24+
if (content) {
25+
panel_provider.add_text_at_cursor_position(`#Website(${url})`)
14926
} else {
15027
panel_provider.add_text_at_cursor_position(url)
15128
}

apps/editor/src/views/panel/backend/utils/replace-website-symbol.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import * as os from 'os'
22
import * as path from 'path'
33
import * as fs from 'fs'
44
import * as crypto from 'crypto'
5+
import { fetch_and_save_website } from './website-fetcher'
56

67
export const replace_website_symbol = async (params: {
78
instruction: string
@@ -19,13 +20,22 @@ export const replace_website_symbol = async (params: {
1920
const hash = crypto.createHash('md5').update(url).digest('hex')
2021
const filename = `cwc-website-${hash}.txt`
2122
const file_path = path.join(os.tmpdir(), filename)
23+
2224
try {
2325
const content = await fs.promises.readFile(file_path, 'utf-8')
2426
return {
2527
content,
2628
success: true
2729
}
2830
} catch (error) {
31+
// If file is missing (e.g. reboot), try to refetch
32+
const content = await fetch_and_save_website(url)
33+
if (content) {
34+
return {
35+
content,
36+
success: true
37+
}
38+
}
2939
return {
3040
success: false
3141
}
apps/editor/src/views/panel/backend/utils/website-fetcher.ts

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
import axios from 'axios'
2+
import { JSDOM } from 'jsdom'
3+
import { Readability, isProbablyReaderable } from '@mozilla/readability'
4+
import createDOMPurify from 'dompurify'
5+
import TurndownServiceJoplin from '@joplin/turndown'
6+
import * as turndownPluginGfm from '@joplin/turndown-plugin-gfm'
7+
import type TurndownService from 'turndown'
8+
import * as fs from 'fs'
9+
import * as os from 'os'
10+
import * as path from 'path'
11+
import * as crypto from 'crypto'
12+
13+
/**
 * Strips Markdown image syntax from `text` and tidies the blank lines the
 * removal leaves behind.
 *
 * Two shapes are removed:
 * - links whose only content is an image, e.g. `[![alt](src)](href)`; these
 *   are dropped as a whole, because removing just the inner image would leave
 *   an empty `[](href)` link in the output
 * - plain images, e.g. `![alt](src)`
 *
 * Links that contain an image together with other text keep the link and the
 * text; only the image part is removed.
 *
 * @param text Markdown source.
 * @returns The Markdown without images, with every run of three or more
 *   newlines collapsed to exactly two.
 */
const remove_markdown_images = (text: string): string => {
  const without_images = text
    // Image-only links must go first: once the inner image is stripped they
    // can no longer be told apart from ordinary (empty) links.
    .replace(/\[\s*!\[[^\]]*\]\([^)]*\)\s*\]\([^)]*\)/g, '')
    .replace(/!\[[^\]]*\]\([^)]*\)/g, '')
  return without_images.replace(/\n{3,}/g, '\n\n')
}
17+
18+
/**
 * Builds the HTML-to-Markdown converter used for fetched web pages.
 *
 * The service is created with fenced code blocks, has the GFM plugin enabled,
 * and registers these custom rules:
 * - `fencedCodeBlock`: `<pre>` elements containing a `<code>` element become
 *   fenced blocks, tagged with the language taken from a `language-*` class.
 * - `multiplemath`: `<span class="katex-display">` becomes a `$$ … $$` block.
 * - `inlinemath`: `<span class="katex">` becomes `$…$`, or `$$ … $$` when the
 *   span is the only child of a paragraph.
 * - `stripElements`: `<figure>`, `<picture>` and `<sup>` are dropped.
 *
 * @returns A configured Turndown service instance.
 */
const create_turndown_service = () => {
  const turndown_service: TurndownService = new TurndownServiceJoplin({
    codeBlockStyle: 'fenced'
  })
  turndown_service.use(turndownPluginGfm.gfm)

  turndown_service.addRule('fencedCodeBlock', {
    filter: (node, options) =>
      options.codeBlockStyle == 'fenced' &&
      node.nodeName == 'PRE' &&
      node.querySelector('code') !== null,
    replacement: (_content, node, options) => {
      const element = node as HTMLElement
      const language =
        element.querySelector('code')?.className.match(/language-(\S+)/)?.[1] ??
        ''
      // textContent is typed as nullable; never emit the literal "null".
      // A single trailing newline is dropped so the block does not end with
      // an empty line before the closing fence.
      const code = (element.textContent ?? '').replace(/\n$/, '')

      // If the code itself contains a line starting with a fence run, the
      // surrounding fence has to be longer than the longest such run,
      // otherwise the block would be terminated early.
      const base_fence = options.fence ?? '```'
      const fence_char = base_fence.charAt(0)
      const fence_runs =
        code.match(new RegExp(`^${fence_char}{3,}`, 'gm')) ?? []
      const fence_size = fence_runs.reduce(
        (size, run) => Math.max(size, run.length + 1),
        base_fence.length
      )
      const fence = fence_char.repeat(fence_size)

      return '\n\n' + fence + language + '\n' + code + '\n' + fence + '\n\n'
    }
  })

  // Convert math blocks to markdown
  turndown_service.addRule('multiplemath', {
    filter(node) {
      // A display math block (the variant that centers the equation)
      return (
        node.nodeName == 'SPAN' &&
        (node as HTMLElement).classList.contains('katex-display')
      )
    },
    replacement(_, node) {
      // "<annotation>" element holds expression string, right for markdown
      const annotation = (node as HTMLElement).querySelector(
        'annotation'
      )?.textContent
      if (!annotation) return ''
      return `$$\n${annotation}\n$$`
    }
  })

  // Registered under its own key; it previously reused 'multiplemath'.
  turndown_service.addRule('inlinemath', {
    filter(node) {
      return (
        node.nodeName == 'SPAN' &&
        (node as HTMLElement).classList.contains('katex')
      )
    },
    replacement(_, node) {
      // A formula that is alone in its paragraph is rendered as a block
      const is_block =
        node.parentNode?.nodeName == 'P' &&
        node.parentNode.childNodes.length == 1
      const annotation = (node as HTMLElement).querySelector(
        'annotation'
      )?.textContent
      if (!annotation) return ''
      return is_block ? `$$ ${annotation} $$` : `$${annotation}$`
    }
  })

  turndown_service.addRule('stripElements', {
    filter: ['figure', 'picture', 'sup'],
    replacement: () => ''
  })

  return turndown_service
}
90+
91+
/**
 * Maps a URL to the location of its cached text file.
 *
 * The file lives in the OS temporary directory and is named
 * `cwc-website-<md5 hex digest of the url>.txt`, so the same URL always
 * resolves to the same path.
 *
 * @param url The website URL.
 * @returns Path of the cache file for `url`.
 */
export const get_website_file_path = (url: string): string => {
  const url_digest = crypto.createHash('md5').update(url).digest('hex')
  return path.join(os.tmpdir(), `cwc-website-${url_digest}.txt`)
}
96+
97+
/**
 * Downloads a web page, converts its main article to Markdown and stores the
 * result in the cache file for that URL.
 *
 * Steps: GET the page (browser-like User-Agent, 5000 ms timeout), sanitize the
 * HTML with DOMPurify, extract the article with Readability, convert it with
 * the Turndown service, strip images, prepend the article title as a `#`
 * heading when there is one, then write the text to the path returned by
 * `get_website_file_path`.
 *
 * @param url Address of the page to fetch.
 * @returns The Markdown that was written, or `null` when the response is not
 *   a 200 with a string body, the page is not considered readable, no
 *   non-blank content could be extracted, or any step throws.
 */
export const fetch_and_save_website = async (
  url: string
): Promise<string | null> => {
  try {
    const file_path = get_website_file_path(url)

    const response = await axios.get(url, {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'
      },
      timeout: 5000
    })
    if (response.status != 200 || typeof response.data != 'string') {
      return null
    }

    // Sanitize in an empty jsdom window before parsing the real document
    const purifier = createDOMPurify(new JSDOM('').window as any)
    const sanitized_html = purifier.sanitize(response.data)
    const page_document = new JSDOM(sanitized_html, { url }).window.document
    if (!isProbablyReaderable(page_document)) {
      return null
    }

    const article = new Readability(page_document, {
      keepClasses: true
    }).parse()
    if (!article || !article.content) {
      return null
    }

    const article_body = new JSDOM(article.content).window.document.body
    const markdown = remove_markdown_images(
      create_turndown_service().turndown(article_body)
    )
    if (!markdown || markdown.trim().length == 0) {
      return null
    }

    const content = article.title
      ? `# ${article.title}\n\n${markdown}`
      : markdown
    await fs.promises.writeFile(file_path, content, 'utf-8')
    return content
  } catch (error) {
    // Best effort: any network, parsing or file-system failure yields null
    return null
  }
}

0 commit comments

Comments
 (0)