|
1 | 1 | import { PasteUrlMessage } from '../../types/messages' |
2 | 2 | import { PanelProvider } from '../panel-provider' |
3 | | -import axios from 'axios' |
4 | | -import { JSDOM } from 'jsdom' |
5 | | -import { Readability, isProbablyReaderable } from '@mozilla/readability' |
6 | | -import createDOMPurify from 'dompurify' |
7 | | -import TurndownServiceJoplin from '@joplin/turndown' |
8 | | -import * as turndownPluginGfm from '@joplin/turndown-plugin-gfm' |
9 | | -import type TurndownService from 'turndown' |
10 | 3 | import * as fs from 'fs' |
11 | | -import * as os from 'os' |
12 | | -import * as path from 'path' |
13 | | -import * as crypto from 'crypto' |
14 | | - |
15 | | -const remove_markdown_images = (text: string) => { |
16 | | - const without_images = text.replace(/!\[([^\]]*)\]\(([^)]*)\)/g, '') |
17 | | - return without_images.replace(/\n{3,}/g, '\n\n') |
18 | | -} |
19 | | - |
20 | | -const create_turndown_service = () => { |
21 | | - const turndown_service: TurndownService = new TurndownServiceJoplin({ |
22 | | - codeBlockStyle: 'fenced' |
23 | | - }) |
24 | | - turndown_service.use(turndownPluginGfm.gfm) |
25 | | - turndown_service.addRule('fencedCodeBlock', { |
26 | | - filter: (node: any, options: any) => { |
27 | | - return ( |
28 | | - options.codeBlockStyle == 'fenced' && |
29 | | - node.nodeName == 'PRE' && |
30 | | - node.querySelector('code') |
31 | | - ) |
32 | | - }, |
33 | | - replacement: (_: any, node: any, options: any) => { |
34 | | - const element = node as HTMLElement |
35 | | - const language = (element |
36 | | - .querySelector('code') |
37 | | - ?.className.match(/language-(\S+)/) || [null, ''])[1] |
38 | | - |
39 | | - return ( |
40 | | - '\n\n' + |
41 | | - options.fence + |
42 | | - language + |
43 | | - '\n' + |
44 | | - element.textContent + |
45 | | - '\n' + |
46 | | - options.fence + |
47 | | - '\n\n' |
48 | | - ) |
49 | | - } |
50 | | - }) |
51 | | - // Convert math blocks to markdown |
52 | | - turndown_service.addRule('multiplemath', { |
53 | | - filter(node) { |
54 | | - return ( |
55 | | - node.nodeName == 'SPAN' && |
56 | | - (node as HTMLElement).classList.contains('katex-display') |
57 | | - ) // Check if it's a display math block that centers equation |
58 | | - }, |
59 | | - replacement(_, node) { |
60 | | - // "<annotation>" element holds expression string, right for markdown |
61 | | - const annotation = (node as HTMLElement).querySelector( |
62 | | - 'annotation' |
63 | | - )?.textContent |
64 | | - if (!annotation) return '' |
65 | | - return `$$\n${annotation}\n$$` |
66 | | - } |
67 | | - }) |
68 | | - turndown_service.addRule('multiplemath', { |
69 | | - filter(node) { |
70 | | - return ( |
71 | | - node.nodeName == 'SPAN' && |
72 | | - (node as HTMLElement).classList.contains('katex') |
73 | | - ) |
74 | | - }, |
75 | | - replacement(_, node) { |
76 | | - const is_block = |
77 | | - node.parentNode?.nodeName == 'P' && |
78 | | - node.parentNode.childNodes.length == 1 |
79 | | - const annotation = (node as HTMLElement).querySelector( |
80 | | - 'annotation' |
81 | | - )?.textContent |
82 | | - if (!annotation) return '' |
83 | | - return is_block ? `$$ ${annotation} $$` : `$${annotation}$` |
84 | | - } |
85 | | - }) |
86 | | - turndown_service.addRule('stripElements', { |
87 | | - filter: ['figure', 'picture', 'sup'], |
88 | | - replacement: () => '' |
89 | | - }) |
90 | | - return turndown_service |
91 | | -} |
| 4 | +import { |
| 5 | + fetch_and_save_website, |
| 6 | + get_website_file_path |
| 7 | +} from '../utils/website-fetcher' |
92 | 8 |
|
93 | 9 | export const handle_paste_url = async ( |
94 | 10 | panel_provider: PanelProvider, |
95 | 11 | message: PasteUrlMessage |
96 | 12 | ) => { |
97 | 13 | try { |
98 | 14 | const url = message.url |
99 | | - const hash = crypto.createHash('md5').update(url).digest('hex') |
100 | | - const filename = `cwc-website-${hash}.txt` |
101 | | - const file_path = path.join(os.tmpdir(), filename) |
| 15 | + const file_path = get_website_file_path(url) |
102 | 16 |
|
103 | 17 | if (fs.existsSync(file_path)) { |
104 | 18 | panel_provider.add_text_at_cursor_position(`#Website(${url})`) |
105 | 19 | return |
106 | 20 | } |
107 | 21 |
|
108 | | - const response = await axios.get(url, { |
109 | | - headers: { |
110 | | - 'User-Agent': |
111 | | - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36' |
112 | | - }, |
113 | | - timeout: 5000 |
114 | | - }) |
| 22 | + const content = await fetch_and_save_website(url) |
115 | 23 |
|
116 | | - if (response.status == 200 && typeof response.data == 'string') { |
117 | | - const html = response.data |
118 | | - const window = new JSDOM('').window |
119 | | - const DOMPurify = createDOMPurify(window as any) |
120 | | - const clean_html = DOMPurify.sanitize(html) |
121 | | - const dom = new JSDOM(clean_html, { url }) |
122 | | - const doc = dom.window.document |
123 | | - |
124 | | - if (isProbablyReaderable(doc)) { |
125 | | - const reader = new Readability(doc, { keepClasses: true }) |
126 | | - const article = reader.parse() |
127 | | - |
128 | | - if (article && article.content) { |
129 | | - const turndown_service = create_turndown_service() |
130 | | - const article_dom = new JSDOM(article.content) |
131 | | - let content = turndown_service.turndown( |
132 | | - article_dom.window.document.body |
133 | | - ) |
134 | | - content = remove_markdown_images(content) |
135 | | - |
136 | | - if (content && content.trim().length > 0) { |
137 | | - if (article.title) { |
138 | | - content = `# ${article.title}\n\n${content}` |
139 | | - } |
140 | | - |
141 | | - await fs.promises.writeFile(file_path, content, 'utf-8') |
142 | | - panel_provider.add_text_at_cursor_position(`#Website(${url})`) |
143 | | - return |
144 | | - } |
145 | | - } |
146 | | - } |
147 | | - |
148 | | - panel_provider.add_text_at_cursor_position(url) |
| 24 | + if (content) { |
| 25 | + panel_provider.add_text_at_cursor_position(`#Website(${url})`) |
149 | 26 | } else { |
150 | 27 | panel_provider.add_text_at_cursor_position(url) |
151 | 28 | } |
|
0 commit comments