|
const { Buffer } = require('node:buffer')
const { createHash } = require('node:crypto')
const { setTimeout } = require('node:timers/promises')
| 3 | + |
// Model id used by `transcribe` when the `htr` option is truthy and no
// explicit `model` option is configured.
const TEXT_TITAN = 356_425

// Model id used by `transcribe` when the `htr` option is falsy and no
// explicit `model` option is configured.
const PRINT_M1 = 39_995
| 6 | + |
/**
 * Compute the MD5 digest of `data`.
 *
 * Relies on `createHash` from `node:crypto` being in scope at file level.
 *
 * @param {string|Buffer} data - content fed to the hash
 * @param {string} [encoding='hex'] - encoding handed to `hash.digest()`
 * @returns {string} the digest in the requested encoding
 */
const md5 = (data, encoding = 'hex') =>
  createHash('md5').update(data).digest(encoding)
| 12 | + |
/**
 * Plugin that uploads an image to a remote host, submits a transcription
 * job for it and polls that job until it completes.
 */
export default class ArtimiPlugin {
  /**
   * @param {object} options - user options, merged over `ArtimiPlugin.defaults`
   * @param {object} context - host context; `logger` and `sharp` are read from it
   */
  constructor (options, context) {
    this.options = Object.assign({}, ArtimiPlugin.defaults, options)
    this.context = context
    // Aborted by `unload()` to interrupt the delay between polls.
    this.controller = new AbortController()
  }

  /**
   * Upload the image, submit a transcription job, wait for it to complete
   * and store the result on `transcription`.
   *
   * @param {object} transcription - receives `config.jobId`, `text` and `data`
   * @param {object} image - `buffer` holds the pixel data; all remaining
   *   properties are forwarded to `sharp.toBuffer` as the `raw` option
   * @throws {Error} if a request fails or the job ends in an unexpected state
   */
  async transcribe (transcription, { buffer, ...raw }) {
    let { logger } = this.context
    let { host, token, htr, model } = this.options

    let images = [await this.upload(buffer, raw)]
    let config = {
      model: model || (htr ? TEXT_TITAN : PRINT_M1)
    }

    logger.info('Submit transcription request...')
    let res = await fetch(`${host}/transcription`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${token}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ config, images })
    })

    if (!res.ok) {
      throw new Error(await res.text())
    }

    let job = await res.json()
    transcription.config.jobId = job.id

    logger.info(`Fetch transcription #${job.id}...`)
    // The submitted job carries no result yet; the output is read from the
    // completed job returned by `poll`.
    let done = await this.poll(job)

    transcription.text = done.output.text
    transcription.data = done.output.alto
  }

  /**
   * Poll the job until its state is `completed`.
   *
   * Failed requests are logged and retried; once more than `maxRetries`
   * requests have failed, the last error is rethrown. The retry counter is
   * cumulative over the whole polling run.
   *
   * @param {object} job - only `job.id` is read
   * @returns {Promise<object>} the job as returned by the host once completed
   * @throws {Error} on an unexpected job state, on too many failed requests,
   *   or with an `AbortError` when the plugin is unloaded while waiting
   */
  async poll (job) {
    let { logger } = this.context
    let { host, token, interval, maxRetries } = this.options
    let { signal } = this.controller

    let numRetries = 0

    while (true) {
      let next = null

      try {
        let res = await fetch(`${host}/transcription/${job.id}`, {
          headers: {
            Authorization: `Bearer ${token}`
          }
        })

        if (!res.ok) {
          throw new Error(await res.text())
        }

        next = await res.json()
      } catch (err) {
        logger.error({ stack: err.stack }, `Request failed: ${err.message}`)
        if (++numRetries > maxRetries) {
          throw err
        }
      }

      // `next` stays null when the request failed; in that case just wait
      // and try again instead of inspecting a missing job.
      if (next != null) {
        switch (next.state) {
          case 'completed':
            return next
          case 'created':
          case 'active':
          case 'paused':
            break
          default:
            throw new Error(`transcription request state "${next.state}"`)
        }
      }

      // Rejects with an AbortError once `unload()` was called, which ends
      // the loop instead of letting it spin without any delay.
      await setTimeout(interval, undefined, { signal })
    }
  }

  /**
   * Encode the image as JPEG and make sure it is present in the host's
   * upload cache, keyed by the MD5 checksum of the encoded image.
   *
   * @param {Buffer} buffer - image data handed to `sharp.toBuffer`
   * @param {object} [raw] - forwarded to `sharp.toBuffer` as the `raw` option
   * @returns {Promise<string>} the cache name `<checksum>.<type>`
   * @throws {Error} if the host answers with an unexpected status
   */
  async upload (buffer, raw) {
    let { logger, sharp } = this.context
    let { host, token } = this.options

    // TODO use PNG if image uses alpha
    let type = 'jpeg'
    let image = await sharp.toBuffer(type, buffer, { raw })

    let checksum = md5(image)
    let contentMd5 = Buffer.from(checksum, 'hex').toString('base64')
    let contentType = `image/${type}`
    let name = `${checksum}.${type}`

    // Redirects are handled manually: a 307 carries the location the image
    // has to be sent to, a 204 means there is nothing to upload.
    let res = await fetch(`${host}/uploads/${name}`, {
      method: 'PUT',
      redirect: 'manual',
      headers: {
        'Authorization': `Bearer ${token}`,
        'X-Content-Length': image.length,
        'Content-MD5': contentMd5,
        'Content-Type': contentType
      }
    })

    switch (res.status) {
      case 204:
        logger.info('Image already cached...')
        return name
      case 307:
        logger.info('Upload image to cache...')
        res = await fetch(res.headers.get('location'), {
          method: 'PUT',
          body: image,
          headers: {
            'Content-MD5': contentMd5,
            'Content-Type': contentType
          }
        })
        if (!res.ok) {
          throw new Error(await res.text())
        }
        return name
      default:
        throw new Error(await res.text())
    }
  }

  /**
   * Abort the plugin's controller so that a pending poll stops waiting.
   */
  async unload () {
    this.controller.abort()
  }
}
28 | 150 |
|
// Default options; values passed to the constructor take precedence.
ArtimiPlugin.defaults = {
  // Selects the fallback model when `model` is not set.
  htr: true,
  // Explicit model id; overrides the `htr` based choice when truthy.
  model: null,
  // Sent as Bearer token in the Authorization header.
  token: null,
  // Delay between two polls, in milliseconds.
  interval: 10 * 1000,
  // Failed poll requests tolerated before the error is rethrown.
  maxRetries: 3,
  // Base URL that request paths are appended to.
  host: 'http://localhost:3000'
}
0 commit comments