Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/sim/executor/utils/file-tool-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ export class FileToolProcessor {
}

if (!buffer && data.url) {
buffer = await downloadFileFromUrl(data.url)
buffer = await downloadFileFromUrl(data.url, { userId: context.userId })
}

if (buffer) {
Expand Down
2 changes: 1 addition & 1 deletion apps/sim/lib/execution/files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export async function processExecutionFile(

if (file.type === 'url' && file.data) {
const { downloadFileFromUrl } = await import('@/lib/uploads/utils/file-utils.server')
const buffer = await downloadFileFromUrl(file.data)
const buffer = await downloadFileFromUrl(file.data, { userId })

if (buffer.length > MAX_FILE_SIZE) {
const fileSizeMB = (buffer.length / (1024 * 1024)).toFixed(2)
Expand Down
45 changes: 28 additions & 17 deletions apps/sim/lib/knowledge/documents/document-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ async function parseDocument(
if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) {
if (hasAzureMistralOCR) {
logger.info(`Using Azure Mistral OCR: ${filename}`)
return parseWithAzureMistralOCR(fileUrl, filename, mimeType)
return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId)
}

if (hasMistralOCR) {
Expand All @@ -305,7 +305,7 @@ async function parseDocument(
}

logger.info(`Using file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}

async function handleFileForOCR(
Expand All @@ -321,7 +321,7 @@ async function handleFileForOCR(
if (mimeType === 'application/pdf') {
logger.info(`handleFileForOCR: Downloading external PDF to check page count`)
try {
const buffer = await downloadFileWithTimeout(fileUrl)
const buffer = await downloadFileWithTimeout(fileUrl, userId)
logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`)
return { httpsUrl: fileUrl, buffer }
} catch (error) {
Expand All @@ -340,7 +340,7 @@ async function handleFileForOCR(

logger.info(`Uploading "${filename}" to cloud storage for OCR`)

const buffer = await downloadFileWithTimeout(fileUrl)
const buffer = await downloadFileWithTimeout(fileUrl, userId)

logger.info(`Downloaded ${filename}: ${buffer.length} bytes`)

Expand Down Expand Up @@ -380,11 +380,11 @@ async function handleFileForOCR(
}
}

async function downloadFileWithTimeout(fileUrl: string): Promise<Buffer> {
return downloadFileFromUrl(fileUrl, TIMEOUTS.FILE_DOWNLOAD)
/**
 * Fetches `fileUrl` through {@link downloadFileFromUrl} using the
 * `TIMEOUTS.FILE_DOWNLOAD` timeout.
 *
 * @param fileUrl - Location of the file to download.
 * @param userId - Optional principal, forwarded unchanged to `downloadFileFromUrl`.
 * @returns The downloaded file contents.
 */
async function downloadFileWithTimeout(fileUrl: string, userId?: string): Promise<Buffer> {
  const downloadOptions = { timeoutMs: TIMEOUTS.FILE_DOWNLOAD, userId }
  return downloadFileFromUrl(fileUrl, downloadOptions)
}

async function downloadFileForBase64(fileUrl: string): Promise<Buffer> {
async function downloadFileForBase64(fileUrl: string, userId?: string): Promise<Buffer> {
if (/^data:/i.test(fileUrl)) {
const [, base64Data] = fileUrl.split(',')
if (!base64Data) {
Expand All @@ -393,7 +393,7 @@ async function downloadFileForBase64(fileUrl: string): Promise<Buffer> {
return Buffer.from(base64Data, 'base64')
}
if (/^https?:\/\//i.test(fileUrl)) {
return downloadFileWithTimeout(fileUrl)
return downloadFileWithTimeout(fileUrl, userId)
}
throw new Error('Unsupported fileUrl scheme: only data: URIs and http(s):// URLs are allowed')
}
Expand Down Expand Up @@ -468,15 +468,20 @@ async function makeOCRRequest(
}
}

async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) {
async function parseWithAzureMistralOCR(
fileUrl: string,
filename: string,
mimeType: string,
userId?: string
) {
validateOCRConfig(
env.OCR_AZURE_API_KEY,
env.OCR_AZURE_ENDPOINT,
env.OCR_AZURE_MODEL_NAME,
'Azure Mistral OCR'
)

const fileBuffer = await downloadFileForBase64(fileUrl)
const fileBuffer = await downloadFileForBase64(fileUrl, userId)

if (mimeType === 'application/pdf') {
const pageCount = await getPdfPageCount(fileBuffer)
Expand All @@ -485,7 +490,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT
`PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` +
`Falling back to file parser.`
)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`)
}
Expand Down Expand Up @@ -529,7 +534,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT
})

logger.info(`Falling back to file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
}

Expand Down Expand Up @@ -589,7 +594,7 @@ async function parseWithMistralOCR(
})

logger.info(`Falling back to file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
}

Expand Down Expand Up @@ -773,15 +778,20 @@ async function processMistralOCRInBatches(
}
}

async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) {
async function parseWithFileParser(
fileUrl: string,
filename: string,
mimeType: string,
userId?: string
) {
try {
let content: string
let metadata: FileParseMetadata = {}

if (/^data:/i.test(fileUrl)) {
content = await parseDataURI(fileUrl, filename, mimeType)
} else if (/^https?:\/\//i.test(fileUrl)) {
const result = await parseHttpFile(fileUrl, filename, mimeType)
const result = await parseHttpFile(fileUrl, filename, mimeType, userId)
content = result.content
metadata = result.metadata || {}
} else {
Expand Down Expand Up @@ -820,9 +830,10 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
async function parseHttpFile(
fileUrl: string,
filename: string,
mimeType?: string
mimeType?: string,
userId?: string
): Promise<{ content: string; metadata?: FileParseMetadata }> {
const buffer = await downloadFileWithTimeout(fileUrl)
const buffer = await downloadFileWithTimeout(fileUrl, userId)

const extension = resolveParserExtension(filename, mimeType)
const result = await parseBuffer(buffer, extension)
Expand Down
61 changes: 53 additions & 8 deletions apps/sim/lib/uploads/utils/file-utils.server.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'use server'

import type { Logger } from '@sim/logger'
import { createLogger, type Logger } from '@sim/logger'
import { getErrorMessage } from '@sim/utils/errors'
import { getMaxExecutionTimeout } from '@/lib/core/execution-limits'
import {
Expand All @@ -25,6 +25,8 @@ import {
import { verifyFileAccess } from '@/app/api/files/authorization'
import type { UserFile } from '@/executor/types'

// Module-scoped logger, tagged 'FileUtilsServer'; used below to record denied internal file downloads.
const logger = createLogger('FileUtilsServer')

/**
* Result type for file input resolution
*/
Expand Down Expand Up @@ -138,19 +140,62 @@ export async function resolveFileInputToUrl(
}

/**
* Download a file from a URL (internal or external)
* For internal URLs, uses direct storage access (server-side only)
* For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning
* Options for {@link downloadFileFromUrl}.
*/
export interface DownloadFileFromUrlOptions {
  /**
   * Principal on whose behalf the download runs. Internal
   * (`/api/files/serve/...`) URLs are only read after the resolved storage key
   * passes {@link verifyFileAccess} for this user. When omitted, internal URLs
   * are rejected outright (fail closed), so a URL that merely contains
   * `/api/files/serve/` is never treated as implicitly trusted.
   */
  userId?: string
  /** Timeout applied to external URL downloads. Falls back to the max execution timeout when unset. */
  timeoutMs?: number
  /** Upper bound on how many bytes may be read from the source. */
  maxBytes?: number
}

/**
* Download a file from a URL (internal or external).
*
* For internal URLs, uses direct storage access (server-side only) after
* authorizing the resolved storage key against `userId`. Context is derived
* from the key via {@link inferContextFromKey}, never from a caller-controlled
* `?context=` query param — trusting the param would let a private key be
* labeled with a world-readable context (e.g. profile-pictures) so
* {@link verifyFileAccess} short-circuits to granted while the private object is
* still read. This mirrors how `/api/files/serve` resolves context.
*
* For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning.
*/
export async function downloadFileFromUrl(
fileUrl: string,
timeoutMs = getMaxExecutionTimeout(),
maxBytes?: number
options: DownloadFileFromUrlOptions = {}
): Promise<Buffer> {
const { parseInternalFileUrl } = await import('./file-utils')
const { timeoutMs = getMaxExecutionTimeout(), maxBytes, userId } = options

if (isInternalFileUrl(fileUrl)) {
const { key, context } = parseInternalFileUrl(fileUrl)
if (!userId) {
logger.warn('Internal file download denied: no userId provided', { fileUrl })
throw new Error('Access denied: internal file URL requires an authenticated user')
}

const key = extractStorageKey(fileUrl)
if (!key) {
logger.warn('Internal file download denied: could not resolve storage key', { fileUrl })
throw new Error('Access denied: could not resolve internal file key')
}

const context = inferContextFromKey(key)

const hasAccess = await verifyFileAccess(key, userId, undefined, context, false)
Comment thread
waleedlatif1 marked this conversation as resolved.
Comment thread
waleedlatif1 marked this conversation as resolved.
if (!hasAccess) {
logger.warn('Internal file download denied: access check failed', { key, context, userId })
throw new Error('Access denied: file not found or insufficient permissions')
}

const { downloadFile } = await import('@/lib/uploads/core/storage-service')
return downloadFile({ key, context, maxBytes })
Comment thread
waleedlatif1 marked this conversation as resolved.
}
Expand Down
Loading