LogicLabs-OU
diff --git a/‎.env.example‎
Lines changed: 10 additions & 1 deletion b/‎.env.example‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎docker-compose.yml‎
Lines changed: 7 additions & 0 deletions b/‎docker-compose.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎packages/backend/src/config/index.ts‎
Lines changed: 2 additions & 1 deletion b/‎packages/backend/src/config/index.ts‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎packages/backend/src/config/search.ts‎
Lines changed: 6 additions & 0 deletions b/‎packages/backend/src/config/search.ts‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎packages/backend/src/helpers/textExtractor.ts‎
Lines changed: 87 additions & 15 deletions b/‎packages/backend/src/helpers/textExtractor.ts‎
Lines changed: 87 additions & 15 deletions
diff --git a/‎…jobs/processors/index-email.processor.ts‎ ‎…rocessors/index-email-batch.processor.ts‎packages/backend/src/jobs/processors/index-email.processor.ts renamed to packages/backend/src/jobs/processors/index-email-batch.processor.ts
Lines changed: 5 additions & 4 deletions b/‎…jobs/processors/index-email.processor.ts‎ ‎…rocessors/index-email-batch.processor.ts‎packages/backend/src/jobs/processors/index-email.processor.ts renamed to packages/backend/src/jobs/processors/index-email-batch.processor.ts
Lines changed: 5 additions & 4 deletions
diff --git a/‎packages/backend/src/jobs/processors/process-mailbox.processor.ts‎
Lines changed: 40 additions & 7 deletions b/‎packages/backend/src/jobs/processors/process-mailbox.processor.ts‎
Lines changed: 40 additions & 7 deletions
diff --git a/‎packages/backend/src/jobs/schedulers/sync-scheduler.ts‎
Lines changed: 1 addition & 0 deletions b/‎packages/backend/src/jobs/schedulers/sync-scheduler.ts‎
Lines changed: 1 addition & 0 deletions
@@ -19,7 +19,8 @@ DATABASE_URL="postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/$
 # Meilisearch
 MEILI_MASTER_KEY=aSampleMasterKey
 MEILI_HOST=http://meilisearch:7700
-
+# The number of emails to batch together for indexing. Defaults to 500.
+MEILI_INDEXING_BATCH=500
 
 
 # Redis (We use Valkey, which is Redis-compatible and open source)
@@ -60,6 +61,8 @@ RATE_LIMIT_WINDOW_MS=60000
 # The maximum number of API requests allowed from an IP within the window. Defaults to 100.
 RATE_LIMIT_MAX_REQUESTS=100
 
+
+
 # JWT
 # IMPORTANT: Change this to a long, random, and secret string in your .env file
 JWT_SECRET=a-very-secret-key-that-you-should-change
@@ -70,3 +73,9 @@ JWT_EXPIRES_IN="7d"
 # IMPORTANT: Generate a secure, random 32-byte hex string for this
 # You can use `openssl rand -hex 32` to generate a key.
 ENCRYPTION_KEY=
+
+# Apache Tika Integration
+# Attachment text extraction uses Apache Tika ONLY if TIKA_URL is set; leave it empty to use the built-in legacy extractors.
+TIKA_URL=http://tika:9998
+
+
@@ -52,6 +52,13 @@ services:
         networks:
             - open-archiver-net
 
+    tika:
+        image: apache/tika:3.2.2.0-full
+        container_name: tika
+        restart: always
+        networks:
+            - open-archiver-net
+
 volumes:
     pgdata:
         driver: local
 
@@ -1,13 +1,14 @@
 import { storage } from './storage';
 import { app } from './app';
-import { searchConfig } from './search';
+import { searchConfig, meiliConfig } from './search';
 import { connection as redisConfig } from './redis';
 import { apiConfig } from './api';
 
 export const config = {
 	storage,
 	app,
 	search: searchConfig,
+	meili: meiliConfig,
 	redis: redisConfig,
 	api: apiConfig,
 };
@@ -4,3 +4,9 @@ export const searchConfig = {
 	host: process.env.MEILI_HOST || 'http://127.0.0.1:7700',
 	apiKey: process.env.MEILI_MASTER_KEY || '',
 };
+
+export const meiliConfig = {
+	indexingBatchSize: process.env.MEILI_INDEXING_BATCH
+		? parseInt(process.env.MEILI_INDEXING_BATCH)
+		: 500,
+};
@@ -1,7 +1,10 @@
 import PDFParser from 'pdf2json';
 import mammoth from 'mammoth';
 import xlsx from 'xlsx';
+import { logger } from '../config/logger';
+import { OcrService } from '../services/OcrService';
 
+// Legacy PDF extraction (with improved memory management)
 function extractTextFromPdf(buffer: Buffer): Promise<string> {
 	return new Promise((resolve) => {
 		const pdfParser = new PDFParser(null, true);
@@ -10,34 +13,60 @@ function extractTextFromPdf(buffer: Buffer): Promise<string> {
 		const finish = (text: string) => {
 			if (completed) return;
 			completed = true;
-			pdfParser.removeAllListeners();
+
+			// explicit cleanup
+			try {
+				pdfParser.removeAllListeners();
+			} catch (e) {
+				// Ignore cleanup errors
+			}
+
 			resolve(text);
 		};
 
-		pdfParser.on('pdfParser_dataError', () => finish(''));
-		pdfParser.on('pdfParser_dataReady', () => finish(pdfParser.getRawTextContent()));
+		pdfParser.on('pdfParser_dataError', (err: any) => {
+			logger.warn('PDF parsing error:', err?.parserError || 'Unknown error');
+			finish('');
+		});
+
+		pdfParser.on('pdfParser_dataReady', () => {
+			try {
+				const text = pdfParser.getRawTextContent();
+				finish(text || '');
+			} catch (err) {
+				logger.warn('Error getting PDF text content:', err);
+				finish('');
+			}
+		});
 
 		try {
 			pdfParser.parseBuffer(buffer);
 		} catch (err) {
-			console.error('Error parsing PDF buffer', err);
+			logger.error('Error parsing PDF buffer:', err);
 			finish('');
 		}
 
-		// Prevent hanging if the parser never emits events
-		setTimeout(() => finish(''), 10000);
+		// Prevent hanging if the parser never emits events (timeout reduced from 10s to 5s)
+		setTimeout(() => {
+			logger.warn('PDF parsing timed out');
+			finish('');
+		}, 5000);
 	});
 }
 
-export async function extractText(buffer: Buffer, mimeType: string): Promise<string> {
+// Legacy text extraction for various formats
+async function extractTextLegacy(buffer: Buffer, mimeType: string): Promise<string> {
 	try {
 		if (mimeType === 'application/pdf') {
+			// Check PDF size (memory protection)
+			if (buffer.length > 50 * 1024 * 1024) { // 50MB Limit
+				logger.warn('PDF too large for legacy extraction, skipping');
+				return '';
+			}
 			return await extractTextFromPdf(buffer);
 		}
 
-		if (
-			mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
-		) {
+		if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
 			const { value } = await mammoth.extractRawText({ buffer });
 			return value;
 		}
@@ -50,7 +79,7 @@ export async function extractText(buffer: Buffer, mimeType: string): Promise<str
 				const sheetText = xlsx.utils.sheet_to_txt(sheet);
 				fullText += sheetText + '\n';
 			}
-			return fullText;
+			return fullText.trim();
 		}
 
 		if (
@@ -60,11 +89,54 @@ export async function extractText(buffer: Buffer, mimeType: string): Promise<str
 		) {
 			return buffer.toString('utf-8');
 		}
+
+		return '';
 	} catch (error) {
-		console.error(`Error extracting text from attachment with MIME type ${mimeType}:`, error);
-		return ''; // Return empty string on failure
+		logger.error(`Error extracting text from attachment with MIME type ${mimeType}:`, error);
+
+		// Force garbage collection if available (global.gc is only defined when Node.js is started with --expose-gc)
+		if (global.gc) {
+			global.gc();
+		}
+
+		return '';
+	}
+}
+
+// Main extraction function
+export async function extractText(buffer: Buffer, mimeType: string): Promise<string> {
+	// Input validation
+	if (!buffer || buffer.length === 0) {
+		return '';
+	}
+
+	if (!mimeType) {
+		logger.warn('No MIME type provided for text extraction');
+		return '';
 	}
 
-	console.warn(`Unsupported MIME type for text extraction: ${mimeType}`);
-	return ''; // Return empty string for unsupported types
+	// General size limit
+	const maxSize = process.env.TIKA_URL ? 100 * 1024 * 1024 : 50 * 1024 * 1024; // 100MB for Tika, 50MB for Legacy
+	if (buffer.length > maxSize) {
+		logger.warn(`File too large for text extraction: ${buffer.length} bytes (limit: ${maxSize})`);
+		return '';
+	}
+
+	// Decide between Tika and legacy
+	const tikaUrl = process.env.TIKA_URL;
+
+	if (tikaUrl) {
+		// Tika decides what it can parse
+		logger.debug(`Using Tika for text extraction: ${mimeType}`);
+		const ocrService = new OcrService()
+		try {
+			return await ocrService.extractTextWithTika(buffer, mimeType);
+		} catch (error) {
+			logger.error({ error }, "OCR text extraction failed, returning empty string")
+			return ''
+		}
+	} else {
+		// extract using legacy mode
+		return await extractTextLegacy(buffer, mimeType);
+	}
 }
@@ -3,14 +3,15 @@ import { IndexingService } from '../../services/IndexingService';
 import { SearchService } from '../../services/SearchService';
 import { StorageService } from '../../services/StorageService';
 import { DatabaseService } from '../../services/DatabaseService';
+import { PendingEmail } from '@open-archiver/types';
 
 const searchService = new SearchService();
 const storageService = new StorageService();
 const databaseService = new DatabaseService();
 const indexingService = new IndexingService(databaseService, searchService, storageService);
 
-export default async function (job: Job<{ emailId: string }>) {
-	const { emailId } = job.data;
-	console.log(`Indexing email with ID: ${emailId}`);
-	await indexingService.indexEmailById(emailId);
+export default async function (job: Job<{ emails: PendingEmail[] }>) {
+    const { emails } = job.data;
+    console.log(`Indexing email batch with ${emails.length} emails`);
+    await indexingService.indexEmailBatch(emails);
 }
@@ -1,9 +1,19 @@
 import { Job } from 'bullmq';
-import { IProcessMailboxJob, SyncState, ProcessMailboxError } from '@open-archiver/types';
+import {
+	IProcessMailboxJob,
+	SyncState,
+	ProcessMailboxError,
+	PendingEmail,
+} from '@open-archiver/types';
 import { IngestionService } from '../../services/IngestionService';
 import { logger } from '../../config/logger';
 import { EmailProviderFactory } from '../../services/EmailProviderFactory';
 import { StorageService } from '../../services/StorageService';
+import { IndexingService } from '../../services/IndexingService';
+import { SearchService } from '../../services/SearchService';
+import { DatabaseService } from '../../services/DatabaseService';
+import { config } from '../../config';
+
 
 /**
  * This processor handles the ingestion of emails for a single user's mailbox.
@@ -15,9 +25,16 @@ import { StorageService } from '../../services/StorageService';
  */
 export const processMailboxProcessor = async (job: Job<IProcessMailboxJob, SyncState, string>) => {
 	const { ingestionSourceId, userEmail } = job.data;
+	const BATCH_SIZE: number = config.meili.indexingBatchSize;
+	let emailBatch: PendingEmail[] = [];
 
 	logger.info({ ingestionSourceId, userEmail }, `Processing mailbox for user`);
 
+	const searchService = new SearchService();
+	const storageService = new StorageService();
+	const databaseService = new DatabaseService();
+	const indexingService = new IndexingService(databaseService, searchService, storageService);
+
 	try {
 		const source = await IngestionService.findById(ingestionSourceId);
 		if (!source) {
@@ -26,22 +43,38 @@ export const processMailboxProcessor = async (job: Job<IProcessMailboxJob, SyncS
 
 		const connector = EmailProviderFactory.createConnector(source);
 		const ingestionService = new IngestionService();
-		const storageService = new StorageService();
 
-		// Pass the sync state for the entire source, the connector will handle per-user logic if necessary
 		for await (const email of connector.fetchEmails(userEmail, source.syncState)) {
 			if (email) {
-				await ingestionService.processEmail(email, source, storageService, userEmail);
+				const processedEmail = await ingestionService.processEmail(
+					email,
+					source,
+					storageService,
+					userEmail
+				);
+				if (processedEmail) {
+					emailBatch.push(processedEmail);
+					if (emailBatch.length >= BATCH_SIZE) {
+						await indexingService.indexEmailBatch(emailBatch);
+						emailBatch = [];
+					}
+				}
 			}
 		}
 
-		const newSyncState = connector.getUpdatedSyncState(userEmail);
+		if (emailBatch.length > 0) {
+			await indexingService.indexEmailBatch(emailBatch);
+			emailBatch = [];
+		}
 
+		const newSyncState = connector.getUpdatedSyncState(userEmail);
 		logger.info({ ingestionSourceId, userEmail }, `Finished processing mailbox for user`);
-
-		// Return the new sync state to be aggregated by the parent flow
 		return newSyncState;
 	} catch (error) {
+		if (emailBatch.length > 0) {
+			await indexingService.indexEmailBatch(emailBatch);
+		}
+
 		logger.error({ err: error, ingestionSourceId, userEmail }, 'Error processing mailbox');
 		const errorMessage = error instanceof Error ? error.message : 'An unknown error occurred';
 		const processMailboxError: ProcessMailboxError = {
 
@@ -8,6 +8,7 @@ const scheduleContinuousSync = async () => {
 		'schedule-continuous-sync',
 		{},
 		{
+			jobId: 'schedule-continuous-sync',
 			repeat: {
 				pattern: config.app.syncFrequency,
 			},
Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ const scheduleContinuousSync = async () => {`
`8`	`8`	`'schedule-continuous-sync',`
`9`	`9`	`{},`
`10`	`10`	`{`
	`11`	`+ jobId: 'schedule-continuous-sync',`
`11`	`12`	`repeat: {`
`12`	`13`	`pattern: config.app.syncFrequency,`
`13`	`14`	`},`