fix(ocr): adapt OfficeProcessor to officeparser v6.1.0 ESM changes

v6.1.0 added native ESM with Node16 resolution and a strict exports field, breaking deep subpath imports like officeparser/dist/parsers/ExcelParser.js. Switch to the main package entry and use parseOfficeAsync(), which accepts a Buffer and auto-detects the format via magic bytes.

Co-authored-by: Elian Doran <eliandoran@users.noreply.github.com>
2026-05-07 07:26:36 +02:00 · 2026-04-18 17:07:02 +00:00
parent 189867ca03
commit b2bcccb4c7
1 changed files with 16 additions and 22 deletions
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@@ -1,25 +1,20 @@
-import { parseExcel } from 'officeparser/dist/parsers/ExcelParser.js';
-import { parseOpenOffice } from 'officeparser/dist/parsers/OpenOfficeParser.js';
-import { parsePowerPoint } from 'officeparser/dist/parsers/PowerPointParser.js';
-import { parseWord } from 'officeparser/dist/parsers/WordParser.js';
-import type { OfficeParserConfig } from 'officeparser/dist/types.js';
+import officeparser from 'officeparser';
+import type { OfficeParserConfig } from 'officeparser';

 import log from '../../log.js';
 import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
 import { FileProcessor } from './file_processor.js';

-type Parser = (buffer: Buffer, config: OfficeParserConfig) => Promise<{ toText(): string }>;
-
-const PARSER_BY_MIME: Record<string, Parser> = {
+const SUPPORTED_MIME_TYPES = new Set([
    // Office Open XML
-    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parseWord,
-    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': parseExcel,
-    'application/vnd.openxmlformats-officedocument.presentationml.presentation': parsePowerPoint,
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    // OpenDocument
-    'application/vnd.oasis.opendocument.text': parseOpenOffice,
-    'application/vnd.oasis.opendocument.spreadsheet': parseOpenOffice,
-    'application/vnd.oasis.opendocument.presentation': parseOpenOffice
-};
+    'application/vnd.oasis.opendocument.text',
+    'application/vnd.oasis.opendocument.spreadsheet',
+    'application/vnd.oasis.opendocument.presentation'
+]);

 const PARSER_CONFIG: OfficeParserConfig = {
    outputErrorToConsole: false,
@@ -30,29 +25,28 @@ const PARSER_CONFIG: OfficeParserConfig = {

 /**
 * Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files.
- * Uses individual parsers from officeparser v6 to avoid pulling in pdfjs-dist.
+ * Uses officeparser's main API, which auto-detects the format from the buffer's magic bytes.
 */
 export class OfficeProcessor extends FileProcessor {

    canProcess(mimeType: string): boolean {
-        return mimeType in PARSER_BY_MIME;
+        return SUPPORTED_MIME_TYPES.has(mimeType);
    }

    getSupportedMimeTypes(): string[] {
-        return Object.keys(PARSER_BY_MIME);
+        return [...SUPPORTED_MIME_TYPES];
    }

    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        const mimeType = options.mimeType;
-        if (!mimeType || !(mimeType in PARSER_BY_MIME)) {
+        if (!mimeType || !SUPPORTED_MIME_TYPES.has(mimeType)) {
            throw new Error(`Unsupported MIME type for Office processor: ${mimeType}`);
        }

        log.info(`Starting Office document text extraction for ${mimeType}...`);

-        const parse = PARSER_BY_MIME[mimeType];
-        const ast = await parse(buffer, PARSER_CONFIG);
-        const trimmed = ast.toText().trim();
+        const text = await officeparser.parseOfficeAsync(buffer, PARSER_CONFIG);
+        const trimmed = text.trim();

        return {
            text: trimmed,