added image OCR and parsing text from PDF (and OCR of PDF images)

2025-11-01 10:55:55 +01:00 · 2023-01-26 20:32:27 +01:00
parent 63c62df787
commit ad887c4b12
13 changed files with 380 additions and 189 deletions
--- a/src/services/notes.js
+++ b/src/services/notes.js
@@ -23,6 +23,7 @@ const dayjs = require("dayjs");
 const htmlSanitizer = require("./html_sanitizer");
 const ValidationError = require("../errors/validation_error");
 const noteTypesService = require("./note_types");
+const textExtractingService = require("./text_extracting");

 function getNewNotePosition(parentNoteId) {
    const note = becca.notes[parentNoteId];
@@ -191,7 +192,7 @@ function createNewNote(params) {
            }
        }

-        scanForLinks(note);
+        asyncPostProcessContent(note, params.content);

        copyChildAttributes(parentNote, note);

@@ -492,7 +493,7 @@ function downloadImages(noteId, content) {
                if (updatedContent !== origContent) {
                    origNote.setContent(updatedContent);

-                    scanForLinks(origNote);
+                    asyncPostProcessContent(origNote, updatedContent);

                    eventService.emit(eventService.ENTITY_CHANGED, {
                        entityName: 'note_contents',
@@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) {
                      AND parentNote.isDeleted = 0`, [noteId, deleteId]);
 }

-function scanForLinks(note) {
+function scanForLinks(note, content) {
    if (!note || !['text', 'relationMap'].includes(note.type)) {
        return;
    }

    try {
-        const content = note.getContent();
        const newContent = saveLinks(note, content);

        if (content !== newContent) {
@@ -729,6 +729,30 @@ function scanForLinks(note) {
    }
 }

+/**
+ * Passes `buffer` to the OCR service and stores the result on the note as a
+ * 'plainText' attachment with MIME type 'text/plain'.
+ *
+ * Returns without doing anything unless the note reports itself as an image and
+ * the 'ocrImages' option is enabled. Errors thrown by the OCR call or by saving
+ * the attachment are logged, not propagated.
+ *
+ * NOTE(review): the OCR result is used without `await`. If ocrTextFromBuffer returns
+ * a Promise, the attachment would receive that Promise and a rejection would bypass
+ * the catch below - confirm the call is synchronous.
+ *
+ * @param note - note to process; isImage(), saveNoteAttachment() and noteId are used
+ * @param buffer - data handed to the OCR service (presumably a Buffer - verify against callers)
+ */
+function runOcr(note, buffer) {
+    // evaluated left to right, so the option is only read for image notes
+    const shouldOcr = note.isImage() && optionService.getOptionBool('ocrImages');
+
+    if (!shouldOcr) {
+        return;
+    }
+
+    try {
+        const recognizedText = textExtractingService.ocrTextFromBuffer(buffer);
+        note.saveNoteAttachment('plainText', 'text/plain', recognizedText);
+    }
+    catch (e) {
+        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
+    }
+}
+
+/**
+ * Things which have to be executed after updating content, but asynchronously (separate transaction):
+ * link scanning, image OCR and PDF text extraction, in that order.
+ *
+ * createNewNote and downloadImages start this function without awaiting it, so a
+ * failure in any step is logged here instead of escaping as an unhandled promise
+ * rejection. The returned promise therefore never rejects because of a failing step.
+ *
+ * NOTE(review): an async function runs synchronously up to its first `await`, so
+ * scanForLinks and runOcr still execute inside the caller's call stack; only the PDF
+ * extraction is awaited. Confirm this matches the "separate transaction" intent.
+ *
+ * @param note - note whose content was just written
+ * @param content - the new content; handed unchanged to every step
+ * @returns {Promise<void>} settles after PDF text extraction finished or a failure was logged
+ */
+async function asyncPostProcessContent(note, content) {
+    try {
+        scanForLinks(note, content);
+        runOcr(note, content);
+        await textExtractingService.extractTextFromPdf(note, content);
+    }
+    catch (e) {
+        // note may be missing (scanForLinks tolerates that), so do not dereference it blindly here
+        const noteId = note ? note.noteId : null;
+
+        log.error(`Post-processing content of note '${noteId}' failed with error '${e.message}', stack ${e.stack}`);
+    }
+}
+
 function eraseNotes(noteIdsToErase) {
    if (noteIdsToErase.length === 0) {
        return;
@@ -1006,7 +1030,6 @@ module.exports = {
    updateNoteData,
    undeleteNote,
    protectNoteRecursively,
-    scanForLinks,
    duplicateSubtree,
    duplicateSubtreeWithoutRoot,
    getUndeletedParentBranchIds,
@@ -1014,5 +1037,6 @@ module.exports = {
    eraseDeletedNotesNow,
    eraseNotesWithDeleteId,
    saveNoteRevisionIfNeeded,
-    downloadImages
+    downloadImages,
+    asyncPostProcessContent
 };