mirror of
				https://github.com/zadam/trilium.git
				synced 2025-10-26 07:46:30 +01:00 
			
		
		
		
	added image OCR and parsing text from PDF (and OCR of PDF images)
This commit is contained in:
		
							
								
								
									
										334
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										334
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -24,7 +24,7 @@ | ||||
|     "test-jasmine": "jasmine", | ||||
|     "test-es6": "node -r esm spec-es6/attribute_parser.spec.js ", | ||||
|     "test": "npm run test-jasmine && npm run test-es6", | ||||
|     "postinstall": "rimraf ./node_modules/canvas" | ||||
|     "postinstall": "node src-build/fix_pdfjs.js" | ||||
|   }, | ||||
|   "dependencies": { | ||||
|     "@braintree/sanitize-url": "6.0.2", | ||||
| @@ -72,7 +72,7 @@ | ||||
|     "normalize-strings": "1.1.1", | ||||
|     "ocrad.js": "antimatter15/ocrad.js#master", | ||||
|     "open": "8.4.0", | ||||
|     "pdfjs-dist": "2.8.335", | ||||
|     "pdfjs-dist": "3.2.146", | ||||
|     "rand-token": "1.0.1", | ||||
|     "react": "17.0.2", | ||||
|     "react-dom": "17.0.2", | ||||
|   | ||||
							
								
								
									
										12
									
								
								src-build/fix_pdfjs.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								src-build/fix_pdfjs.js
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,12 @@ | ||||
/*
 * Post-install patch for the installed pdfjs-dist package: points the "main" entry of its
 * package.json at the legacy build and writes the manifest back (2-space indented JSON).
 */
const fs = require("fs");

const PACKAGE_JSON_PATH = './node_modules/pdfjs-dist/package.json';

const rawManifest = fs.readFileSync(PACKAGE_JSON_PATH).toString();
const manifest = JSON.parse(rawManifest);

// non-legacy build doesn't work on node 16 at least
manifest.main = "legacy/build/pdf.js";

const patchedManifest = JSON.stringify(manifest, null, 2);

fs.writeFileSync(PACKAGE_JSON_PATH, patchedManifest);
| @@ -351,6 +351,12 @@ class BNote extends AbstractBeccaEntity { | ||||
|             && this.mime === "text/html"; | ||||
|     } | ||||
|  | ||||
|     /** @returns {boolean} true if this note is an image */ | ||||
|     isImage() { | ||||
|         return this.type === 'image' | ||||
|             || (this.type === 'file' && this.mime?.startsWith('image/')); | ||||
|     } | ||||
|  | ||||
|     /** @returns {boolean} true if the note has string content (not binary) */ | ||||
|     isStringNote() { | ||||
|         return utils.isStringNote(this.type, this.mime); | ||||
|   | ||||
| @@ -123,7 +123,7 @@ function register(router) { | ||||
|  | ||||
|         note.setContent(req.body); | ||||
|  | ||||
|         noteService.scanForLinks(note); | ||||
|         noteService.asyncPostProcessContent(note, req.body); | ||||
|  | ||||
|         return res.sendStatus(204); | ||||
|     }); | ||||
|   | ||||
| @@ -3,7 +3,7 @@ | ||||
| const protectedSessionService = require('../../services/protected_session'); | ||||
| const utils = require('../../services/utils'); | ||||
| const log = require('../../services/log'); | ||||
| const noteRevisionService = require('../../services/note_revisions'); | ||||
| const noteService = require('../../services/notes'); | ||||
| const tmp = require('tmp'); | ||||
| const fs = require('fs'); | ||||
| const { Readable } = require('stream'); | ||||
| @@ -31,21 +31,7 @@ function updateFile(req) { | ||||
|  | ||||
|     note.setLabel('originalFileName', file.originalname); | ||||
|  | ||||
|     if (note.mime === 'application/pdf') { | ||||
|         const pdfjsLib = require("pdfjs-dist"); | ||||
|  | ||||
|         (async () => | ||||
|         { | ||||
|             let doc = await pdfjsLib.getDocument({data: file.buffer}).promise; | ||||
|             let page1 = await doc.getPage(1); | ||||
|             let content = await page1.getTextContent(); | ||||
|             let strings = content.items.map(function (item) { | ||||
|                 return item.str; | ||||
|             }); | ||||
|  | ||||
|             console.log(strings); | ||||
|         })(); | ||||
|     } | ||||
|     noteService.asyncPostProcessContent(note, file.buffer); | ||||
|  | ||||
|     return { | ||||
|         uploaded: true | ||||
|   | ||||
| @@ -65,24 +65,6 @@ function getImageMimeFromExtension(ext) { | ||||
|     return `image/${ext === 'svg' ? 'svg+xml' : ext}`; | ||||
| } | ||||
|  | ||||
| function runOcr(note, buffer) { | ||||
|     if (!optionService.getOptionBool('ocrImages')) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const start = Date.now(); | ||||
|     const img = new Canvas.Image(); | ||||
|     img.src = buffer; | ||||
|     const canvas = new Canvas.createCanvas(img.width, img.height); | ||||
|     const ctx = canvas.getContext('2d'); | ||||
|     ctx.drawImage(img, 0, 0, img.width, img.height); | ||||
|     const plainText = OCRAD(canvas); | ||||
|  | ||||
|     log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`); | ||||
|  | ||||
|     note.saveNoteAttachment('plainText', 'text/plain', plainText); | ||||
| } | ||||
|  | ||||
| function updateImage(noteId, uploadBuffer, originalName) { | ||||
|     log.info(`Updating image ${noteId}: ${originalName}`); | ||||
|  | ||||
|   | ||||
| @@ -335,7 +335,7 @@ function importEnex(taskContext, file, parentNote) { | ||||
|         // save updated content with links to files/images | ||||
|         noteEntity.setContent(content); | ||||
|  | ||||
|         noteService.scanForLinks(noteEntity); | ||||
|         noteService.asyncPostProcessContent(noteEntity, content); | ||||
|  | ||||
|         updateDates(noteEntity.noteId, utcDateCreated, utcDateModified); | ||||
|     } | ||||
|   | ||||
| @@ -520,7 +520,8 @@ async function importZip(taskContext, fileBuffer, importRootNote) { | ||||
|     }); | ||||
|  | ||||
|     for (const noteId in createdNoteIds) { // now the noteIds are unique | ||||
|         noteService.scanForLinks(becca.getNote(noteId)); | ||||
|         const note = becca.getNote(noteId); | ||||
|         await noteService.asyncPostProcessContent(note, note.getContent()); | ||||
|  | ||||
|         if (!metaFile) { | ||||
|             // if there's no meta file then the notes are created based on the order in that zip file but that | ||||
|   | ||||
| @@ -23,6 +23,7 @@ const dayjs = require("dayjs"); | ||||
| const htmlSanitizer = require("./html_sanitizer"); | ||||
| const ValidationError = require("../errors/validation_error"); | ||||
| const noteTypesService = require("./note_types"); | ||||
| const textExtractingService = require("./text_extracting"); | ||||
|  | ||||
| function getNewNotePosition(parentNoteId) { | ||||
|     const note = becca.notes[parentNoteId]; | ||||
| @@ -191,7 +192,7 @@ function createNewNote(params) { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         scanForLinks(note); | ||||
|         asyncPostProcessContent(note, params.content); | ||||
|  | ||||
|         copyChildAttributes(parentNote, note); | ||||
|  | ||||
| @@ -492,7 +493,7 @@ function downloadImages(noteId, content) { | ||||
|                 if (updatedContent !== origContent) { | ||||
|                     origNote.setContent(updatedContent); | ||||
|  | ||||
|                     scanForLinks(origNote); | ||||
|                     asyncPostProcessContent(origNote, updatedContent); | ||||
|  | ||||
|                     eventService.emit(eventService.ENTITY_CHANGED, { | ||||
|                         entityName: 'note_contents', | ||||
| @@ -711,13 +712,12 @@ function getUndeletedParentBranchIds(noteId, deleteId) { | ||||
|                       AND parentNote.isDeleted = 0`, [noteId, deleteId]); | ||||
| } | ||||
|  | ||||
| function scanForLinks(note) { | ||||
| function scanForLinks(note, content) { | ||||
|     if (!note || !['text', 'relationMap'].includes(note.type)) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     try { | ||||
|         const content = note.getContent(); | ||||
|         const newContent = saveLinks(note, content); | ||||
|  | ||||
|         if (content !== newContent) { | ||||
| @@ -729,6 +729,30 @@ function scanForLinks(note) { | ||||
|     } | ||||
| } | ||||
|  | ||||
/**
 * Runs OCR over the content of an image note and stores the recognized text
 * in the note's 'plainText' attachment.
 *
 * Does nothing when there is no note, the note is not an image, or the 'ocrImages' option is off.
 * OCR failures are logged and not propagated to the caller.
 *
 * @param {object} note - note being processed; must provide isImage(), noteId and saveNoteAttachment()
 * @param {*} buffer - image content handed to textExtractingService.ocrTextFromBuffer()
 * @returns {Promise<void>} settles once the attachment has been saved or the failure has been logged
 */
async function runOcr(note, buffer) {
    // scanForLinks() tolerates a missing note, so this step does as well
    if (!note || !note.isImage() || !optionService.getOptionBool('ocrImages')) {
        return;
    }

    try {
        // ocrTextFromBuffer() is async - without the await a pending Promise would be saved as the
        // attachment content and a rejection would bypass this try/catch
        const plainText = await textExtractingService.ocrTextFromBuffer(buffer);

        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
|  | ||||
/**
 * Things which have to be executed after updating content, but asynchronously (separate transaction)
 *
 * Runs, in this order: link scanning, image OCR and PDF text extraction.
 *
 * @param {object} note - note whose content has just been updated; a missing note is ignored
 * @param {*} content - the new content, passed on unchanged to every step
 * @returns {Promise<void>} resolves when all steps have finished
 */
async function asyncPostProcessContent(note, content) {
    // scanForLinks() already ignores a missing note, but the OCR and PDF steps dereference it;
    // bailing out here avoids a rejected promise that fire-and-forget callers would never observe
    if (!note) {
        return;
    }

    scanForLinks(note, content);

    // awaited so that an asynchronous OCR step has completed before the returned promise resolves
    // (harmless if the step is synchronous)
    await runOcr(note, content);
    await textExtractingService.extractTextFromPdf(note, content);
}
|  | ||||
| function eraseNotes(noteIdsToErase) { | ||||
|     if (noteIdsToErase.length === 0) { | ||||
|         return; | ||||
| @@ -1006,7 +1030,6 @@ module.exports = { | ||||
|     updateNoteData, | ||||
|     undeleteNote, | ||||
|     protectNoteRecursively, | ||||
|     scanForLinks, | ||||
|     duplicateSubtree, | ||||
|     duplicateSubtreeWithoutRoot, | ||||
|     getUndeletedParentBranchIds, | ||||
| @@ -1014,5 +1037,6 @@ module.exports = { | ||||
|     eraseDeletedNotesNow, | ||||
|     eraseNotesWithDeleteId, | ||||
|     saveNoteRevisionIfNeeded, | ||||
|     downloadImages | ||||
|     downloadImages, | ||||
|     asyncPostProcessContent | ||||
| }; | ||||
|   | ||||
| @@ -91,6 +91,7 @@ const defaultOptions = [ | ||||
|     { name: 'disableTray', value: 'false', isSynced: false }, | ||||
|     { name: 'userGuideSha256Hash', value: '', isSynced: true }, | ||||
|     { name: 'ocrImages', value: 'true', isSynced: true }, | ||||
|     { name: 'extractTextFromPdf', value: 'true', isSynced: true }, | ||||
| ]; | ||||
|  | ||||
| function initStartupOptions() { | ||||
|   | ||||
| @@ -53,7 +53,9 @@ class NoteContentFulltextExp extends Expression { | ||||
|                 FROM note_attachments JOIN note_attachment_contents USING (noteAttachmentId)  | ||||
|                 WHERE name IN ('plainText') AND isDeleted = 0`)) { | ||||
|  | ||||
|             this.findInText(row, inputNoteSet, resultNoteSet); | ||||
|             if (!resultNoteSet.hasNoteId(row.noteId)) { | ||||
|                 this.findInText(row, inputNoteSet, resultNoteSet); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         return resultNoteSet; | ||||
|   | ||||
							
								
								
									
										129
									
								
								src/services/text_extracting.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										129
									
								
								src/services/text_extracting.js
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,129 @@ | ||||
| const Canvas = require("canvas"); | ||||
| const OCRAD = require("ocrad.js"); | ||||
| const log = require("./log.js"); | ||||
| const optionService = require("./options.js"); | ||||
|  | ||||
/**
 * Runs OCR over an image given as raw, uncompressed pixel data.
 *
 * Supported pixel layouts (img.kind):
 *   2 - RGB_24BPP  (an opaque alpha channel is added)
 *   3 - RGBA_32BPP
 * kind 1 (GRAYSCALE_1BPP) is not supported.
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img - raw image
 * @returns {string|null} recognized text, or null when the data type or pixel layout is not supported
 */
function ocrFromByteArray(img) {
    if (!(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    const start = Date.now();
    // createCanvas() is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;
    const hasAlpha = img.kind === 3;

    // j walks the RGBA target, k walks the RGB(A) source
    for (let j = 0, k = 0, jj = img.width * img.height * 4; j < jj;) {
        imageBytes[j++] = img.data[k++];
        imageBytes[j++] = img.data[k++];
        imageBytes[j++] = img.data[k++];
        // in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
        imageBytes[j++] = hasAlpha ? img.data[k++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);
    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
|  | ||||
/**
 * Runs OCR over the images painted on a PDF page and appends every non-empty result to `strings`.
 *
 * @param {object} pdfjsLib - the pdf.js module; only its OPS table is read
 * @param {object} page - pdf.js page; getOperatorList() and objs.get() are used
 * @param {string[]} strings - output array, recognized texts are pushed to it
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const {fnArray, argsArray} = await page.getOperatorList();
    const {paintXObject, paintImageXObject} = pdfjsLib.OPS;

    // fnArray and argsArray are parallel arrays, so they are walked by index. Looking the index up
    // with argsArray.indexOf(args) is quadratic and resolves to the wrong operator whenever the same
    // args reference occurs more than once.
    for (let i = 0; i < fnArray.length; i++) {
        const fn = fnArray[i];

        if (fn !== paintXObject && fn !== paintImageXObject) {
            continue;
        }

        const imgKey = argsArray[i]?.[0];

        if (imgKey === undefined || imgKey === null) {
            // paint operator without an object key - nothing to look up
            continue;
        }

        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
|  | ||||
/**
 * Extracts the text of a PDF note (and, when the 'ocrImages' option is on, the OCR-ed text of the
 * images on its pages) and saves it as the note's 'plainText' attachment.
 *
 * Does nothing unless the note's mime is 'application/pdf' and the 'extractTextFromPdf' option is on.
 * Failures are logged and not propagated to the caller.
 *
 * @param {object} note - note being processed; mime, noteId and saveNoteAttachment() are used
 * @param {*} buffer - PDF content, passed to pdf.js as `data`
 * @returns {Promise<void>}
 */
async function extractTextFromPdf(note, buffer) {
    if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }

    let loadingTask = null;

    try {
        const pdfjsLib = require("pdfjs-dist");
        loadingTask = pdfjsLib.getDocument({data: buffer});
        const doc = await loadingTask.promise;
        const strings = [];

        for (let p = 1; p <= doc.numPages; p++) {
            const page = await doc.getPage(p);

            const content = await page.getTextContent({
                normalizeWhitespace: true,
                disableCombineTextItems: false
            });

            for (const {str} of content.items) {
                strings.push(str);
            }

            // OCR of embedded images is best effort - a failure must not discard the extracted text
            try {
                if (optionService.getOptionBool('ocrImages')) {
                    await ocrTextFromPdfImages(pdfjsLib, page, strings);
                }
            }
            catch (e) {
                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
            }
        }

        // drop empty / whitespace-only fragments before joining
        const plainText = strings.filter(str => str?.trim()).join(" ");

        note.saveNoteAttachment('plainText', 'text/plain', plainText);
    }
    catch (e) {
        // logged as an error, same as a failed image OCR in the notes service
        log.error(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
    finally {
        // per the pdf.js API, destroy() tears down the loading task together with the loaded document;
        // without it everything pdf.js allocated for this PDF stays around
        try {
            await loadingTask?.destroy();
        }
        catch (e) {
            log.info(`Could not release PDF resources of note '${note.noteId}': '${e.message}'`);
        }
    }
}
|  | ||||
/**
 * Runs OCR over an encoded image.
 *
 * @param {Buffer} buffer - image in an encoded format (JPEG, PNG etc.)
 * @returns {Promise<string>} recognized text
 * @throws {Error} (as a rejection) when the image cannot be loaded
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    const img = await new Promise((res, rej) => {
        const image = new Canvas.Image();
        image.onload = () => res(image);
        // the message is unchanged; the original error is additionally preserved as `cause`
        image.onerror = err => rej(new Error("Can't load the image " + err, { cause: err }));
        // handlers are attached before the source is assigned so the load result is not missed
        image.src = buffer;
    });

    // createCanvas() is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(img, 0, 0, img.width, img.height);
    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
    return plainText;
}
|  | ||||
| module.exports = { | ||||
|     ocrTextFromBuffer, | ||||
|     extractTextFromPdf | ||||
| }; | ||||
		Reference in New Issue
	
	Block a user