mirror of
				https://github.com/zadam/trilium.git
				synced 2025-10-31 18:36:30 +01:00 
			
		
		
		
	ocr wip
This commit is contained in:
		
							
								
								
									
										1413
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1413
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -33,7 +33,7 @@ | ||||
|     "archiver": "5.3.1", | ||||
|     "async-mutex": "0.4.0", | ||||
|     "axios": "1.2.5", | ||||
|     "better-sqlite3": "7.4.5", | ||||
|     "better-sqlite3": "8.0.1", | ||||
|     "canvas": "2.11.0", | ||||
|     "chokidar": "3.5.3", | ||||
|     "cls-hooked": "4.2.2", | ||||
| @@ -95,7 +95,7 @@ | ||||
|   }, | ||||
|   "devDependencies": { | ||||
|     "cross-env": "7.0.3", | ||||
|     "electron": "16.2.8", | ||||
|     "electron": "23.0.0-beta.6", | ||||
|     "electron-builder": "23.6.0", | ||||
|     "electron-packager": "17.1.1", | ||||
|     "electron-rebuild": "3.2.9", | ||||
|   | ||||
| @@ -48,6 +48,14 @@ function isEntityEventsDisabled() { | ||||
|     return !!namespace.get('disableEntityEvents'); | ||||
| } | ||||
|  | ||||
| function isOcrDisabled() { | ||||
|     return !!namespace.get('disableOcr'); | ||||
| } | ||||
|  | ||||
| function disableOcr() { | ||||
|     namespace.set('disableOcr', true); | ||||
| } | ||||
|  | ||||
| function getAndClearEntityChangeIds() { | ||||
|     const entityChangeIds = namespace.get('entityChangeIds') || []; | ||||
|  | ||||
| @@ -92,5 +100,7 @@ module.exports = { | ||||
|     reset, | ||||
|     getAndClearEntityChangeIds, | ||||
|     addEntityChange, | ||||
|     ignoreEntityChangeIds | ||||
|     ignoreEntityChangeIds, | ||||
|     isOcrDisabled, | ||||
|     disableOcr | ||||
| }; | ||||
|   | ||||
| @@ -12,8 +12,7 @@ const sanitizeFilename = require('sanitize-filename'); | ||||
| const isSvg = require('is-svg'); | ||||
| const isAnimated = require('is-animated'); | ||||
| const htmlSanitizer = require("./html_sanitizer"); | ||||
| const OCRAD = require('ocrad.js'); | ||||
| const Canvas = require('canvas'); | ||||
| const textExtractingService = require("./text_extracting"); | ||||
|  | ||||
| async function processImage(uploadBuffer, originalName, shrinkImageSwitch) { | ||||
|     const compressImages = optionService.getOptionBool("compressImages"); | ||||
| @@ -128,7 +127,7 @@ function saveImage(parentNoteId, uploadBuffer, originalName, shrinkImageSwitch, | ||||
|             note.setContent(buffer); | ||||
|         }); | ||||
|  | ||||
|         runOcr(note, buffer); | ||||
|         textExtractingService.runOcr(note, buffer); | ||||
|     }); | ||||
|  | ||||
|     return { | ||||
|   | ||||
| @@ -729,27 +729,12 @@ function scanForLinks(note, content) { | ||||
|     } | ||||
| } | ||||
|  | ||||
| function runOcr(note, buffer) { | ||||
|     if (!note.isImage() || !optionService.getOptionBool('ocrImages')) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     try { | ||||
|         const plainText = textExtractingService.ocrTextFromBuffer(buffer); | ||||
|  | ||||
|         note.saveNoteAncillary('plainText', 'text/plain', plainText); | ||||
|     } | ||||
|     catch (e) { | ||||
|         log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`); | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Things which have to be executed after updating content, but asynchronously (separate transaction) | ||||
|  */ | ||||
| async function asyncPostProcessContent(note, content) { | ||||
|     scanForLinks(note, content); | ||||
|     runOcr(note, content); | ||||
|     await textExtractingService.runOcr(note, content); | ||||
|     await textExtractingService.extractTextFromPdf(note, content); | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -1,7 +1,8 @@ | ||||
| const Canvas = require("canvas"); | ||||
| const OCRAD = require("ocrad.js"); | ||||
| const log = require("./log.js"); | ||||
| const optionService = require("./options.js"); | ||||
| const log = require("./log"); | ||||
| const optionService = require("./options"); | ||||
| const cls = require("./cls"); | ||||
|  | ||||
| function ocrFromByteArray(img) { | ||||
|     // byte array contains raw uncompressed pixel data | ||||
| @@ -85,7 +86,7 @@ async function extractTextFromPdf(note, buffer) { | ||||
|             content.items.forEach(({str}) => strings.push(str)); | ||||
|  | ||||
|             try { | ||||
|                 if (optionService.getOptionBool('ocrImages')) { | ||||
|                 if (optionService.getOptionBool('ocrImages') && !cls.isOcrDisabled()) { | ||||
|                     await ocrTextFromPdfImages(pdfjsLib, page, strings); | ||||
|                 } | ||||
|             } | ||||
| @@ -117,13 +118,37 @@ async function ocrTextFromBuffer(buffer) { | ||||
|     const canvas = new Canvas.createCanvas(img.width, img.height); | ||||
|     const ctx = canvas.getContext('2d'); | ||||
|     ctx.drawImage(img, 0, 0, img.width, img.height); | ||||
|  | ||||
|     const plainText = OCRAD(canvas); | ||||
|  | ||||
|     log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`); | ||||
|     return plainText; | ||||
| } | ||||
|  | ||||
| async function runOcr(note, buffer) { | ||||
|     console.log("buffer length", buffer.length); | ||||
|  | ||||
|     if (!note.isImage() | ||||
|         || !optionService.getOptionBool('ocrImages') | ||||
|         || cls.isOcrDisabled() | ||||
|         || buffer.length === 0 | ||||
|     ) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     try { | ||||
|         const plainText = await ocrTextFromBuffer(buffer); | ||||
|  | ||||
|         console.log("OCR", plainText); | ||||
|  | ||||
|         note.saveNoteAncillary('plainText', 'text/plain', plainText); | ||||
|     } | ||||
|     catch (e) { | ||||
|         log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`); | ||||
|     } | ||||
| } | ||||
|  | ||||
| module.exports = { | ||||
|     ocrTextFromBuffer, | ||||
|     runOcr, | ||||
|     extractTextFromPdf | ||||
| }; | ||||
|   | ||||
| @@ -13,6 +13,7 @@ const yauzl = require("yauzl"); | ||||
| const htmlSanitizer = require('./html_sanitizer'); | ||||
| const sql = require('./sql'); | ||||
| const options = require('./options'); | ||||
| const cls = require('./cls'); | ||||
| const {USER_GUIDE_ZIP_DIR} = require('./resource_dir'); | ||||
|  | ||||
| async function importUserGuideIfNeeded() { | ||||
| @@ -33,6 +34,8 @@ async function importUserGuideIfNeeded() { | ||||
|     const hiddenRoot = becca.getNote("_hidden"); | ||||
|     const data = await fs.readFile(USER_GUIDE_ZIP_DIR + "/user-guide.zip", "binary"); | ||||
|  | ||||
|     cls.disableOcr(); // no OCR needed for user guide images | ||||
|  | ||||
|     await importZip(Buffer.from(data, 'binary'), hiddenRoot); | ||||
|  | ||||
|     options.setOption('userGuideSha256Hash', userGuideSha256HashInFile); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user