mirror of
				https://github.com/zadam/trilium.git
				synced 2025-10-31 18:36:30 +01:00 
			
		
		
		
	ocr wip
This commit is contained in:
		
							
								
								
									
										1413
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1413
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -33,7 +33,7 @@ | |||||||
|     "archiver": "5.3.1", |     "archiver": "5.3.1", | ||||||
|     "async-mutex": "0.4.0", |     "async-mutex": "0.4.0", | ||||||
|     "axios": "1.2.5", |     "axios": "1.2.5", | ||||||
|     "better-sqlite3": "7.4.5", |     "better-sqlite3": "8.0.1", | ||||||
|     "canvas": "2.11.0", |     "canvas": "2.11.0", | ||||||
|     "chokidar": "3.5.3", |     "chokidar": "3.5.3", | ||||||
|     "cls-hooked": "4.2.2", |     "cls-hooked": "4.2.2", | ||||||
| @@ -95,7 +95,7 @@ | |||||||
|   }, |   }, | ||||||
|   "devDependencies": { |   "devDependencies": { | ||||||
|     "cross-env": "7.0.3", |     "cross-env": "7.0.3", | ||||||
|     "electron": "16.2.8", |     "electron": "23.0.0-beta.6", | ||||||
|     "electron-builder": "23.6.0", |     "electron-builder": "23.6.0", | ||||||
|     "electron-packager": "17.1.1", |     "electron-packager": "17.1.1", | ||||||
|     "electron-rebuild": "3.2.9", |     "electron-rebuild": "3.2.9", | ||||||
|   | |||||||
| @@ -48,6 +48,14 @@ function isEntityEventsDisabled() { | |||||||
|     return !!namespace.get('disableEntityEvents'); |     return !!namespace.get('disableEntityEvents'); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | function isOcrDisabled() { | ||||||
|  |     return !!namespace.get('disableOcr'); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | function disableOcr() { | ||||||
|  |     namespace.set('disableOcr', true); | ||||||
|  | } | ||||||
|  |  | ||||||
| function getAndClearEntityChangeIds() { | function getAndClearEntityChangeIds() { | ||||||
|     const entityChangeIds = namespace.get('entityChangeIds') || []; |     const entityChangeIds = namespace.get('entityChangeIds') || []; | ||||||
|  |  | ||||||
| @@ -92,5 +100,7 @@ module.exports = { | |||||||
|     reset, |     reset, | ||||||
|     getAndClearEntityChangeIds, |     getAndClearEntityChangeIds, | ||||||
|     addEntityChange, |     addEntityChange, | ||||||
|     ignoreEntityChangeIds |     ignoreEntityChangeIds, | ||||||
|  |     isOcrDisabled, | ||||||
|  |     disableOcr | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -12,8 +12,7 @@ const sanitizeFilename = require('sanitize-filename'); | |||||||
| const isSvg = require('is-svg'); | const isSvg = require('is-svg'); | ||||||
| const isAnimated = require('is-animated'); | const isAnimated = require('is-animated'); | ||||||
| const htmlSanitizer = require("./html_sanitizer"); | const htmlSanitizer = require("./html_sanitizer"); | ||||||
| const OCRAD = require('ocrad.js'); | const textExtractingService = require("./text_extracting"); | ||||||
| const Canvas = require('canvas'); |  | ||||||
|  |  | ||||||
| async function processImage(uploadBuffer, originalName, shrinkImageSwitch) { | async function processImage(uploadBuffer, originalName, shrinkImageSwitch) { | ||||||
|     const compressImages = optionService.getOptionBool("compressImages"); |     const compressImages = optionService.getOptionBool("compressImages"); | ||||||
| @@ -128,7 +127,7 @@ function saveImage(parentNoteId, uploadBuffer, originalName, shrinkImageSwitch, | |||||||
|             note.setContent(buffer); |             note.setContent(buffer); | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         runOcr(note, buffer); |         textExtractingService.runOcr(note, buffer); | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     return { |     return { | ||||||
|   | |||||||
| @@ -729,27 +729,12 @@ function scanForLinks(note, content) { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| function runOcr(note, buffer) { |  | ||||||
|     if (!note.isImage() || !optionService.getOptionBool('ocrImages')) { |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     try { |  | ||||||
|         const plainText = textExtractingService.ocrTextFromBuffer(buffer); |  | ||||||
|  |  | ||||||
|         note.saveNoteAncillary('plainText', 'text/plain', plainText); |  | ||||||
|     } |  | ||||||
|     catch (e) { |  | ||||||
|         log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * Things which have to be executed after updating content, but asynchronously (separate transaction) |  * Things which have to be executed after updating content, but asynchronously (separate transaction) | ||||||
|  */ |  */ | ||||||
| async function asyncPostProcessContent(note, content) { | async function asyncPostProcessContent(note, content) { | ||||||
|     scanForLinks(note, content); |     scanForLinks(note, content); | ||||||
|     runOcr(note, content); |     await textExtractingService.runOcr(note, content); | ||||||
|     await textExtractingService.extractTextFromPdf(note, content); |     await textExtractingService.extractTextFromPdf(note, content); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,7 +1,8 @@ | |||||||
| const Canvas = require("canvas"); | const Canvas = require("canvas"); | ||||||
| const OCRAD = require("ocrad.js"); | const OCRAD = require("ocrad.js"); | ||||||
| const log = require("./log.js"); | const log = require("./log"); | ||||||
| const optionService = require("./options.js"); | const optionService = require("./options"); | ||||||
|  | const cls = require("./cls"); | ||||||
|  |  | ||||||
| function ocrFromByteArray(img) { | function ocrFromByteArray(img) { | ||||||
|     // byte array contains raw uncompressed pixel data |     // byte array contains raw uncompressed pixel data | ||||||
| @@ -85,7 +86,7 @@ async function extractTextFromPdf(note, buffer) { | |||||||
|             content.items.forEach(({str}) => strings.push(str)); |             content.items.forEach(({str}) => strings.push(str)); | ||||||
|  |  | ||||||
|             try { |             try { | ||||||
|                 if (optionService.getOptionBool('ocrImages')) { |                 if (optionService.getOptionBool('ocrImages') && !cls.isOcrDisabled()) { | ||||||
|                     await ocrTextFromPdfImages(pdfjsLib, page, strings); |                     await ocrTextFromPdfImages(pdfjsLib, page, strings); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
| @@ -117,13 +118,37 @@ async function ocrTextFromBuffer(buffer) { | |||||||
|     const canvas = new Canvas.createCanvas(img.width, img.height); |     const canvas = new Canvas.createCanvas(img.width, img.height); | ||||||
|     const ctx = canvas.getContext('2d'); |     const ctx = canvas.getContext('2d'); | ||||||
|     ctx.drawImage(img, 0, 0, img.width, img.height); |     ctx.drawImage(img, 0, 0, img.width, img.height); | ||||||
|  |  | ||||||
|     const plainText = OCRAD(canvas); |     const plainText = OCRAD(canvas); | ||||||
|  |  | ||||||
|     log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`); |     log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`); | ||||||
|     return plainText; |     return plainText; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | async function runOcr(note, buffer) { | ||||||
|  |     console.log("buffer length", buffer.length); | ||||||
|  |  | ||||||
|  |     if (!note.isImage() | ||||||
|  |         || !optionService.getOptionBool('ocrImages') | ||||||
|  |         || cls.isOcrDisabled() | ||||||
|  |         || buffer.length === 0 | ||||||
|  |     ) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     try { | ||||||
|  |         const plainText = await ocrTextFromBuffer(buffer); | ||||||
|  |  | ||||||
|  |         console.log("OCR", plainText); | ||||||
|  |  | ||||||
|  |         note.saveNoteAncillary('plainText', 'text/plain', plainText); | ||||||
|  |     } | ||||||
|  |     catch (e) { | ||||||
|  |         log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| module.exports = { | module.exports = { | ||||||
|     ocrTextFromBuffer, |     runOcr, | ||||||
|     extractTextFromPdf |     extractTextFromPdf | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -13,6 +13,7 @@ const yauzl = require("yauzl"); | |||||||
| const htmlSanitizer = require('./html_sanitizer'); | const htmlSanitizer = require('./html_sanitizer'); | ||||||
| const sql = require('./sql'); | const sql = require('./sql'); | ||||||
| const options = require('./options'); | const options = require('./options'); | ||||||
|  | const cls = require('./cls'); | ||||||
| const {USER_GUIDE_ZIP_DIR} = require('./resource_dir'); | const {USER_GUIDE_ZIP_DIR} = require('./resource_dir'); | ||||||
|  |  | ||||||
| async function importUserGuideIfNeeded() { | async function importUserGuideIfNeeded() { | ||||||
| @@ -33,6 +34,8 @@ async function importUserGuideIfNeeded() { | |||||||
|     const hiddenRoot = becca.getNote("_hidden"); |     const hiddenRoot = becca.getNote("_hidden"); | ||||||
|     const data = await fs.readFile(USER_GUIDE_ZIP_DIR + "/user-guide.zip", "binary"); |     const data = await fs.readFile(USER_GUIDE_ZIP_DIR + "/user-guide.zip", "binary"); | ||||||
|  |  | ||||||
|  |     cls.disableOcr(); // no OCR needed for user guide images | ||||||
|  |  | ||||||
|     await importZip(Buffer.from(data, 'binary'), hiddenRoot); |     await importZip(Buffer.from(data, 'binary'), hiddenRoot); | ||||||
|  |  | ||||||
|     options.setOption('userGuideSha256Hash', userGuideSha256HashInFile); |     options.setOption('userGuideSha256Hash', userGuideSha256HashInFile); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user