Compare commits

...

50 Commits

Author SHA1 Message Date
Elian Doran
6393d2c188 chore(ocr): remove trainneddata artifact 2026-04-01 17:08:15 +03:00
Elian Doran
d9f0a163cf refactor(ocr): use idiomatic status handling 2026-04-01 17:04:36 +03:00
Elian Doran
6534beec14 fix(ocr): errors not properly shown due to lack of convention 2026-04-01 16:58:34 +03:00
Elian Doran
6d050340ee fix(client): server errors don't reject the promise 2026-04-01 16:53:50 +03:00
Elian Doran
0e7f7fa208 chore(ocr): fix type issues & integrate ReadOnlyTextRepresentation 2026-04-01 16:45:38 +03:00
Elian Doran
287be0bd25 chore(scripts): integrate filter-tsc-output from standalone branch 2026-04-01 16:39:54 +03:00
Elian Doran
18cf2ff873 test(ocr): fix type issues 2026-04-01 16:35:45 +03:00
Elian Doran
b626fb448b refactor(ocr): get rid of require imports 2026-04-01 16:30:27 +03:00
Elian Doran
38f6fb5a7f refactor(ocr): rename ocr_last_processed to textExtractionLastProcessed 2026-04-01 16:26:16 +03:00
Elian Doran
5846df7d02 refactor(ocr): rename ocr_text to textRepresentation 2026-04-01 16:14:08 +03:00
Elian Doran
9462d6109c Merge remote-tracking branch 'origin/main' into feat/add-ocr-capabilities 2026-04-01 15:59:05 +03:00
Elian Doran
0d805a01c1 fix(deps): update dependency i18next to v26 (#9224) 2026-04-01 10:58:03 +03:00
copilot-swe-agent[bot]
7f1e4c0969 fix: remove showSupportNotice from i18next init options (removed in v26)
Agent-Logs-Url: https://github.com/TriliumNext/Trilium/sessions/41f772f7-49b7-4905-8b17-cf90165fc736

Co-authored-by: eliandoran <21236836+eliandoran@users.noreply.github.com>
2026-03-31 20:13:27 +00:00
renovate[bot]
e55cd7841f fix(deps): update dependency i18next to v26 2026-03-31 20:03:35 +00:00
Elian Doran
b9cef158d8 Merge remote-tracking branch 'origin/main' into feat/add-ocr-capabilities 2025-07-31 08:25:30 +03:00
Elian Doran
5ec6141369 feat(ocr): filter out text based on confidence 2025-07-26 14:57:12 +03:00
Elian Doran
55ac1e01f2 chore(ocr): improve ocr search result style 2025-07-26 14:15:45 +03:00
Elian Doran
65b58c3668 feat(ocr): auto-process images only if enabled in settings 2025-07-26 14:12:22 +03:00
Elian Doran
2cb4e5e8dc feat(ocr): run the image operation in the background 2025-07-26 14:07:23 +03:00
Elian Doran
72cea245f1 feat(ocr): automatically process images 2025-07-26 14:00:35 +03:00
Elian Doran
08ca86c68a chore(deps): move workspace dependencies to server 2025-07-26 13:48:28 +03:00
Elian Doran
925c9c1e7b feat(ocr): display OCR text only in search results 2025-07-26 12:55:52 +03:00
Elian Doran
6212ea0304 feat(ocr): display OCR text in search results 2025-07-26 12:41:30 +03:00
Elian Doran
f295592134 fix(ocr): search error due to scoring 2025-07-26 12:33:45 +03:00
Elian Doran
69b0973e6d feat(ocr): add a button to trigger an OCR manually 2025-07-26 12:18:20 +03:00
Elian Doran
422d318dac feat(ocr): add an option to display OCR text 2025-07-26 12:08:04 +03:00
Elian Doran
c55aa6ee88 refactor(ocr): unnecessary initialization logic 2025-07-26 11:56:48 +03:00
Elian Doran
090b175152 refactor(ocr): deduplicate mime types partially 2025-07-26 11:51:53 +03:00
Elian Doran
11e9b097a2 feat(ocr): basic processing of new files 2025-07-26 11:46:28 +03:00
Elian Doran
2adfc1d32b chore(ci): remove unnecessary change 2025-07-26 11:24:42 +03:00
Elian Doran
99fa5d89e7 Merge remote-tracking branch 'origin/main' into feat/add-ocr-capabilities 2025-07-26 10:33:01 +03:00
perf3ct
ca8cbf8ccf feat(ocr): add additional processors for OCR feature 2025-07-16 20:10:56 +00:00
perf3ct
6722d2d266 feat(ocr): implement new language selection form 2025-07-16 20:10:41 +00:00
perf3ct
508cbeaa1b feat(ocr): update this new migration to also add a ocr_last_processed column 2025-07-16 20:10:07 +00:00
perf3ct
e040865905 feat(ocr): add officeparser, pdf-parse, and sharp dependencies for ocr 2025-07-16 20:09:41 +00:00
perf3ct
a7878dd2c6 Merge branch 'main' into feat/add-ocr-capabilities 2025-07-16 17:54:32 +00:00
Jon Fuller
02980834ad Merge branch 'main' into feat/add-ocr-capabilities 2025-07-15 10:10:47 -07:00
perf3ct
2a8c8871c4 fix(dev): resolve issues with pnpm-lock.yaml 2025-07-14 16:41:02 +00:00
perf3ct
893be24c1d merge main into feature branch 2025-07-14 16:38:22 +00:00
perf3ct
9029f59410 feat(ocr): swap from custom table to using the blobs table, with a new column 2025-07-14 16:15:15 +00:00
Jon Fuller
4b5e8d33a6 Update playwright.yml 2025-06-10 15:37:05 -07:00
perf3ct
09196c045f fix(ocr): obviously don't need this migration file anymore 2025-06-10 20:59:17 +00:00
perf3ct
7868ebec1e fix(unit): also fix broken llm test 2025-06-10 20:51:34 +00:00
perf3ct
80a9182f05 feat(unit): ocr tests almost pass... 2025-06-10 20:41:40 +00:00
perf3ct
d20b3d854f feat(unit): ocr tests almost pass... 2025-06-10 20:36:52 +00:00
perf3ct
f1356228a3 feat(unit): ocr unit tests almost pass 2025-06-10 20:22:31 +00:00
perf3ct
a4adc51e50 fix(unit): resolve typecheck errors 2025-06-10 19:48:48 +00:00
perf3ct
864543e4f9 feat(ocr): drop confidence down a little bit 2025-06-10 19:22:46 +00:00
perf3ct
33a549202b fix(package): referenced wrong tesseract.js lol 2025-06-10 19:19:17 +00:00
perf3ct
c4a0219b18 feat(ocr): add unit tests, resolve double sent headers, and fix the wonderful tesseract.js path issues 2025-06-10 19:12:50 +00:00
47 changed files with 4221 additions and 150 deletions

View File

@@ -54,7 +54,7 @@
"draggabilly": "3.0.0",
"force-graph": "1.51.2",
"globals": "17.4.0",
"i18next": "25.10.10",
"i18next": "26.0.1",
"i18next-http-backend": "3.0.2",
"jquery": "4.0.0",
"jquery.fancytree": "2.38.5",

View File

@@ -302,6 +302,7 @@ export type CommandMappings = {
ninthTab: CommandData;
lastTab: CommandData;
showNoteSource: CommandData;
showNoteOCRText: CommandData;
showSQLConsole: CommandData;
showBackendLog: CommandData;
showCheatsheet: CommandData;

View File

@@ -148,6 +148,19 @@ export default class RootCommandExecutor extends Component {
}
}
/**
 * Opens the currently active note in a tab using the read-only OCR text
 * view (viewScope.viewMode === "ocr"). No-op when no note is active.
 */
async showNoteOCRTextCommand() {
const notePath = appContext.tabManager.getActiveContextNotePath();
if (notePath) {
await appContext.tabManager.openTabWithNoteWithHoisting(notePath, {
activate: true,
viewScope: {
viewMode: "ocr"
}
});
}
}
async showAttachmentsCommand() {
const notePath = appContext.tabManager.getActiveContextNotePath();

View File

@@ -32,6 +32,7 @@ export interface RenderOptions {
includeArchivedNotes?: boolean;
/** Set of note IDs that have already been seen during rendering to prevent infinite recursion. */
seenNoteIds?: Set<string>;
showTextRepresentation?: boolean;
}
const CODE_MIME_TYPES = new Set(["application/json"]);
@@ -55,7 +56,7 @@ export async function getRenderedContent(this: {} | { ctx: string }, entity: FNo
} else if (type === "code") {
await renderCode(entity, $renderedContent);
} else if (["image", "canvas", "mindMap", "spreadsheet"].includes(type)) {
renderImage(entity, $renderedContent, options);
await renderImage(entity, $renderedContent, options);
} else if (!options.tooltip && ["file", "pdf", "audio", "video"].includes(type)) {
await renderFile(entity, type, $renderedContent);
} else if (type === "mermaid") {
@@ -138,7 +139,7 @@ async function renderCode(note: FNote | FAttachment, $renderedContent: JQuery<HT
await applySingleBlockSyntaxHighlight($codeBlock, normalizeMimeTypeForCKEditor(note.mime));
}
function renderImage(entity: FNote | FAttachment, $renderedContent: JQuery<HTMLElement>, options: RenderOptions = {}) {
async function renderImage(entity: FNote | FAttachment, $renderedContent: JQuery<HTMLElement>, options: RenderOptions = {}) {
const encodedTitle = encodeURIComponent(entity.title);
let url;
@@ -178,9 +179,39 @@ function renderImage(entity: FNote | FAttachment, $renderedContent: JQuery<HTMLE
}
imageContextMenuService.setupContextMenu($img);
// Add OCR text display for image notes
if (entity instanceof FNote && options.showTextRepresentation) {
await addOCRTextIfAvailable(entity, $renderedContent);
}
}
async function renderFile(entity: FNote | FAttachment, type: string, $renderedContent: JQuery<HTMLElement>) {
/**
 * Fetches the OCR text for `note` and, when any text exists, appends a
 * styled ".ocr-text-section" block to `$content`.
 *
 * Failures (network errors, non-OK responses) are deliberately silent so
 * rendering keeps working when the OCR API is unavailable; they are only
 * logged at debug level.
 */
async function addOCRTextIfAvailable(note: FNote, $content: JQuery<HTMLElement>) {
    // Shape of the /api/ocr/notes/{noteId}/text payload this function relies on.
    interface OcrTextResponse {
        success: boolean;
        hasOcr: boolean;
        text: string;
    }

    try {
        const response = await fetch(`api/ocr/notes/${note.noteId}/text`);
        if (response.ok) {
            // Type the parsed JSON instead of letting it flow through as `any`.
            const data = await response.json() as OcrTextResponse;
            if (data.success && data.hasOcr && data.text) {
                const $ocrSection = $(`
                    <div class="ocr-text-section">
                        <div class="ocr-header">
                            <span class="bx bx-text"></span> ${t("ocr.extracted_text")}
                        </div>
                        <div class="ocr-content"></div>
                    </div>
                `);
                // .text() escapes the content, so untrusted OCR output cannot inject HTML.
                $ocrSection.find('.ocr-content').text(data.text);
                $content.append($ocrSection);
            }
        }
    } catch (error) {
        // Silently fail if OCR API is not available
        console.debug('Failed to fetch OCR text:', error);
    }
}
async function renderFile(entity: FNote | FAttachment, type: string, $renderedContent: JQuery<HTMLElement>, options: RenderOptions = {}) {
let entityType, entityId;
if (entity instanceof FNote) {
@@ -220,6 +251,11 @@ async function renderFile(entity: FNote | FAttachment, type: string, $renderedCo
$content.append($videoPreview);
}
// Add OCR text display for file notes
if (entity instanceof FNote && options.showTextRepresentation) {
await addOCRTextIfAvailable(entity, $content);
}
if (entityType === "notes" && "noteId" in entity) {
// TODO: we should make this available also for attachments, but there's a problem with "Open externally" support
// in attachment list

View File

@@ -24,8 +24,7 @@ export async function initLocale() {
backend: {
loadPath: `${window.glob.assetPath}/translations/{{lng}}/{{ns}}.json`
},
returnEmptyString: false,
showSupportNotice: false
returnEmptyString: false
});
await setDayjsLocale(locale);

View File

@@ -28,7 +28,7 @@ async function getLinkIcon(noteId: string, viewMode: ViewMode | undefined) {
return icon;
}
export type ViewMode = "default" | "source" | "attachments" | "contextual-help" | "note-map";
export type ViewMode = "default" | "source" | "attachments" | "contextual-help" | "note-map" | "ocr";
export interface ViewScope {
/**

View File

@@ -270,7 +270,11 @@ function ajax(url: string, method: string, data: unknown, headers: Headers, opts
} else if (opts.silentInternalServerError && jqXhr.status === 500) {
// report nothing
} else {
await reportError(method, url, jqXhr.status, jqXhr.responseText);
try {
await reportError(method, url, jqXhr.status, jqXhr.responseText);
} catch {
// reportError may throw (e.g. ValidationError); ensure rej() is still called below.
}
}
rej(jqXhr.responseText);

View File

@@ -2641,3 +2641,26 @@ iframe.print-iframe {
min-height: 50px;
align-items: center;
}
/* Box appended under rendered images/files that contains OCR-extracted text. */
.ocr-text-section {
margin: 10px 0;
padding: 10px;
background: var(--accented-background-color);
border-left: 3px solid var(--main-border-color);
text-align: left;
}

/* Section label ("Extracted Text (OCR)") shown above the text body. */
.ocr-header {
font-weight: bold;
margin-bottom: 8px;
font-size: 0.9em;
color: var(--muted-text-color);
}

/* The OCR text itself: capped height with scrolling, line breaks preserved. */
.ocr-content {
max-height: 150px;
overflow-y: auto;
font-size: 0.9em;
line-height: 1.4;
white-space: pre-wrap;
}

View File

@@ -691,6 +691,7 @@
"search_in_note": "Search in note",
"note_source": "Note source",
"note_attachments": "Note attachments",
"view_ocr_text": "View OCR text",
"open_note_externally": "Open note externally",
"open_note_externally_title": "File will be open in an external application and watched for changes. You'll then be able to upload the modified version back to Trilium.",
"open_note_custom": "Open note custom",
@@ -1259,7 +1260,22 @@
"enable_image_compression": "Enable image compression",
"max_image_dimensions": "Max width / height of an image (image will be resized if it exceeds this setting).",
"max_image_dimensions_unit": "pixels",
"jpeg_quality_description": "JPEG quality (10 - worst quality, 100 - best quality, 50 - 85 is recommended)"
"jpeg_quality_description": "JPEG quality (10 - worst quality, 100 - best quality, 50 - 85 is recommended)",
"ocr_section_title": "Optical Character Recognition (OCR)",
"enable_ocr": "Enable OCR for images",
"ocr_description": "Automatically extract text from images using OCR technology. This makes image content searchable within your notes.",
"ocr_auto_process": "Automatically process new images with OCR",
"ocr_language": "OCR Language",
"ocr_min_confidence": "Minimum confidence threshold",
"ocr_confidence_unit": "(0.0-1.0)",
"ocr_confidence_description": "Only extract text with confidence above this threshold. Lower values include more text but may be less accurate.",
"batch_ocr_title": "Process Existing Images",
"batch_ocr_description": "Process all existing images in your notes with OCR. This may take some time depending on the number of images.",
"batch_ocr_start": "Start Batch OCR Processing",
"batch_ocr_starting": "Starting batch OCR processing...",
"batch_ocr_progress": "Processing {{processed}} of {{total}} images...",
"batch_ocr_completed": "Batch OCR completed! Processed {{processed}} images.",
"batch_ocr_error": "Error during batch OCR: {{error}}"
},
"attachment_erasure_timeout": {
"attachment_erasure_timeout": "Attachment Erasure Timeout",
@@ -2067,6 +2083,20 @@
"calendar_view": {
"delete_note": "Delete note..."
},
"ocr": {
"extracted_text": "Extracted Text (OCR)",
"extracted_text_title": "Extracted Text (OCR)",
"loading_text": "Loading OCR text...",
"no_text_available": "No OCR text available",
"no_text_explanation": "This note has not been processed for OCR text extraction or no text was found.",
"failed_to_load": "Failed to load OCR text",
"extracted_on": "Extracted on: {{date}}",
"unknown_date": "Unknown",
"process_now": "Process OCR",
"processing": "Processing...",
"processing_started": "OCR processing has been started. Please wait a moment and refresh.",
"processing_failed": "Failed to start OCR processing"
},
"command_palette": {
"tree-action-name": "Tree: {{name}}",
"export_note_title": "Export Note",

View File

@@ -336,6 +336,8 @@ export async function getExtendedWidgetType(note: FNote | null | undefined, note
if (noteContext?.viewScope?.viewMode === "source") {
resultingType = "readOnlyCode";
} else if (noteContext.viewScope?.viewMode === "ocr") {
resultingType = "readOnlyOCRText";
} else if (noteContext.viewScope?.viewMode === "attachments") {
resultingType = noteContext.viewScope.attachmentId ? "attachmentDetail" : "attachmentList";
} else if (noteContext.viewScope?.viewMode === "note-map") {

View File

@@ -1,8 +1,9 @@
import { it, describe, expect } from "vitest";
import { buildNote } from "../../../test/easy-froca";
import { getBoardData } from "./data";
import { describe, expect,it } from "vitest";
import FBranch from "../../../entities/fbranch";
import froca from "../../../services/froca";
import { buildNote } from "../../../test/easy-froca";
import { getBoardData } from "./data";
describe("Board data", () => {
it("deduplicates cloned notes", async () => {

View File

@@ -27,6 +27,7 @@ const VIEW_MODE_ICON_MAPPINGS: Record<Exclude<ViewMode, "default">, string> = {
"contextual-help": "bx bx-help-circle",
"note-map": "bx bxs-network-chart",
attachments: "bx bx-paperclip",
ocr: "bx bx-text"
};
export default function TabSwitcher() {

View File

@@ -12,7 +12,7 @@ import { TypeWidgetProps } from "./type_widgets/type_widget";
* A `NoteType` altered by the note detail widget, taking into consideration whether the note is editable or not and adding special note types such as an empty one,
* for protected session or attachment information.
*/
export type ExtendedNoteType = Exclude<NoteType, "launcher" | "text" | "code" | "llmChat"> | "empty" | "readOnlyCode" | "readOnlyText" | "editableText" | "editableCode" | "attachmentDetail" | "attachmentList" | "protectedSession" | "sqlConsole" | "llmChat";
export type ExtendedNoteType = Exclude<NoteType, "launcher" | "text" | "code" | "llmChat"> | "empty" | "readOnlyCode" | "readOnlyText" | "readOnlyOCRText" | "editableText" | "editableCode" | "attachmentDetail" | "attachmentList" | "protectedSession" | "sqlConsole" | "llmChat";
export type TypeWidget = ((props: TypeWidgetProps) => VNode | JSX.Element | undefined);
type NoteTypeView = () => (Promise<{ default: TypeWidget } | TypeWidget> | TypeWidget);
@@ -78,6 +78,11 @@ export const TYPE_MAPPINGS: Record<ExtendedNoteType, NoteTypeMapping> = {
className: "note-detail-readonly-code",
printable: true
},
readOnlyOCRText: {
view: () => import("./type_widgets/ReadOnlyTextRepresentation"),
className: "note-detail-ocr-text",
printable: true
},
editableCode: {
view: async () => (await import("./type_widgets/code/Code")).EditableCode,
className: "note-detail-code",

View File

@@ -162,6 +162,7 @@ export function NoteContextMenu({ note, noteContext, itemsAtStart, itemsNearNote
<CommandItem command="openNoteExternally" icon="bx bx-file-find" disabled={isSearchOrBook || !isElectron} text={t("note_actions.open_note_externally")} title={t("note_actions.open_note_externally_title")} />
<CommandItem command="openNoteCustom" icon="bx bx-customize" disabled={isSearchOrBook || isMac || !isElectron} text={t("note_actions.open_note_custom")} />
<CommandItem command="showNoteSource" icon="bx bx-code" disabled={!hasSource} text={t("note_actions.note_source")} />
<CommandItem command="showNoteOCRText" icon="bx bx-text" disabled={!["image", "file"].includes(noteType)} text={t("note_actions.view_ocr_text")} />
{(syncServerHost && isElectron) &&
<CommandItem command="openNoteOnServer" icon="bx bx-world" disabled={!syncServerHost} text={t("note_actions.open_note_on_server")} />
}

View File

@@ -0,0 +1,145 @@
import { useEffect, useState } from "preact/hooks";
import { t } from "../../services/i18n";
import server from "../../services/server";
import toast from "../../services/toast";
import { TypeWidgetProps } from "./type_widget";
interface TextRepresentationResponse {
success: boolean;
text: string;
hasOcr: boolean;
extractedAt: string | null;
message?: string;
}
type State =
| { kind: "loading" }
| { kind: "loaded"; text: string; extractedAt: string | null }
| { kind: "empty" }
| { kind: "error"; message: string };
/**
 * Read-only viewer for a note's OCR-extracted text ("text representation").
 *
 * Fetches the text from `ocr/notes/{noteId}/text` and renders one of four
 * states: loading, loaded text (with extraction date), empty (offering a
 * button to trigger OCR processing), or error.
 */
export default function ReadOnlyTextRepresentation({ note }: TypeWidgetProps) {
    const [ state, setState ] = useState<State>({ kind: "loading" });
    const [ processing, setProcessing ] = useState(false);

    /** Loads the OCR text for the current note and maps the response onto `State`. */
    async function fetchText() {
        setState({ kind: "loading" });
        try {
            const response = await server.get<TextRepresentationResponse>(`ocr/notes/${note.noteId}/text`);
            if (!response.success) {
                setState({ kind: "error", message: response.message || t("ocr.failed_to_load") });
                return;
            }
            if (!response.hasOcr || !response.text) {
                setState({ kind: "empty" });
                return;
            }
            setState({ kind: "loaded", text: response.text, extractedAt: response.extractedAt });
        } catch (error: unknown) {
            // `unknown` (not `any`) forces narrowing before `.message` is read.
            console.error("Error loading text representation:", error);
            const message = error instanceof Error ? error.message : "";
            setState({ kind: "error", message: message || t("ocr.failed_to_load") });
        }
    }

    // Re-fetch whenever the displayed note changes. `void` marks the promise
    // as intentionally unawaited; fetchText handles its own errors.
    useEffect(() => { void fetchText(); }, [ note.noteId ]);

    /** Starts server-side OCR for this note, then re-fetches the text shortly after. */
    async function processOCR() {
        setProcessing(true);
        try {
            const response = await server.post<{ success: boolean; message?: string }>(`ocr/process-note/${note.noteId}`);
            if (response.success) {
                toast.showMessage(t("ocr.processing_started"));
                // Processing is asynchronous on the server; re-check once after a short delay.
                setTimeout(fetchText, 2000);
            } else {
                toast.showError(response.message || t("ocr.processing_failed"));
            }
        } catch {
            // Server errors (4xx/5xx) are already shown as toasts by server.ts.
        } finally {
            setProcessing(false);
        }
    }

    return (
        <div className="note-detail-printable" style={{ padding: "10px" }}>
            <div style={{
                marginBottom: "10px",
                padding: "8px 12px",
                backgroundColor: "var(--main-background-color)",
                border: "1px solid var(--main-border-color)",
                borderRadius: "4px",
                fontWeight: 500
            }}>
                <span className="bx bx-text" />{" "}{t("ocr.extracted_text_title")}
            </div>

            {state.kind === "loading" && (
                <div style={{ textAlign: "center", padding: "30px", color: "var(--muted-text-color)" }}>
                    <span className="bx bx-loader-alt bx-spin" />{" "}{t("ocr.loading_text")}
                </div>
            )}

            {state.kind === "loaded" && (
                <>
                    <pre style={{
                        whiteSpace: "pre-wrap",
                        fontFamily: "var(--detail-text-font-family)",
                        fontSize: "var(--detail-text-font-size)",
                        lineHeight: 1.6,
                        border: "1px solid var(--main-border-color)",
                        borderRadius: "4px",
                        padding: "15px",
                        backgroundColor: "var(--accented-background-color)",
                        minHeight: "100px"
                    }}>
                        {state.text}
                    </pre>

                    <div style={{ fontSize: "0.9em", color: "var(--muted-text-color)", marginTop: "10px", fontStyle: "italic" }}>
                        {t("ocr.extracted_on", { date: state.extractedAt ? new Date(state.extractedAt).toLocaleString() : t("ocr.unknown_date") })}
                    </div>
                </>
            )}

            {state.kind === "empty" && (
                <>
                    <div style={{ color: "var(--muted-text-color)", fontStyle: "italic", textAlign: "center", padding: "30px" }}>
                        <span className="bx bx-info-circle" />{" "}{t("ocr.no_text_available")}
                    </div>

                    <button
                        type="button"
                        className="btn btn-secondary"
                        style={{ marginTop: "15px" }}
                        disabled={processing}
                        onClick={processOCR}
                    >
                        {processing
                            ? <><span className="bx bx-loader-alt bx-spin" />{" "}{t("ocr.processing")}</>
                            : <><span className="bx bx-play" />{" "}{t("ocr.process_now")}</>
                        }
                    </button>

                    <div style={{ fontSize: "0.9em", color: "var(--muted-text-color)", marginTop: "10px", fontStyle: "italic" }}>
                        {t("ocr.no_text_explanation")}
                    </div>
                </>
            )}

            {state.kind === "error" && (
                <div style={{
                    color: "var(--error-color)",
                    backgroundColor: "var(--error-background-color)",
                    border: "1px solid var(--error-border-color)",
                    padding: "10px",
                    borderRadius: "4px",
                    marginTop: "10px"
                }}>
                    <span className="bx bx-error" />{" "}{state.message}
                </div>
            )}
        </div>
    );
}

View File

@@ -68,6 +68,7 @@
"@types/serve-static": "2.2.0",
"@types/stream-throttle": "0.1.4",
"@types/supertest": "7.2.0",
"@types/tesseract.js": "2.0.0",
"@types/tmp": "0.2.6",
"@types/turndown": "5.0.6",
"@types/ws": "8.18.1",
@@ -115,16 +116,20 @@
"mime-types": "3.0.2",
"multer": "2.1.1",
"normalize-strings": "1.1.1",
"officeparser": "5.2.0",
"pdf-parse": "1.1.1",
"rand-token": "1.0.1",
"safe-compare": "1.1.4",
"sanitize-filename": "1.6.4",
"sanitize-html": "2.17.2",
"sax": "1.6.0",
"serve-favicon": "2.5.1",
"sharp": "0.34.3",
"stream-throttle": "0.1.3",
"strip-bom": "5.0.0",
"striptags": "3.2.0",
"supertest": "7.2.2",
"tesseract.js": "6.0.1",
"swagger-jsdoc": "6.2.8",
"time2fa": "1.4.2",
"tmp": "0.2.5",

View File

@@ -107,6 +107,8 @@ CREATE TABLE IF NOT EXISTS "recent_notes"
CREATE TABLE IF NOT EXISTS "blobs" (
`blobId` TEXT NOT NULL,
`content` TEXT NULL DEFAULT NULL,
`textRepresentation` TEXT DEFAULT NULL,
`textExtractionLastProcessed` TEXT DEFAULT NULL,
`dateModified` TEXT NOT NULL,
`utcDateModified` TEXT NOT NULL,
PRIMARY KEY(`blobId`)

View File

@@ -10,11 +10,12 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
return "blobId";
}
static get hashedProperties() {
return ["blobId", "content"];
return ["blobId", "content", "textRepresentation"];
}
content!: string | Buffer;
contentLength!: number;
textRepresentation?: string | null;
constructor(row: BlobRow) {
super();
@@ -25,6 +26,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
this.blobId = row.blobId;
this.content = row.content;
this.contentLength = row.contentLength;
this.textRepresentation = row.textRepresentation;
this.dateModified = row.dateModified;
this.utcDateModified = row.utcDateModified;
}
@@ -34,6 +36,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
blobId: this.blobId,
content: this.content || null,
contentLength: this.contentLength,
textRepresentation: this.textRepresentation || null,
dateModified: this.dateModified,
utcDateModified: this.utcDateModified
};

View File

@@ -6,6 +6,25 @@
// Migrations should be kept in descending order, so the latest migration is first.
const MIGRATIONS: (SqlMigration | JsMigration)[] = [
// Add text representation column and last processed timestamp to blobs table
{
version: 236,
sql: /*sql*/`\
-- Add text representation column to blobs table
ALTER TABLE blobs ADD COLUMN textRepresentation TEXT DEFAULT NULL;
-- Add OCR last processed timestamp to blobs table
ALTER TABLE blobs ADD COLUMN textExtractionLastProcessed TEXT DEFAULT NULL;
-- Create index for text representation searches
CREATE INDEX IF NOT EXISTS idx_blobs_textRepresentation
ON blobs (textRepresentation);
-- Create index for OCR last processed timestamp
CREATE INDEX IF NOT EXISTS idx_blobs_textExtractionLastProcessed
ON blobs (textExtractionLastProcessed);
`
},
// Add missing database indices for query performance
{
version: 235,

View File

@@ -0,0 +1,75 @@
import { describe, expect, it, vi, beforeEach } from "vitest";
import ocrRoutes from "./ocr.js";
// Mock the OCR service
vi.mock("../../services/ocr/ocr_service.js", () => ({
default: {
isOCREnabled: vi.fn(() => true),
startBatchProcessing: vi.fn(() => Promise.resolve({ success: true })),
getBatchProgress: vi.fn(() => ({ inProgress: false, total: 0, processed: 0 }))
}
}));
// Mock becca
vi.mock("../../becca/becca.js", () => ({
default: {}
}));
// Mock log
vi.mock("../../services/log.js", () => ({
default: {
error: vi.fn()
}
}));
// Unit tests for the OCR route handlers' Express-style response plumbing:
// each handler is expected to send a JSON body and mark the response as
// handled via `triliumResponseHandled` so a generic route wrapper skips it.
//
// NOTE(review): these tests call the handlers as (req, res) and assert on
// res.json / res.status, but the handlers in routes/api/ocr.ts are written
// in the return-value style (plain objects / [status, body] tuples) — verify
// this spec still matches the refactored handlers.
describe("OCR API", () => {
let mockRequest: any;
let mockResponse: any;

beforeEach(() => {
// Fresh req/res stubs per test so mock call counts don't leak between cases.
mockRequest = {
params: {},
body: {},
query: {}
};

mockResponse = {
status: vi.fn().mockReturnThis(),
json: vi.fn().mockReturnThis(),
triliumResponseHandled: false
};
});

it("should set triliumResponseHandled flag in batch processing", async () => {
await ocrRoutes.batchProcessOCR(mockRequest, mockResponse);

expect(mockResponse.json).toHaveBeenCalledWith({ success: true });
expect(mockResponse.triliumResponseHandled).toBe(true);
});

it("should set triliumResponseHandled flag in get batch progress", async () => {
await ocrRoutes.getBatchProgress(mockRequest, mockResponse);

expect(mockResponse.json).toHaveBeenCalledWith({
inProgress: false,
total: 0,
processed: 0
});
expect(mockResponse.triliumResponseHandled).toBe(true);
});

it("should handle errors and set triliumResponseHandled flag", async () => {
// Mock service to throw error
const ocrService = await import("../../services/ocr/ocr_service.js");
vi.mocked(ocrService.default.startBatchProcessing).mockRejectedValueOnce(new Error("Test error"));

await ocrRoutes.batchProcessOCR(mockRequest, mockResponse);

expect(mockResponse.status).toHaveBeenCalledWith(500);
expect(mockResponse.json).toHaveBeenCalledWith({
success: false,
error: "Test error"
});
expect(mockResponse.triliumResponseHandled).toBe(true);
});
});

View File

@@ -0,0 +1,324 @@
import type { Request } from "express";
import becca from "../../becca/becca.js";
import ocrService from "../../services/ocr/ocr_service.js";
import sql from "../../services/sql.js";
/**
* @swagger
* /api/ocr/process-note/{noteId}:
* post:
* summary: Process OCR for a specific note
* operationId: ocr-process-note
* parameters:
* - name: noteId
* in: path
* required: true
* schema:
* type: string
* description: ID of the note to process
* requestBody:
* required: false
* content:
* application/json:
* schema:
* type: object
* properties:
* language:
* type: string
* description: OCR language code (e.g. 'eng', 'fra', 'deu')
* default: 'eng'
* forceReprocess:
* type: boolean
* description: Force reprocessing even if OCR already exists
* default: false
* responses:
* '200':
* description: OCR processing completed successfully
* '400':
* description: Bad request - OCR disabled or unsupported file type
* '404':
* description: Note not found
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * POST /api/ocr/process-note/{noteId} handler.
 *
 * Runs OCR for a single note. Responds with a [status, body] tuple when OCR
 * is disabled (400), the note does not exist (404), or the note's format is
 * unsupported (400); otherwise returns { success: true, result }.
 */
async function processNoteOCR(req: Request<{ noteId: string }>) {
    const noteId = req.params.noteId;
    const { language = 'eng', forceReprocess = false } = req.body || {};

    if (!ocrService.isOCREnabled()) {
        return [400, { success: false, message: 'OCR is not enabled in settings' }];
    }
    if (!becca.getNote(noteId)) {
        return [404, { success: false, message: 'Note not found' }];
    }

    const result = await ocrService.processNoteOCR(noteId, { language, forceReprocess });

    return result
        ? { success: true, result }
        : [400, { success: false, message: 'Note is not an image or has unsupported format' }];
}
/**
* @swagger
* /api/ocr/process-attachment/{attachmentId}:
* post:
* summary: Process OCR for a specific attachment
* operationId: ocr-process-attachment
* parameters:
* - name: attachmentId
* in: path
* required: true
* schema:
* type: string
* description: ID of the attachment to process
* requestBody:
* required: false
* content:
* application/json:
* schema:
* type: object
* properties:
* language:
* type: string
* description: OCR language code (e.g. 'eng', 'fra', 'deu')
* default: 'eng'
* forceReprocess:
* type: boolean
* description: Force reprocessing even if OCR already exists
* default: false
* responses:
* '200':
* description: OCR processing completed successfully
* '400':
* description: Bad request - OCR disabled or unsupported file type
* '404':
* description: Attachment not found
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * POST /api/ocr/process-attachment/{attachmentId} handler.
 *
 * Runs OCR for a single attachment. Responds with a [status, body] tuple
 * when OCR is disabled (400), the attachment does not exist (404), or its
 * format is unsupported (400); otherwise returns { success: true, result }.
 */
async function processAttachmentOCR(req: Request<{ attachmentId: string }>) {
    const attachmentId = req.params.attachmentId;
    const { language = 'eng', forceReprocess = false } = req.body || {};

    if (!ocrService.isOCREnabled()) {
        return [400, { success: false, message: 'OCR is not enabled in settings' }];
    }
    if (!becca.getAttachment(attachmentId)) {
        return [404, { success: false, message: 'Attachment not found' }];
    }

    const result = await ocrService.processAttachmentOCR(attachmentId, { language, forceReprocess });

    return result
        ? { success: true, result }
        : [400, { success: false, message: 'Attachment is not an image or has unsupported format' }];
}
/**
* @swagger
* /api/ocr/search:
* get:
* summary: Search for text in OCR results
* operationId: ocr-search
* parameters:
* - name: q
* in: query
* required: true
* schema:
* type: string
* description: Search query text
* responses:
* '200':
* description: Search results
* '400':
* description: Bad request - missing search query
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * GET /api/ocr/search handler.
 *
 * Looks up `q` in stored OCR results. Responds 400 when the query parameter
 * is missing or not a plain string; otherwise returns { success, results }.
 */
async function searchOCR(req: Request) {
    const searchText = req.query.q;

    // Express query values may be arrays/objects; only a non-empty string is accepted.
    if (typeof searchText !== 'string' || !searchText) {
        return [400, { success: false, message: 'Search query is required' }];
    }

    return { success: true, results: ocrService.searchOCRResults(searchText) };
}
/**
* @swagger
* /api/ocr/batch-process:
* post:
* summary: Process OCR for all images without existing OCR results
* operationId: ocr-batch-process
* responses:
* '200':
* description: Batch processing initiated successfully
* '400':
* description: Bad request - OCR disabled or already processing
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * POST /api/ocr/batch-process handler.
 * Delegates to the OCR service; a failed start (e.g. OCR disabled or a batch
 * already in progress) is surfaced as a 400 with the service's own payload.
 */
async function batchProcessOCR() {
const result = await ocrService.startBatchProcessing();

if (!result.success) {
return [400, result];
}

return result;
}
/**
* @swagger
* /api/ocr/batch-progress:
* get:
* summary: Get batch OCR processing progress
* operationId: ocr-batch-progress
* responses:
* '200':
* description: Batch processing progress information
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * GET /api/ocr/batch-progress handler.
 * Returns the OCR service's current batch-processing progress snapshot.
 */
async function getBatchProgress() {
return ocrService.getBatchProgress();
}
/**
* @swagger
* /api/ocr/stats:
* get:
* summary: Get OCR processing statistics
* operationId: ocr-get-stats
* responses:
* '200':
* description: OCR statistics
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * GET /api/ocr/stats handler.
 * Wraps the OCR service's statistics in a { success, stats } envelope.
 */
async function getOCRStats() {
return { success: true, stats: ocrService.getOCRStats() };
}
/**
* @swagger
* /api/ocr/delete/{blobId}:
* delete:
* summary: Delete OCR results for a specific blob
* operationId: ocr-delete-results
* parameters:
* - name: blobId
* in: path
* required: true
* schema:
* type: string
* description: ID of the blob
* responses:
* '200':
* description: OCR results deleted successfully
* '400':
* description: Bad request - invalid parameters
* '500':
* description: Internal server error
* security:
* - session: []
* tags: ["ocr"]
*/
/**
 * DELETE /api/ocr/delete/{blobId} handler.
 * Removes stored OCR results for the given blob.
 *
 * NOTE(review): there is no existence check — success is reported even for
 * unknown blobIds; confirm that is intended.
 */
async function deleteOCRResults(req: Request<{ blobId: string }>) {
const { blobId } = req.params;

ocrService.deleteOCRResult(blobId);

return { success: true, message: `OCR results deleted for blob ${blobId}` };
}
/**
* @swagger
* /api/ocr/notes/{noteId}/text:
* get:
* summary: Get OCR text for a specific note
* operationId: ocr-get-note-text
* parameters:
* - name: noteId
* in: path
* required: true
* schema:
* type: string
* description: Note ID to get OCR text for
* responses:
* '200':
* description: OCR text retrieved successfully
* '404':
* description: Note not found
* tags: ["ocr"]
*/
async function getNoteOCRText(req: Request<{ noteId: string }>) {
    const { noteId } = req.params;

    // Resolve the note first so we can 404 before touching the blobs table.
    const note = becca.getNote(noteId);
    if (!note) {
        return [404, { success: false, message: 'Note not found' }];
    }

    // Defaults describe "no OCR available"; overwritten below when a blob row exists.
    let ocrText: string | null = null;
    let extractedAt: string | null = null;

    if (note.blobId) {
        const row = sql.getRow<{
            textRepresentation: string | null;
            textExtractionLastProcessed: string | null;
        }>(`
        SELECT textRepresentation, textExtractionLastProcessed
        FROM blobs
        WHERE blobId = ?
    `, [note.blobId]);

        ocrText = row?.textRepresentation ?? null;
        extractedAt = row?.textExtractionLastProcessed ?? null;
    }

    return {
        success: true,
        text: ocrText || '',
        hasOcr: !!ocrText,
        extractedAt
    };
}
// Route handlers for the OCR REST API; wired to URLs by the route registry.
export default {
    processNoteOCR,
    processAttachmentOCR,
    searchOCR,
    batchProcessOCR,
    getBatchProgress,
    getOCRStats,
    deleteOCRResults,
    getNoteOCRText
};

View File

@@ -105,7 +105,13 @@ const ALLOWED_OPTIONS = new Set<OptionNames>([
"newLayout",
"mfaEnabled",
"mfaMethod",
"llmProviders"
"llmProviders",
// OCR options
"ocrEnabled",
"ocrLanguage",
"ocrAutoProcessImages",
"ocrMinConfidence"
]);
function getOptions() {

View File

@@ -39,6 +39,7 @@ import loginApiRoute from "./api/login.js";
import metricsRoute from "./api/metrics.js";
import noteMapRoute from "./api/note_map.js";
import notesApiRoute from "./api/notes.js";
import ocrRoute from "./api/ocr.js";
import optionsApiRoute from "./api/options.js";
import otherRoute from "./api/other.js";
import passwordApiRoute from "./api/password.js";
@@ -376,6 +377,16 @@ function register(app: express.Application) {
etapiBackupRoute.register(router);
etapiMetricsRoute.register(router);
// OCR API
asyncApiRoute(PST, "/api/ocr/process-note/:noteId", ocrRoute.processNoteOCR);
asyncApiRoute(PST, "/api/ocr/process-attachment/:attachmentId", ocrRoute.processAttachmentOCR);
asyncApiRoute(GET, "/api/ocr/search", ocrRoute.searchOCR);
asyncApiRoute(PST, "/api/ocr/batch-process", ocrRoute.batchProcessOCR);
asyncApiRoute(GET, "/api/ocr/batch-progress", ocrRoute.getBatchProgress);
asyncApiRoute(GET, "/api/ocr/stats", ocrRoute.getOCRStats);
asyncApiRoute(DEL, "/api/ocr/delete/:blobId", ocrRoute.deleteOCRResults);
asyncApiRoute(GET, "/api/ocr/notes/:noteId/text", ocrRoute.getNoteOCRText);
app.use("", router);
}

View File

@@ -5,7 +5,7 @@ import packageJson from "../../package.json" with { type: "json" };
import build from "./build.js";
import dataDir from "./data_dir.js";
const APP_DB_VERSION = 235;
const APP_DB_VERSION = 236;
const SYNC_VERSION = 37;
const CLIPPER_PROTOCOL_VERSION = "1.0";

View File

@@ -6,6 +6,9 @@ import becca from "../becca/becca.js";
import BAttribute from "../becca/entities/battribute.js";
import hiddenSubtreeService from "./hidden_subtree.js";
import oneTimeTimer from "./one_time_timer.js";
import ocrService from "./ocr/ocr_service.js";
import optionService from "./options.js";
import log from "./log.js";
import type BNote from "../becca/entities/bnote.js";
import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
@@ -137,6 +140,25 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
}
} else if (entityName === "notes") {
runAttachedRelations(entity, "runOnNoteCreation", entity);
// Note: OCR processing for images is now handled in image.ts during image processing
// OCR processing for files remains here since they don't go through image processing
// Only auto-process if both OCR is enabled and auto-processing is enabled
if (entity.type === 'file' && ocrService.isOCREnabled() && optionService.getOptionBool("ocrAutoProcessImages")) {
// Check if the file MIME type is supported by any OCR processor
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
// Process OCR asynchronously to avoid blocking note creation
ocrService.processNoteOCR(entity.noteId).then(result => {
if (result) {
log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
}
}).catch(error => {
log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${error}`);
});
}
}
}
});

View File

@@ -18,8 +18,7 @@ export async function initializeTranslations() {
ns: "server",
backend: {
loadPath: join(resourceDir, "assets/translations/{{lng}}/{{ns}}.json")
},
showSupportNotice: false
}
});
// Initialize dayjs locale.

View File

@@ -12,8 +12,9 @@ import sanitizeFilename from "sanitize-filename";
import isSvg from "is-svg";
import isAnimated from "is-animated";
import htmlSanitizer from "./html_sanitizer.js";
import ocrService, { type OCRResult } from "./ocr/ocr_service.js";
async function processImage(uploadBuffer: Buffer, originalName: string, shrinkImageSwitch: boolean) {
async function processImage(uploadBuffer: Buffer, originalName: string, shrinkImageSwitch: boolean, noteId?: string) {
const compressImages = optionService.getOptionBool("compressImages");
const origImageFormat = await getImageType(uploadBuffer);
@@ -24,6 +25,42 @@ async function processImage(uploadBuffer: Buffer, originalName: string, shrinkIm
shrinkImageSwitch = false;
}
// Schedule OCR processing in the background for best quality
// Only auto-process if both OCR is enabled and auto-processing is enabled
if (noteId && ocrService.isOCREnabled() && optionService.getOptionBool("ocrAutoProcessImages") && origImageFormat) {
const imageMime = getImageMimeFromExtension(origImageFormat.ext);
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
if (supportedMimeTypes.includes(imageMime)) {
// Process OCR asynchronously without blocking image creation
setImmediate(async () => {
try {
const ocrResult = await ocrService.extractTextFromFile(uploadBuffer, imageMime);
if (ocrResult) {
// We need to get the entity again to get its blobId after it's been saved
// noteId could be either a note ID or attachment ID
const note = becca.getNote(noteId);
const attachment = becca.getAttachment(noteId);
let blobId: string | undefined;
if (note && note.blobId) {
blobId = note.blobId;
} else if (attachment && attachment.blobId) {
blobId = attachment.blobId;
}
if (blobId) {
await ocrService.storeOCRResult(blobId, ocrResult);
log.info(`Successfully processed OCR for image ${noteId} (${originalName})`);
}
}
} catch (error) {
log.error(`Failed to process OCR for image ${noteId}: ${error}`);
}
});
}
}
let finalImageBuffer;
let imageFormat;
@@ -72,7 +109,7 @@ function updateImage(noteId: string, uploadBuffer: Buffer, originalName: string)
note.setLabel("originalFileName", originalName);
// resizing images asynchronously since JIMP does not support sync operation
processImage(uploadBuffer, originalName, true).then(({ buffer, imageFormat }) => {
processImage(uploadBuffer, originalName, true, noteId).then(({ buffer, imageFormat }) => {
sql.transactional(() => {
note.mime = getImageMimeFromExtension(imageFormat.ext);
note.save();
@@ -108,7 +145,7 @@ function saveImage(parentNoteId: string, uploadBuffer: Buffer, originalName: str
note.addLabel("originalFileName", originalName);
// resizing images asynchronously since JIMP does not support sync operation
processImage(uploadBuffer, originalName, shrinkImageSwitch).then(({ buffer, imageFormat }) => {
processImage(uploadBuffer, originalName, shrinkImageSwitch, note.noteId).then(({ buffer, imageFormat }) => {
sql.transactional(() => {
note.mime = getImageMimeFromExtension(imageFormat.ext);
@@ -159,7 +196,7 @@ function saveImageToAttachment(noteId: string, uploadBuffer: Buffer, originalNam
}, 5000);
// resizing images asynchronously since JIMP does not support sync operation
processImage(uploadBuffer, originalName, !!shrinkImageSwitch).then(({ buffer, imageFormat }) => {
processImage(uploadBuffer, originalName, !!shrinkImageSwitch, attachment.attachmentId).then(({ buffer, imageFormat }) => {
sql.transactional(() => {
// re-read, might be changed in the meantime
if (!attachment.attachmentId) {

View File

@@ -0,0 +1,823 @@
import { afterEach,beforeEach, describe, expect, it, vi } from 'vitest';
// Mock Tesseract.js so no real OCR engine is downloaded or run during tests.
const mockWorker = {
    recognize: vi.fn(),
    terminate: vi.fn(),
    reinitialize: vi.fn()
};
const mockTesseract = {
    createWorker: vi.fn().mockResolvedValue(mockWorker)
};
vi.mock('tesseract.js', () => ({
    default: mockTesseract
}));
// Mock dependencies
const mockOptions = {
    getOptionBool: vi.fn(),
    getOption: vi.fn()
};
const mockLog = {
    info: vi.fn(),
    error: vi.fn()
};
const mockSql = {
    execute: vi.fn(),
    getRow: vi.fn(),
    getRows: vi.fn()
};
const mockBecca = {
    getNote: vi.fn(),
    getAttachment: vi.fn()
};
vi.mock('../options.js', () => ({
    default: mockOptions
}));
vi.mock('../log.js', () => ({
    default: mockLog
}));
vi.mock('../sql.js', () => ({
    default: mockSql
}));
vi.mock('../../becca/becca.js', () => ({
    default: mockBecca
}));
// Import the service after mocking
// Holds the singleton service instance; (re)assigned via dynamic import in beforeEach.
let ocrService: typeof import('./ocr_service.js').default;
// Restore a pristine service + mock state before every test.
beforeEach(async () => {
    // Clear all mocks
    vi.clearAllMocks();
    // Reset mock implementations: OCR enabled, 'eng' language, empty DB by default.
    mockOptions.getOptionBool.mockReturnValue(true);
    mockOptions.getOption.mockReturnValue('eng');
    mockSql.execute.mockImplementation(() => ({ lastInsertRowid: 1 }));
    mockSql.getRow.mockReturnValue(null);
    mockSql.getRows.mockReturnValue([]);
    // Set up createWorker to properly set the worker on the service
    mockTesseract.createWorker.mockImplementation(async () => {
        return mockWorker;
    });
    // Dynamically import the service to ensure mocks are applied
    const module = await import('./ocr_service.js');
    ocrService = module.default; // It's an instance, not a class
    // Reset the OCR service state
    // (reaching into private fields through `any` — acceptable in tests only)
    (ocrService as any).isInitialized = false;
    (ocrService as any).worker = null;
    (ocrService as any).isProcessing = false;
    (ocrService as any).batchProcessingState = {
        inProgress: false,
        total: 0,
        processed: 0
    };
});
// Undo any spy/mock overrides installed by individual tests.
afterEach(() => {
    vi.restoreAllMocks();
});
describe('OCRService', () => {
describe('isOCREnabled', () => {
it('should return true when OCR is enabled in options', () => {
mockOptions.getOptionBool.mockReturnValue(true);
expect(ocrService.isOCREnabled()).toBe(true);
expect(mockOptions.getOptionBool).toHaveBeenCalledWith('ocrEnabled');
});
it('should return false when OCR is disabled in options', () => {
mockOptions.getOptionBool.mockReturnValue(false);
expect(ocrService.isOCREnabled()).toBe(false);
expect(mockOptions.getOptionBool).toHaveBeenCalledWith('ocrEnabled');
});
it('should return false when options throws an error', () => {
mockOptions.getOptionBool.mockImplementation(() => {
throw new Error('Options not available');
});
expect(ocrService.isOCREnabled()).toBe(false);
});
});
describe('isSupportedMimeType', () => {
it('should return true for supported image MIME types', () => {
expect(ocrService.isSupportedMimeType('image/jpeg')).toBe(true);
expect(ocrService.isSupportedMimeType('image/jpg')).toBe(true);
expect(ocrService.isSupportedMimeType('image/png')).toBe(true);
expect(ocrService.isSupportedMimeType('image/gif')).toBe(true);
expect(ocrService.isSupportedMimeType('image/bmp')).toBe(true);
expect(ocrService.isSupportedMimeType('image/tiff')).toBe(true);
});
it('should return false for unsupported MIME types', () => {
expect(ocrService.isSupportedMimeType('text/plain')).toBe(false);
expect(ocrService.isSupportedMimeType('application/pdf')).toBe(false);
expect(ocrService.isSupportedMimeType('video/mp4')).toBe(false);
expect(ocrService.isSupportedMimeType('audio/mp3')).toBe(false);
});
it('should handle null/undefined MIME types', () => {
expect(ocrService.isSupportedMimeType(null as any)).toBe(false);
expect(ocrService.isSupportedMimeType(undefined as any)).toBe(false);
expect(ocrService.isSupportedMimeType('')).toBe(false);
});
});
describe('extractTextFromFile', () => {
const mockImageBuffer = Buffer.from('fake-image-data');
it('should extract text successfully with default options', async () => {
const mockResult = {
data: {
text: 'Extracted text from image',
confidence: 95
}
};
mockWorker.recognize.mockResolvedValue(mockResult);
const result = await ocrService.extractTextFromFile(mockImageBuffer, 'image/jpeg');
expect(result).toBeDefined();
expect(result.text).toBe('Extracted text from image');
expect(result.extractedAt).toEqual(expect.any(String));
});
it('should handle OCR recognition errors', async () => {
const error = new Error('OCR recognition failed');
mockWorker.recognize.mockRejectedValue(error);
await expect(ocrService.extractTextFromFile(mockImageBuffer, 'image/jpeg')).rejects.toThrow('OCR recognition failed');
expect(mockLog.error).toHaveBeenCalledWith('OCR text extraction failed: Error: OCR recognition failed');
});
});
describe('storeOCRResult', () => {
it('should store OCR result in blob successfully', async () => {
const ocrResult = {
text: 'Sample text',
confidence: 0.95,
extractedAt: '2025-06-10T10:00:00.000Z',
language: 'eng'
};
await ocrService.storeOCRResult('blob123', ocrResult);
expect(mockSql.execute).toHaveBeenCalledWith(
expect.stringContaining('UPDATE blobs SET textRepresentation = ?'),
['Sample text', 'blob123']
);
});
it('should handle undefined blobId gracefully', async () => {
const ocrResult = {
text: 'Sample text',
confidence: 0.95,
extractedAt: '2025-06-10T10:00:00.000Z',
language: 'eng'
};
await ocrService.storeOCRResult(undefined, ocrResult);
expect(mockSql.execute).not.toHaveBeenCalled();
expect(mockLog.error).toHaveBeenCalledWith('Cannot store OCR result: blobId is undefined');
});
it('should handle database update errors', async () => {
const error = new Error('Database error');
mockSql.execute.mockImplementation(() => {
throw error;
});
const ocrResult = {
text: 'Sample text',
confidence: 0.95,
extractedAt: '2025-06-10T10:00:00.000Z',
language: 'eng'
};
await expect(ocrService.storeOCRResult('blob123', ocrResult)).rejects.toThrow('Database error');
expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for blob blob123: Error: Database error');
});
});
describe('processNoteOCR', () => {
const mockNote = {
noteId: 'note123',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob123',
getContent: vi.fn()
};
beforeEach(() => {
mockBecca.getNote.mockReturnValue(mockNote);
mockNote.getContent.mockReturnValue(Buffer.from('fake-image-data'));
});
it('should process note OCR successfully', async () => {
// Ensure getRow returns null for all calls in this test
mockSql.getRow.mockImplementation(() => null);
const mockOCRResult = {
data: {
text: 'Note image text',
confidence: 90
}
};
mockWorker.recognize.mockResolvedValue(mockOCRResult);
const result = await ocrService.processNoteOCR('note123');
expect(result).toEqual({
text: 'Note image text',
confidence: 0.9,
extractedAt: expect.any(String),
language: 'eng'
});
expect(mockBecca.getNote).toHaveBeenCalledWith('note123');
expect(mockNote.getContent).toHaveBeenCalled();
});
it('should return existing OCR result if forceReprocess is false', async () => {
const existingResult = {
textRepresentation: 'Existing text'
};
mockSql.getRow.mockReturnValue(existingResult);
const result = await ocrService.processNoteOCR('note123');
expect(result).toEqual({
text: 'Existing text',
confidence: 0.95,
language: 'eng',
extractedAt: expect.any(String)
});
expect(mockNote.getContent).not.toHaveBeenCalled();
});
it('should reprocess if forceReprocess is true', async () => {
const existingResult = {
textRepresentation: 'Existing text'
};
mockSql.getRow.mockResolvedValue(existingResult);
const mockOCRResult = {
data: {
text: 'New processed text',
confidence: 95
}
};
mockWorker.recognize.mockResolvedValue(mockOCRResult);
const result = await ocrService.processNoteOCR('note123', { forceReprocess: true });
expect(result?.text).toBe('New processed text');
expect(mockNote.getContent).toHaveBeenCalled();
});
it('should return null for non-existent note', async () => {
mockBecca.getNote.mockReturnValue(null);
const result = await ocrService.processNoteOCR('nonexistent');
expect(result).toBe(null);
expect(mockLog.error).toHaveBeenCalledWith('Note nonexistent not found');
});
it('should return null for unsupported MIME type', async () => {
mockNote.mime = 'text/plain';
const result = await ocrService.processNoteOCR('note123');
expect(result).toBe(null);
expect(mockLog.info).toHaveBeenCalledWith('Note note123 has unsupported MIME type text/plain, skipping OCR');
});
});
describe('processAttachmentOCR', () => {
const mockAttachment = {
attachmentId: 'attach123',
role: 'image',
mime: 'image/png',
blobId: 'blob456',
getContent: vi.fn()
};
beforeEach(() => {
mockBecca.getAttachment.mockReturnValue(mockAttachment);
mockAttachment.getContent.mockReturnValue(Buffer.from('fake-image-data'));
});
it('should process attachment OCR successfully', async () => {
// Ensure getRow returns null for all calls in this test
mockSql.getRow.mockImplementation(() => null);
const mockOCRResult = {
data: {
text: 'Attachment image text',
confidence: 92
}
};
mockWorker.recognize.mockResolvedValue(mockOCRResult);
const result = await ocrService.processAttachmentOCR('attach123');
expect(result).toEqual({
text: 'Attachment image text',
confidence: 0.92,
extractedAt: expect.any(String),
language: 'eng'
});
expect(mockBecca.getAttachment).toHaveBeenCalledWith('attach123');
});
it('should return null for non-existent attachment', async () => {
mockBecca.getAttachment.mockReturnValue(null);
const result = await ocrService.processAttachmentOCR('nonexistent');
expect(result).toBe(null);
expect(mockLog.error).toHaveBeenCalledWith('Attachment nonexistent not found');
});
});
describe('searchOCRResults', () => {
it('should search OCR results successfully', () => {
const mockResults = [
{
blobId: 'blob1',
textRepresentation: 'Sample search text'
}
];
mockSql.getRows.mockReturnValue(mockResults);
const results = ocrService.searchOCRResults('search');
expect(results).toEqual([{
blobId: 'blob1',
text: 'Sample search text'
}]);
expect(mockSql.getRows).toHaveBeenCalledWith(
expect.stringContaining('WHERE textRepresentation LIKE ?'),
['%search%']
);
});
it('should handle search errors gracefully', () => {
mockSql.getRows.mockImplementation(() => {
throw new Error('Database error');
});
const results = ocrService.searchOCRResults('search');
expect(results).toEqual([]);
expect(mockLog.error).toHaveBeenCalledWith('Failed to search OCR results: Error: Database error');
});
});
describe('getOCRStats', () => {
it('should return OCR statistics successfully', () => {
const mockStats = {
total_processed: 150
};
const mockNoteStats = {
count: 100
};
const mockAttachmentStats = {
count: 50
};
mockSql.getRow.mockReturnValueOnce(mockStats);
mockSql.getRow.mockReturnValueOnce(mockNoteStats);
mockSql.getRow.mockReturnValueOnce(mockAttachmentStats);
const stats = ocrService.getOCRStats();
expect(stats).toEqual({
totalProcessed: 150,
imageNotes: 100,
imageAttachments: 50
});
});
it('should handle missing statistics gracefully', () => {
mockSql.getRow.mockReturnValue(null);
const stats = ocrService.getOCRStats();
expect(stats).toEqual({
totalProcessed: 0,
imageNotes: 0,
imageAttachments: 0
});
});
});
describe('Batch Processing', () => {
describe('startBatchProcessing', () => {
beforeEach(() => {
// Reset batch processing state
ocrService.cancelBatchProcessing();
});
it('should start batch processing when images are available', async () => {
mockSql.getRow.mockReturnValueOnce({ count: 5 }); // image notes
mockSql.getRow.mockReturnValueOnce({ count: 3 }); // image attachments
const result = await ocrService.startBatchProcessing();
expect(result).toEqual({ success: true });
expect(mockSql.getRow).toHaveBeenCalledTimes(2);
});
it('should return error if batch processing already in progress', async () => {
// Start first batch
mockSql.getRow.mockReturnValueOnce({ count: 5 });
mockSql.getRow.mockReturnValueOnce({ count: 3 });
// Mock background processing queries
const mockImageNotes = Array.from({length: 5}, (_, i) => ({
noteId: `note${i}`,
mime: 'image/jpeg'
}));
mockSql.getRows.mockReturnValueOnce(mockImageNotes);
mockSql.getRows.mockReturnValueOnce([]);
// Start without awaiting to keep it in progress
const firstStart = ocrService.startBatchProcessing();
// Try to start second batch immediately
const result = await ocrService.startBatchProcessing();
// Clean up by awaiting the first one
await firstStart;
expect(result).toEqual({
success: false,
message: 'Batch processing already in progress'
});
});
it('should return error if OCR is disabled', async () => {
mockOptions.getOptionBool.mockReturnValue(false);
const result = await ocrService.startBatchProcessing();
expect(result).toEqual({
success: false,
message: 'OCR is disabled'
});
});
it('should return error if no images need processing', async () => {
mockSql.getRow.mockReturnValueOnce({ count: 0 }); // image notes
mockSql.getRow.mockReturnValueOnce({ count: 0 }); // image attachments
const result = await ocrService.startBatchProcessing();
expect(result).toEqual({
success: false,
message: 'No images found that need OCR processing'
});
});
it('should handle database errors gracefully', async () => {
const error = new Error('Database connection failed');
mockSql.getRow.mockImplementation(() => {
throw error;
});
const result = await ocrService.startBatchProcessing();
expect(result).toEqual({
success: false,
message: 'Database connection failed'
});
expect(mockLog.error).toHaveBeenCalledWith(
'Failed to start batch processing: Database connection failed'
);
});
});
describe('getBatchProgress', () => {
it('should return initial progress state', () => {
const progress = ocrService.getBatchProgress();
expect(progress.inProgress).toBe(false);
expect(progress.total).toBe(0);
expect(progress.processed).toBe(0);
});
it('should return progress with percentage when total > 0', async () => {
// Start batch processing
mockSql.getRow.mockReturnValueOnce({ count: 10 });
mockSql.getRow.mockReturnValueOnce({ count: 0 });
// Mock the background processing queries to return items that will take time to process
const mockImageNotes = Array.from({length: 10}, (_, i) => ({
noteId: `note${i}`,
mime: 'image/jpeg'
}));
mockSql.getRows.mockReturnValueOnce(mockImageNotes); // image notes query
mockSql.getRows.mockReturnValueOnce([]); // image attachments query
const startPromise = ocrService.startBatchProcessing();
// Check progress immediately after starting (before awaiting)
const progress = ocrService.getBatchProgress();
await startPromise;
expect(progress.inProgress).toBe(true);
expect(progress.total).toBe(10);
expect(progress.processed).toBe(0);
expect(progress.percentage).toBe(0);
expect(progress.startTime).toBeInstanceOf(Date);
});
});
describe('cancelBatchProcessing', () => {
it('should cancel ongoing batch processing', async () => {
// Start batch processing
mockSql.getRow.mockReturnValueOnce({ count: 5 });
mockSql.getRow.mockReturnValueOnce({ count: 0 });
// Mock background processing queries
const mockImageNotes = Array.from({length: 5}, (_, i) => ({
noteId: `note${i}`,
mime: 'image/jpeg'
}));
mockSql.getRows.mockReturnValueOnce(mockImageNotes);
mockSql.getRows.mockReturnValueOnce([]);
const startPromise = ocrService.startBatchProcessing();
expect(ocrService.getBatchProgress().inProgress).toBe(true);
await startPromise;
ocrService.cancelBatchProcessing();
expect(ocrService.getBatchProgress().inProgress).toBe(false);
expect(mockLog.info).toHaveBeenCalledWith('Batch OCR processing cancelled');
});
it('should do nothing if no batch processing is running', () => {
ocrService.cancelBatchProcessing();
expect(mockLog.info).not.toHaveBeenCalledWith('Batch OCR processing cancelled');
});
});
describe('processBatchInBackground', () => {
it('should process image notes and attachments in sequence', async () => {
// Clear all mocks at the start of this test to ensure clean state
vi.clearAllMocks();
// Mock data for batch processing
const imageNotes = [
{ noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
{ noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
];
const imageAttachments = [
{ attachmentId: 'attach1', mime: 'image/gif', blobId: 'blob3' }
];
// Setup mocks for startBatchProcessing
mockSql.getRow.mockReturnValueOnce({ count: 2 }); // image notes count
mockSql.getRow.mockReturnValueOnce({ count: 1 }); // image attachments count
// Setup mocks for background processing
mockSql.getRows.mockReturnValueOnce(imageNotes); // image notes query
mockSql.getRows.mockReturnValueOnce(imageAttachments); // image attachments query
// Mock successful OCR processing
mockWorker.recognize.mockResolvedValue({
data: { text: 'Test text', confidence: 95 }
});
// Mock notes and attachments
const mockNote1 = {
noteId: 'note1',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob1',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
const mockNote2 = {
noteId: 'note2',
type: 'image',
mime: 'image/png',
blobId: 'blob2',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
const mockAttachment = {
attachmentId: 'attach1',
role: 'image',
mime: 'image/gif',
blobId: 'blob3',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
mockBecca.getNote.mockImplementation((noteId) => {
if (noteId === 'note1') return mockNote1;
if (noteId === 'note2') return mockNote2;
return null;
});
mockBecca.getAttachment.mockReturnValue(mockAttachment);
mockSql.getRow.mockReturnValue(null); // No existing OCR results
// Start batch processing
await ocrService.startBatchProcessing();
// Wait for background processing to complete
// Need to wait longer since there's a 500ms delay between each item in batch processing
await new Promise(resolve => setTimeout(resolve, 2000));
// Verify notes and attachments were processed
expect(mockBecca.getNote).toHaveBeenCalledWith('note1');
expect(mockBecca.getNote).toHaveBeenCalledWith('note2');
expect(mockBecca.getAttachment).toHaveBeenCalledWith('attach1');
});
it('should handle processing errors gracefully', async () => {
const imageNotes = [
{ noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' }
];
// Setup mocks for startBatchProcessing
mockSql.getRow.mockReturnValueOnce({ count: 1 });
mockSql.getRow.mockReturnValueOnce({ count: 0 });
// Setup mocks for background processing
mockSql.getRows.mockReturnValueOnce(imageNotes);
mockSql.getRows.mockReturnValueOnce([]);
// Mock note that will cause an error
const mockNote = {
noteId: 'note1',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob1',
getContent: vi.fn().mockImplementation(() => { throw new Error('Failed to get content'); })
};
mockBecca.getNote.mockReturnValue(mockNote);
mockSql.getRow.mockReturnValue(null);
// Start batch processing
await ocrService.startBatchProcessing();
// Wait for background processing to complete
await new Promise(resolve => setTimeout(resolve, 100));
// Verify error was logged but processing continued
expect(mockLog.error).toHaveBeenCalledWith(
expect.stringContaining('Failed to process OCR for note note1')
);
});
it('should stop processing when cancelled', async () => {
const imageNotes = [
{ noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
{ noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
];
// Setup mocks
mockSql.getRow.mockReturnValueOnce({ count: 2 });
mockSql.getRow.mockReturnValueOnce({ count: 0 });
mockSql.getRows.mockReturnValueOnce(imageNotes);
mockSql.getRows.mockReturnValueOnce([]);
// Start batch processing
await ocrService.startBatchProcessing();
// Cancel immediately
ocrService.cancelBatchProcessing();
// Wait for background processing to complete
await new Promise(resolve => setTimeout(resolve, 100));
// Verify processing was stopped early
expect(ocrService.getBatchProgress().inProgress).toBe(false);
});
it('should skip unsupported MIME types', async () => {
const imageNotes = [
{ noteId: 'note1', mime: 'text/plain', blobId: 'blob1' }, // unsupported
{ noteId: 'note2', mime: 'image/jpeg', blobId: 'blob2' } // supported
];
// Setup mocks
mockSql.getRow.mockReturnValueOnce({ count: 2 });
mockSql.getRow.mockReturnValueOnce({ count: 0 });
mockSql.getRows.mockReturnValueOnce(imageNotes);
mockSql.getRows.mockReturnValueOnce([]);
const mockNote = {
noteId: 'note2',
type: 'image',
mime: 'image/jpeg',
blobId: 'blob2',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
};
mockBecca.getNote.mockReturnValue(mockNote);
mockSql.getRow.mockReturnValue(null);
mockWorker.recognize.mockResolvedValue({
data: { text: 'Test text', confidence: 95 }
});
// Start batch processing
await ocrService.startBatchProcessing();
// Wait for background processing to complete
await new Promise(resolve => setTimeout(resolve, 100));
// Verify only supported MIME type was processed
expect(mockBecca.getNote).toHaveBeenCalledWith('note2');
expect(mockBecca.getNote).not.toHaveBeenCalledWith('note1');
});
});
});
describe('deleteOCRResult', () => {
it('should delete OCR result successfully', () => {
ocrService.deleteOCRResult('blob123');
expect(mockSql.execute).toHaveBeenCalledWith(
expect.stringContaining('UPDATE blobs SET textRepresentation = NULL'),
['blob123']
);
expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for blob blob123');
});
it('should handle deletion errors', () => {
mockSql.execute.mockImplementation(() => {
throw new Error('Database error');
});
expect(() => ocrService.deleteOCRResult('blob123')).toThrow('Database error');
expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for blob blob123: Error: Database error');
});
});
describe('isCurrentlyProcessing', () => {
it('should return false initially', () => {
expect(ocrService.isCurrentlyProcessing()).toBe(false);
});
it('should return true during processing', async () => {
mockBecca.getNote.mockReturnValue({
noteId: 'note123',
mime: 'image/jpeg',
blobId: 'blob123',
getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
});
mockSql.getRow.mockResolvedValue(null);
mockWorker.recognize.mockImplementation(() => {
expect(ocrService.isCurrentlyProcessing()).toBe(true);
return Promise.resolve({
data: { text: 'test', confidence: 90 }
});
});
await ocrService.processNoteOCR('note123');
expect(ocrService.isCurrentlyProcessing()).toBe(false);
});
});
describe('cleanup', () => {
it('should terminate worker on cleanup', async () => {
await ocrService.cleanup();
expect(mockWorker.terminate).toHaveBeenCalled();
expect(mockLog.info).toHaveBeenCalledWith('OCR service cleaned up');
});
it('should handle cleanup when worker is not initialized', async () => {
await ocrService.cleanup();
expect(mockWorker.terminate).not.toHaveBeenCalled();
expect(mockLog.info).toHaveBeenCalledWith('OCR service cleaned up');
});
});
});

View File

@@ -0,0 +1,752 @@
import Tesseract from 'tesseract.js';
import log from '../log.js';
import sql from '../sql.js';
import becca from '../../becca/becca.js';
import options from '../options.js';
import { ImageProcessor } from './processors/image_processor.js';
import { PDFProcessor } from './processors/pdf_processor.js';
import { TIFFProcessor } from './processors/tiff_processor.js';
import { OfficeProcessor } from './processors/office_processor.js';
import { FileProcessor } from './processors/file_processor.js';
/** Result of one text-extraction run over a single file/blob. */
export interface OCRResult {
    /** The recognized text. */
    text: string;
    /** Recognition confidence reported by the processor (logged as a percentage). */
    confidence: number;
    /** Timestamp string recording when the extraction happened. */
    extractedAt: string;
    /** Language code used for recognition, if known (e.g. 'eng'). */
    language?: string;
    /** Page count for multi-page formats (PDF/TIFF) — presumably set by those processors; confirm. */
    pageCount?: number;
}
/** Per-call options accepted by the OCR entry points (extractTextFromFile, processNoteOCR, ...). */
export interface OCRProcessingOptions {
    /** Recognition language code — presumably overrides the 'ocrLanguage' option; confirm. */
    language?: string;
    /** When true, re-run OCR even if a stored result already exists for the blob. */
    forceReprocess?: boolean;
    /** Minimum confidence threshold — assumed used to filter low-quality text; confirm. */
    confidence?: number;
    /** Whether to try extracting embedded text from PDFs instead of rasterizing — verify against PDFProcessor. */
    enablePDFTextExtraction?: boolean;
}
/** Shape of rows read from the `blobs` table for OCR bookkeeping. */
interface OCRBlobRow {
    /** Primary key of the blob row. */
    blobId: string;
    /** Extracted text stored alongside the blob content. */
    textRepresentation: string;
    /** Timestamp of the last extraction run, when recorded. */
    textExtractionLastProcessed?: string;
}
/**
* OCR Service for extracting text from images and other OCR-able objects
* Uses Tesseract.js for text recognition
*/
class OCRService {
    // NOTE(review): this field is never assigned anywhere in this class —
    // recognition workers live inside ImageProcessor. cleanup()'s terminate
    // branch only fires if something external (tests) injects a worker;
    // confirm whether the field is still needed.
    private worker: Tesseract.Worker | null = null;
    // Raised while extractTextFromFile() runs; exposed via isCurrentlyProcessing().
    private isProcessing = false;
    // Registered format handlers, keyed by a short type id.
    private processors: Map<string, FileProcessor> = new Map();
    constructor() {
        // Initialize file processors
        this.processors.set('image', new ImageProcessor());
        this.processors.set('pdf', new PDFProcessor());
        this.processors.set('tiff', new TIFFProcessor());
        this.processors.set('office', new OfficeProcessor());
    }
    /**
     * Check if OCR is enabled in settings.
     * Returns false (rather than throwing) when the option cannot be read,
     * so callers can treat "unknown" as "disabled".
     */
    isOCREnabled(): boolean {
        try {
            return options.getOptionBool('ocrEnabled');
        } catch (error) {
            log.error(`Failed to check OCR enabled status: ${error}`);
            return false;
        }
    }
    /**
     * Check if a MIME type is supported for OCR.
     * This list covers raster image types only; other formats (PDF, Office)
     * are routed through getProcessorForMimeType() instead.
     */
    isSupportedMimeType(mimeType: string): boolean {
        if (!mimeType || typeof mimeType !== 'string') {
            return false;
        }
        const supportedTypes = [
            'image/jpeg',
            'image/jpg',
            'image/png',
            'image/gif',
            'image/bmp',
            'image/tiff',
            'image/webp'
        ];
        return supportedTypes.includes(mimeType.toLowerCase());
    }
    /**
     * Extract text from file buffer using appropriate processor.
     *
     * @param fileBuffer raw file contents
     * @param mimeType   used to select the processor
     * @param options    per-run extraction options
     * @throws when no processor handles the MIME type, or extraction fails
     */
    async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
            this.isProcessing = true;
            // Find appropriate processor
            const processor = this.getProcessorForMimeType(mimeType);
            if (!processor) {
                throw new Error(`No processor found for MIME type: ${mimeType}`);
            }
            const result = await processor.extractText(fileBuffer, options);
            log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`OCR text extraction failed: ${error}`);
            throw error;
        } finally {
            // Always clear the busy flag, even on failure.
            this.isProcessing = false;
        }
    }
    /**
     * Process OCR for a note (image type).
     *
     * Returns null when OCR is disabled, the note is missing, or its
     * type/MIME is unsupported; returns the cached result when the stored
     * text is still up-to-date (unless forceReprocess is set).
     */
    async processNoteOCR(noteId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
        if (!this.isOCREnabled()) {
            log.info('OCR is disabled in settings');
            return null;
        }
        const note = becca.getNote(noteId);
        if (!note) {
            log.error(`Note ${noteId} not found`);
            return null;
        }
        // Check if note type and MIME type are supported for OCR
        if (note.type === 'image') {
            if (!this.isSupportedMimeType(note.mime)) {
                log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
                return null;
            }
        } else if (note.type === 'file') {
            // Check if file MIME type is supported by any processor
            const processor = this.getProcessorForMimeType(note.mime);
            if (!processor) {
                log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
                return null;
            }
        } else {
            log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
            return null;
        }
        // Check if OCR already exists and is up-to-date
        const existingOCR = this.getStoredOCRResult(note.blobId);
        if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
            log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
            return existingOCR;
        }
        try {
            const content = note.getContent();
            if (!content || !(content instanceof Buffer)) {
                throw new Error(`Cannot get image content for note ${noteId}`);
            }
            const ocrResult = await this.extractTextFromFile(content, note.mime, options);
            // Store OCR result in blob
            await this.storeOCRResult(note.blobId, ocrResult);
            return ocrResult;
        } catch (error) {
            log.error(`Failed to process OCR for note ${noteId}: ${error}`);
            throw error;
        }
    }
    /**
     * Process OCR for an attachment.
     * Mirrors processNoteOCR(), but keys off attachment.role instead of
     * note.type.
     */
    async processAttachmentOCR(attachmentId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
        if (!this.isOCREnabled()) {
            log.info('OCR is disabled in settings');
            return null;
        }
        const attachment = becca.getAttachment(attachmentId);
        if (!attachment) {
            log.error(`Attachment ${attachmentId} not found`);
            return null;
        }
        // Check if attachment role and MIME type are supported for OCR
        if (attachment.role === 'image') {
            if (!this.isSupportedMimeType(attachment.mime)) {
                log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
                return null;
            }
        } else if (attachment.role === 'file') {
            // Check if file MIME type is supported by any processor
            const processor = this.getProcessorForMimeType(attachment.mime);
            if (!processor) {
                log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
                return null;
            }
        } else {
            log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
            return null;
        }
        // Check if OCR already exists and is up-to-date
        const existingOCR = this.getStoredOCRResult(attachment.blobId);
        if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
            log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
            return existingOCR;
        }
        try {
            const content = attachment.getContent();
            if (!content || !(content instanceof Buffer)) {
                throw new Error(`Cannot get image content for attachment ${attachmentId}`);
            }
            const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
            // Store OCR result in blob
            await this.storeOCRResult(attachment.blobId, ocrResult);
            return ocrResult;
        } catch (error) {
            log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
            throw error;
        }
    }
    /**
     * Store OCR result in blob.
     * Logs and returns (no throw) when blobId is undefined; rethrows on
     * SQL failure.
     */
    async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise<void> {
        if (!blobId) {
            log.error('Cannot store OCR result: blobId is undefined');
            return;
        }
        try {
            // Store OCR text and timestamp in blobs table
            sql.execute(`
                UPDATE blobs SET
                    textRepresentation = ?,
                    textExtractionLastProcessed = ?
                WHERE blobId = ?
            `, [
                ocrResult.text,
                new Date().toISOString(),
                blobId
            ]);
            log.info(`Stored OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }
    /**
     * Get stored OCR result from blob.
     * Only the text survives storage, so the returned confidence/language/
     * extractedAt are synthesized defaults, not the original values.
     */
    private getStoredOCRResult(blobId: string | undefined): OCRResult | null {
        if (!blobId) {
            return null;
        }
        try {
            const row = sql.getRow<{
                textRepresentation: string | null;
            }>(`
                SELECT textRepresentation
                FROM blobs
                WHERE blobId = ?
            `, [blobId]);
            if (!row || !row.textRepresentation) {
                return null;
            }
            // Return basic OCR result from stored text
            // Note: we lose confidence, language, and extractedAt metadata
            // but gain simplicity by storing directly in blob
            return {
                text: row.textRepresentation,
                confidence: 0.95, // Default high confidence for existing OCR
                extractedAt: new Date().toISOString(),
                language: 'eng'
            };
        } catch (error) {
            log.error(`Failed to get OCR result for blob ${blobId}: ${error}`);
            return null;
        }
    }
    /**
     * Search for text in OCR results.
     * Substring (LIKE %…%) match over stored text; returns [] on failure.
     */
    searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> {
        try {
            const query = `
                SELECT blobId, textRepresentation
                FROM blobs
                WHERE textRepresentation LIKE ?
                AND textRepresentation IS NOT NULL
            `;
            const params = [`%${searchText}%`];
            const rows = sql.getRows<OCRBlobRow>(query, params);
            return rows.map(row => ({
                blobId: row.blobId,
                text: row.textRepresentation
            }));
        } catch (error) {
            log.error(`Failed to search OCR results: ${error}`);
            return [];
        }
    }
    /**
     * Delete OCR results for a blob.
     * Clears only textRepresentation; the textExtractionLastProcessed stamp
     * is left intact (see invalidateOCRResult for clearing both).
     */
    deleteOCRResult(blobId: string): void {
        try {
            sql.execute(`
                UPDATE blobs SET textRepresentation = NULL
                WHERE blobId = ?
            `, [blobId]);
            log.info(`Deleted OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }
    /**
     * Process OCR for all files that don't have OCR results yet or need reprocessing.
     * Thin alias kept for backward compatibility.
     */
    async processAllImages(): Promise<void> {
        return this.processAllBlobsNeedingOCR();
    }
    /**
     * Get OCR statistics: total processed blobs plus per-entity image counts.
     * Returns zeroed stats on query failure.
     */
    getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } {
        try {
            const stats = sql.getRow<{
                total_processed: number;
            }>(`
                SELECT COUNT(*) as total_processed
                FROM blobs
                WHERE textRepresentation IS NOT NULL AND textRepresentation != ''
            `);
            // Count image notes with OCR
            const noteStats = sql.getRow<{
                count: number;
            }>(`
                SELECT COUNT(*) as count
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
                WHERE n.type = 'image'
                AND n.isDeleted = 0
                AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
            `);
            // Count image attachments with OCR
            const attachmentStats = sql.getRow<{
                count: number;
            }>(`
                SELECT COUNT(*) as count
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
                WHERE a.role = 'image'
                AND a.isDeleted = 0
                AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
            `);
            return {
                totalProcessed: stats?.total_processed || 0,
                imageNotes: noteStats?.count || 0,
                imageAttachments: attachmentStats?.count || 0
            };
        } catch (error) {
            log.error(`Failed to get OCR stats: ${error}`);
            return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 };
        }
    }
    /**
     * Clean up OCR service.
     * Terminates the worker if one was ever attached (see field note above).
     */
    async cleanup(): Promise<void> {
        if (this.worker) {
            await this.worker.terminate();
            this.worker = null;
        }
        log.info('OCR service cleaned up');
    }
    /**
     * Check if currently processing (i.e. inside extractTextFromFile).
     */
    isCurrentlyProcessing(): boolean {
        return this.isProcessing;
    }
    // Batch processing state. `processed` counts failures too, so the
    // progress percentage always reaches 100%.
    private batchProcessingState: {
        inProgress: boolean;
        total: number;
        processed: number;
        startTime?: Date;
    } = {
        inProgress: false,
        total: 0,
        processed: 0
    };
    /**
     * Start batch OCR processing with progress tracking.
     * Resolves as soon as the background loop is kicked off; progress is
     * polled via getBatchProgress().
     */
    async startBatchProcessing(): Promise<{ success: boolean; message?: string }> {
        if (this.batchProcessingState.inProgress) {
            return { success: false, message: 'Batch processing already in progress' };
        }
        if (!this.isOCREnabled()) {
            return { success: false, message: 'OCR is disabled' };
        }
        try {
            // Count total blobs needing OCR processing
            const blobsNeedingOCR = this.getBlobsNeedingOCR();
            const totalCount = blobsNeedingOCR.length;
            if (totalCount === 0) {
                return { success: false, message: 'No images found that need OCR processing' };
            }
            // Initialize batch processing state
            this.batchProcessingState = {
                inProgress: true,
                total: totalCount,
                processed: 0,
                startTime: new Date()
            };
            // Start processing in background (fire-and-forget; errors are
            // logged and flip inProgress off so a retry is possible)
            this.processBatchInBackground(blobsNeedingOCR).catch(error => {
                log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
                this.batchProcessingState.inProgress = false;
            });
            return { success: true };
        } catch (error) {
            log.error(`Failed to start batch processing: ${error instanceof Error ? error.message : String(error)}`);
            return { success: false, message: error instanceof Error ? error.message : String(error) };
        }
    }
    /**
     * Get batch processing progress (snapshot copy; percentage added when
     * total > 0).
     */
    getBatchProgress(): { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } {
        const result: { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } = { ...this.batchProcessingState };
        if (result.total > 0) {
            result.percentage = (result.processed / result.total) * 100;
        }
        return result;
    }
    /**
     * Process batch OCR in background with progress tracking.
     * Individual failures are logged and counted, not propagated; the loop
     * exits early if cancelBatchProcessing() clears inProgress.
     */
    private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
        try {
            log.info('Starting batch OCR processing...');
            for (const blobInfo of blobsToProcess) {
                if (!this.batchProcessingState.inProgress) {
                    break; // Stop if processing was cancelled
                }
                try {
                    if (blobInfo.entityType === 'note') {
                        await this.processNoteOCR(blobInfo.entityId);
                    } else {
                        await this.processAttachmentOCR(blobInfo.entityId);
                    }
                    this.batchProcessingState.processed++;
                    // Add small delay to prevent overwhelming the system
                    await new Promise(resolve => setTimeout(resolve, 500));
                } catch (error) {
                    log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                    this.batchProcessingState.processed++; // Count as processed even if failed
                }
            }
            // Mark as completed
            this.batchProcessingState.inProgress = false;
            log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
        } catch (error) {
            log.error(`Batch OCR processing failed: ${error}`);
            this.batchProcessingState.inProgress = false;
            throw error;
        }
    }
    /**
     * Cancel batch processing. The background loop notices the cleared flag
     * before the next item.
     */
    cancelBatchProcessing(): void {
        if (this.batchProcessingState.inProgress) {
            this.batchProcessingState.inProgress = false;
            log.info('Batch OCR processing cancelled');
        }
    }
    /**
     * Get processor for a given MIME type.
     * First registered processor that claims the type wins.
     */
    private getProcessorForMimeType(mimeType: string): FileProcessor | null {
        for (const processor of this.processors.values()) {
            if (processor.canProcess(mimeType)) {
                return processor;
            }
        }
        return null;
    }
    /**
     * Get all MIME types supported by all registered processors (deduplicated).
     */
    getAllSupportedMimeTypes(): string[] {
        const supportedTypes = new Set<string>();
        // Gather MIME types from all registered processors
        for (const processor of this.processors.values()) {
            const processorTypes = processor.getSupportedMimeTypes();
            processorTypes.forEach(type => supportedTypes.add(type));
        }
        return Array.from(supportedTypes);
    }
    /**
     * Check if a MIME type is supported by any processor.
     */
    isSupportedByAnyProcessor(mimeType: string): boolean {
        if (!mimeType) return false;
        // Check if any processor can handle this MIME type
        const processor = this.getProcessorForMimeType(mimeType);
        return processor !== null;
    }
    /**
     * Check if blob needs OCR re-processing due to content changes.
     * True when OCR never ran, or the blob was modified after the last run;
     * false on missing blob or query failure (fail-safe: no rework).
     */
    needsReprocessing(blobId: string): boolean {
        if (!blobId) {
            return false;
        }
        try {
            const blobInfo = sql.getRow<{
                utcDateModified: string;
                textExtractionLastProcessed: string | null;
            }>(`
                SELECT utcDateModified, textExtractionLastProcessed
                FROM blobs
                WHERE blobId = ?
            `, [blobId]);
            if (!blobInfo) {
                return false;
            }
            // If OCR was never processed, it needs processing
            if (!blobInfo.textExtractionLastProcessed) {
                return true;
            }
            // If blob was modified after last OCR processing, it needs re-processing
            const blobModified = new Date(blobInfo.utcDateModified);
            const lastOcrProcessed = new Date(blobInfo.textExtractionLastProcessed);
            return blobModified > lastOcrProcessed;
        } catch (error) {
            log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
            return false;
        }
    }
    /**
     * Invalidate OCR results for a blob (clear textRepresentation and textExtractionLastProcessed).
     */
    invalidateOCRResult(blobId: string): void {
        if (!blobId) {
            return;
        }
        try {
            sql.execute(`
                UPDATE blobs SET
                    textRepresentation = NULL,
                    textExtractionLastProcessed = NULL
                WHERE blobId = ?
            `, [blobId]);
            log.info(`Invalidated OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }
    /**
     * Get blobs that need OCR processing (modified after last OCR or never processed).
     * The supported-MIME filtering is pushed into the SQL, so the result
     * needs no further filtering in JS.
     */
    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
        try {
            // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
            const noteBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
                entityId: string;
            }>(`
                SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
                WHERE (
                    n.type = 'image'
                    OR (
                        n.type = 'file'
                        AND n.mime IN (
                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            'application/msword',
                            'application/vnd.ms-excel',
                            'application/vnd.ms-powerpoint',
                            'application/rtf',
                            'application/pdf',
                            'image/jpeg',
                            'image/jpg',
                            'image/png',
                            'image/gif',
                            'image/bmp',
                            'image/tiff',
                            'image/webp'
                        )
                    )
                )
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND (
                    b.textExtractionLastProcessed IS NULL
                    OR b.utcDateModified > b.textExtractionLastProcessed
                )
            `);
            // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
            const attachmentBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
                entityId: string;
            }>(`
                SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
                WHERE (
                    a.role = 'image'
                    OR (
                        a.role = 'file'
                        AND a.mime IN (
                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            'application/msword',
                            'application/vnd.ms-excel',
                            'application/vnd.ms-powerpoint',
                            'application/rtf',
                            'application/pdf',
                            'image/jpeg',
                            'image/jpg',
                            'image/png',
                            'image/gif',
                            'image/bmp',
                            'image/tiff',
                            'image/webp'
                        )
                    )
                )
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND (
                    b.textExtractionLastProcessed IS NULL
                    OR b.utcDateModified > b.textExtractionLastProcessed
                )
            `);
            // Combine results
            const result = [
                ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
                ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
            ];
            // Return all results (no need to filter by MIME type as we already did in the query)
            return result;
        } catch (error) {
            log.error(`Failed to get blobs needing OCR: ${error}`);
            return [];
        }
    }
    /**
     * Process OCR for all blobs that need it (auto-processing).
     * Sequential with a small delay per item; per-item failures are logged
     * and skipped so one bad blob doesn't halt the run.
     */
    async processAllBlobsNeedingOCR(): Promise<void> {
        if (!this.isOCREnabled()) {
            log.info('OCR is disabled, skipping auto-processing');
            return;
        }
        const blobsNeedingOCR = this.getBlobsNeedingOCR();
        if (blobsNeedingOCR.length === 0) {
            log.info('No blobs need OCR processing');
            return;
        }
        log.info(`Auto-processing OCR for ${blobsNeedingOCR.length} blobs...`);
        for (const blobInfo of blobsNeedingOCR) {
            try {
                if (blobInfo.entityType === 'note') {
                    await this.processNoteOCR(blobInfo.entityId);
                } else {
                    await this.processAttachmentOCR(blobInfo.entityId);
                }
                // Add small delay to prevent overwhelming the system
                await new Promise(resolve => setTimeout(resolve, 100));
            } catch (error) {
                log.error(`Failed to auto-process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                // Continue with other blobs
            }
        }
        log.info('Auto-processing OCR completed');
    }
}
export default new OCRService();

View File

@@ -0,0 +1,33 @@
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
/**
* Base class for file processors that extract text from different file types
*/
/**
 * Common contract for text-extraction backends.
 *
 * Each concrete processor declares which MIME types it handles and knows
 * how to pull text out of a file buffer for those types.
 */
export abstract class FileProcessor {
    /** Whether this processor can handle files of the given MIME type. */
    abstract canProcess(mimeType: string): boolean;

    /** Extract text (and confidence metadata) from the given file buffer. */
    abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;

    /** Short identifier for the kind of processing this backend performs. */
    abstract getProcessingType(): string;

    /** Every MIME type this processor accepts. */
    abstract getSupportedMimeTypes(): string[];

    /**
     * Release any resources held by the processor.
     * Default implementation owns nothing and resolves immediately;
     * subclasses override when they hold workers or similar resources.
     */
    async cleanup(): Promise<void> {
        // intentionally empty
    }
}

View File

@@ -0,0 +1,236 @@
import { createRequire } from 'node:module';

import Tesseract from 'tesseract.js';

import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
/**
* Image processor for extracting text from image files using Tesseract
*/
export class ImageProcessor extends FileProcessor {
private worker: Tesseract.Worker | null = null;
private isInitialized = false;
private readonly supportedTypes = [
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
];
canProcess(mimeType: string): boolean {
return this.supportedTypes.includes(mimeType.toLowerCase());
}
getSupportedMimeTypes(): string[] {
return [...this.supportedTypes];
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
if (!this.isInitialized) {
await this.initialize();
}
if (!this.worker) {
throw new Error('Image processor worker not initialized');
}
try {
log.info('Starting image OCR text extraction...');
// Set language if specified and different from current
// Support multi-language format like 'ron+eng'
const language = options.language || this.getDefaultOCRLanguage();
// Validate language format
if (!this.isValidLanguageFormat(language)) {
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
}
if (language !== 'eng') {
// For different languages, create a new worker
await this.worker.terminate();
log.info(`Initializing Tesseract worker for language(s): ${language}`);
this.worker = await Tesseract.createWorker(language, 1, {
logger: (m: { status: string; progress: number }) => {
if (m.status === 'recognizing text') {
log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
}
}
});
}
const result = await this.worker.recognize(buffer);
// Filter text based on minimum confidence threshold
const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options);
const ocrResult: OCRResult = {
text: filteredText,
confidence: overallConfidence,
extractedAt: new Date().toISOString(),
language: options.language || this.getDefaultOCRLanguage(),
pageCount: 1
};
log.info(`Image OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
return ocrResult;
} catch (error) {
log.error(`Image OCR text extraction failed: ${error}`);
throw error;
}
}
getProcessingType(): string {
return 'image';
}
private async initialize(): Promise<void> {
if (this.isInitialized) {
return;
}
try {
log.info('Initializing image OCR processor with Tesseract.js...');
// Configure proper paths for Node.js environment
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
log.info(`Using worker path: ${workerPath}`);
log.info(`Using core path: ${corePath}`);
this.worker = await Tesseract.createWorker(this.getDefaultOCRLanguage(), 1, {
workerPath,
corePath,
logger: (m: { status: string; progress: number }) => {
if (m.status === 'recognizing text') {
log.info(`Image OCR progress: ${Math.round(m.progress * 100)}%`);
}
}
});
this.isInitialized = true;
log.info('Image OCR processor initialized successfully');
} catch (error) {
log.error(`Failed to initialize image OCR processor: ${error}`);
throw error;
}
}
async cleanup(): Promise<void> {
if (this.worker) {
await this.worker.terminate();
this.worker = null;
}
this.isInitialized = false;
log.info('Image OCR processor cleaned up');
}
/**
* Get default OCR language from options
*/
private getDefaultOCRLanguage(): string {
try {
const ocrLanguage = options.getOption('ocrLanguage');
if (!ocrLanguage) {
throw new Error('OCR language not configured in user settings');
}
return ocrLanguage;
} catch (error) {
log.error(`Failed to get default OCR language: ${error}`);
throw new Error('OCR language must be configured in settings before processing');
}
}
/**
* Filter text based on minimum confidence threshold
*/
private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } {
const minConfidence = this.getMinConfidenceThreshold();
// If no minimum confidence set, return original text
if (minConfidence <= 0) {
return {
filteredText: data.text.trim(),
overallConfidence: data.confidence / 100
};
}
const filteredWords: string[] = [];
const validConfidences: number[] = [];
// Tesseract provides word-level data
if (data.words && Array.isArray(data.words)) {
for (const word of data.words) {
const wordConfidence = word.confidence / 100; // Convert to decimal
if (wordConfidence >= minConfidence) {
filteredWords.push(word.text);
validConfidences.push(wordConfidence);
}
}
} else {
// Fallback: if word-level data not available, use overall confidence
const overallConfidence = data.confidence / 100;
if (overallConfidence >= minConfidence) {
return {
filteredText: data.text.trim(),
overallConfidence
};
}
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
return {
filteredText: '',
overallConfidence
};
}
// Calculate average confidence of accepted words
const averageConfidence = validConfidences.length > 0
? validConfidences.reduce((sum, conf) => sum + conf, 0) / validConfidences.length
: 0;
const filteredText = filteredWords.join(' ').trim();
log.info(`Filtered OCR text: ${filteredWords.length} words kept out of ${data.words?.length || 0} total words (min confidence: ${minConfidence})`);
return {
filteredText,
overallConfidence: averageConfidence
};
}
/**
* Get minimum confidence threshold from options
*/
private getMinConfidenceThreshold(): number {
const minConfidence = options.getOption('ocrMinConfidence') ?? 0;
return parseFloat(minConfidence);
}
/**
* Validate OCR language format
* Supports single language (eng) or multi-language (ron+eng)
*/
private isValidLanguageFormat(language: string): boolean {
if (!language || typeof language !== 'string') {
return false;
}
// Split by '+' for multi-language format
const languages = language.split('+');
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
return languages.every(lang => {
const trimmed = lang.trim();
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
});
}
}

View File

@@ -0,0 +1,133 @@
import * as officeParser from 'officeparser';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
*/
/**
 * Office document processor for extracting text and images from DOCX/XLSX/PPTX files.
 * Text is pulled directly from the document via officeparser; no OCR pass
 * is performed here.
 */
export class OfficeProcessor extends FileProcessor {
    private imageProcessor: ImageProcessor;

    private readonly supportedTypes = [
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
        'application/msword', // DOC
        'application/vnd.ms-excel', // XLS
        'application/vnd.ms-powerpoint', // PPT
        'application/rtf' // RTF
    ];

    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }

    canProcess(mimeType: string): boolean {
        return this.supportedTypes.includes(mimeType);
    }

    getSupportedMimeTypes(): string[] {
        return [...this.supportedTypes];
    }

    /**
     * Extract text from an Office document buffer.
     * @throws on invalid language format or parser failure
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting Office document text extraction...');
            // Resolve and validate the language up front so a bad setting
            // fails fast, before parsing.
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }

            const parsed = await this.parseOfficeDocument(buffer);
            const extractedText = parsed.data?.trim().length ? parsed.data.trim() : '';
            // Direct extraction (no OCR) gets a fixed high confidence when
            // any text was found.
            const confidence = extractedText.length > 0 ? 0.99 : 0;

            const result: OCRResult = {
                text: extractedText,
                confidence,
                extractedAt: new Date().toISOString(),
                language,
                pageCount: 1 // Office documents are treated as single logical document
            };
            log.info(`Office document text extraction completed. Confidence: ${confidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`Office document text extraction failed: ${error}`);
            throw error;
        }
    }

    /** Run officeparser over the buffer, normalizing a null result to ''. */
    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
        try {
            const text = await officeParser.parseOfficeAsync(buffer, {
                outputErrorToConsole: false,
                newlineDelimiter: '\n',
                ignoreNotes: false,
                putNotesAtLast: false
            });
            return { data: text || '' };
        } catch (error) {
            throw new Error(`Office document parsing failed: ${error}`);
        }
    }

    getProcessingType(): string {
        return 'office';
    }

    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }

    /**
     * Get default OCR language from options.
     * @throws when no language has been configured
     */
    private getDefaultOCRLanguage(): string {
        try {
            const configured = options.getOption('ocrLanguage');
            if (!configured) {
                throw new Error('OCR language not configured in user settings');
            }
            return configured;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            throw new Error('OCR language must be configured in settings before processing');
        }
    }

    /**
     * Validate OCR language format.
     * Supports single language (eng) or multi-language (ron+eng).
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }
        // Each '+'-separated code must look like 'eng' or 'zh_CN'.
        const codePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
        return language
            .split('+')
            .every(code => {
                const candidate = code.trim();
                return candidate.length > 0 && codePattern.test(candidate);
            });
    }
}

View File

@@ -0,0 +1,147 @@
import * as pdfParse from 'pdf-parse';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
* PDF processor for extracting text from PDF files
* First tries to extract existing text, then falls back to OCR on images
*/
export class PDFProcessor extends FileProcessor {
private imageProcessor: ImageProcessor;
private readonly supportedTypes = ['application/pdf'];
constructor() {
super();
this.imageProcessor = new ImageProcessor();
}
canProcess(mimeType: string): boolean {
return mimeType.toLowerCase() === 'application/pdf';
}
getSupportedMimeTypes(): string[] {
return [...this.supportedTypes];
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
try {
log.info('Starting PDF text extraction...');
// Validate language format
const language = options.language || this.getDefaultOCRLanguage();
if (!this.isValidLanguageFormat(language)) {
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
}
// First try to extract existing text from PDF
if (options.enablePDFTextExtraction !== false) {
const textResult = await this.extractTextFromPDF(buffer, options);
if (textResult.text.trim().length > 0) {
log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
return textResult;
}
}
// Fall back to OCR if no text found or PDF text extraction is disabled
log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
return await this.extractTextViaOCR(buffer, options);
} catch (error) {
log.error(`PDF text extraction failed: ${error}`);
throw error;
}
}
private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
try {
const data = await pdfParse(buffer);
return {
text: data.text.trim(),
confidence: 0.99, // High confidence for direct text extraction
extractedAt: new Date().toISOString(),
language: options.language || this.getDefaultOCRLanguage(),
pageCount: data.numpages
};
} catch (error) {
log.error(`PDF text extraction failed: ${error}`);
throw error;
}
}
/**
 * OCR fallback for PDFs without a usable text layer.
 *
 * Converting PDF pages to images (e.g. via pdf2pic) is not wired up yet, so
 * this currently logs the limitation and returns a zero-confidence placeholder
 * result instead of real OCR output.
 */
private async extractTextViaOCR(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
    try {
        log.info('PDF to image conversion not fully implemented, returning placeholder');
        const placeholder: OCRResult = {
            text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
            confidence: 0.0,
            extractedAt: new Date().toISOString(),
            language: options.language || this.getDefaultOCRLanguage(),
            pageCount: 1
        };
        return placeholder;
    } catch (error) {
        log.error(`PDF OCR extraction failed: ${error}`);
        throw error;
    }
}
/**
 * Identifier for this processor kind.
 */
getProcessingType(): string {
    return 'pdf';
}
/**
 * Releases resources held by the underlying image processor.
 */
async cleanup(): Promise<void> {
    await this.imageProcessor.cleanup();
}
/**
 * Get default OCR language from options.
 *
 * The original failure (missing option or options-service error) is logged;
 * callers always see the same stable, user-facing error message.
 *
 * @throws when no OCR language has been configured in user settings
 */
private getDefaultOCRLanguage(): string {
    try {
        const configured = options.getOption('ocrLanguage');
        if (configured) {
            return configured;
        }
        throw new Error('OCR language not configured in user settings');
    } catch (error) {
        log.error(`Failed to get default OCR language: ${error}`);
        throw new Error('OCR language must be configured in settings before processing');
    }
}
/**
 * Validate OCR language format.
 * Supports single language (eng) or multi-language (ron+eng).
 */
private isValidLanguageFormat(language: string): boolean {
    if (!language || typeof language !== 'string') {
        return false;
    }
    // Split by '+' for multi-language format
    const languages = language.split('+');
    // Each code must be 2-3 letters, optionally followed by an underscore and
    // another 2-3 letters (e.g. "eng", "chi_sim"); digits are not accepted.
    const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
    return languages.every(lang => {
        const trimmed = lang.trim();
        return trimmed.length > 0 && validLanguagePattern.test(trimmed);
    });
}
}

View File

@@ -0,0 +1,135 @@
import sharp from 'sharp';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
 * TIFF processor for extracting text from multi-page TIFF files.
 *
 * Each page is rendered to PNG via sharp and handed to the shared
 * ImageProcessor for OCR; page texts are concatenated with page separators.
 */
export class TIFFProcessor extends FileProcessor {
    // Shared OCR pipeline used for every rendered page.
    private imageProcessor: ImageProcessor;
    // MIME types accepted by canProcess()/getSupportedMimeTypes().
    private readonly supportedTypes = ['image/tiff', 'image/tif'];
    constructor() {
        super();
        this.imageProcessor = new ImageProcessor();
    }
    /** True when the MIME type is a TIFF variant (case-insensitive). */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
    }
    /** Returns a copy of the supported MIME types. */
    getSupportedMimeTypes(): string[] {
        return [...this.supportedTypes];
    }
    /**
     * Extracts text from a (possibly multi-page) TIFF by OCR-ing each page.
     *
     * A page that fails is logged and skipped so remaining pages still
     * contribute; page texts are joined with "--- Page N ---" separators.
     *
     * @param buffer raw TIFF file contents
     * @param options OCR options; `options.language` overrides the configured default
     * @returns combined text, averaged confidence and page count
     * @throws if the OCR language is missing/invalid or sharp cannot read the buffer
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        try {
            log.info('Starting TIFF text extraction...');
            // Validate language format
            const language = options.language || this.getDefaultOCRLanguage();
            if (!this.isValidLanguageFormat(language)) {
                throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
            }
            // Check if this is a multi-page TIFF (sharp reports pages for multi-frame formats)
            const metadata = await sharp(buffer).metadata();
            const pageCount = metadata.pages || 1;
            let combinedText = '';
            let totalConfidence = 0;
            // Process each page
            for (let page = 0; page < pageCount; page++) {
                try {
                    log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);
                    // Extract page as PNG buffer
                    const pageBuffer = await sharp(buffer, { page })
                        .png()
                        .toBuffer();
                    // OCR the page
                    const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
                    if (pageResult.text.trim().length > 0) {
                        if (combinedText.length > 0) {
                            combinedText += `\n\n--- Page ${page + 1} ---\n`;
                        }
                        combinedText += pageResult.text;
                        totalConfidence += pageResult.confidence;
                    }
                } catch (error) {
                    log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
                    // Continue with other pages
                }
            }
            // NOTE(review): averages over ALL pages, so empty or failed pages pull
            // the confidence toward zero — confirm this weighting is intended.
            const averageConfidence = pageCount > 0 ? totalConfidence / pageCount : 0;
            const result: OCRResult = {
                text: combinedText.trim(),
                confidence: averageConfidence,
                extractedAt: new Date().toISOString(),
                language: options.language || this.getDefaultOCRLanguage(),
                pageCount
            };
            // NOTE(review): the "%" suffix looks misleading if confidence is on a
            // 0-1 scale (cf. the 0.55 ocrMinConfidence default) — verify the unit.
            log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`TIFF text extraction failed: ${error}`);
            throw error;
        }
    }
    /** Identifier of this processor kind. */
    getProcessingType(): string {
        return 'tiff';
    }
    /** Releases resources held by the underlying image processor. */
    async cleanup(): Promise<void> {
        await this.imageProcessor.cleanup();
    }
    /**
     * Get default OCR language from options.
     *
     * @throws when no OCR language has been configured in user settings
     */
    private getDefaultOCRLanguage(): string {
        try {
            const ocrLanguage = options.getOption('ocrLanguage');
            if (!ocrLanguage) {
                throw new Error('OCR language not configured in user settings');
            }
            return ocrLanguage;
        } catch (error) {
            log.error(`Failed to get default OCR language: ${error}`);
            // Re-thrown with a stable message; the original cause is logged above.
            throw new Error('OCR language must be configured in settings before processing');
        }
    }
    /**
     * Validate OCR language format.
     * Supports single language (eng) or multi-language (ron+eng).
     */
    private isValidLanguageFormat(language: string): boolean {
        if (!language || typeof language !== 'string') {
            return false;
        }
        // Split by '+' for multi-language format
        const languages = language.split('+');
        // Each code must be 2-3 letters, optionally followed by an underscore and
        // another 2-3 letters (e.g. "eng", "chi_sim"); digits are not accepted.
        const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
        return languages.every(lang => {
            const trimmed = lang.trim();
            return trimmed.length > 0 && validLanguagePattern.test(trimmed);
        });
    }
}

View File

@@ -212,7 +212,13 @@ const defaultOptions: DefaultOption[] = [
{ name: "experimentalFeatures", value: "[]", isSynced: true },
// AI / LLM
{ name: "llmProviders", value: "[]", isSynced: false }
{ name: "llmProviders", value: "[]", isSynced: false },
// OCR options
{ name: "ocrEnabled", value: "false", isSynced: true },
{ name: "ocrLanguage", value: "eng", isSynced: true },
{ name: "ocrAutoProcessImages", value: "true", isSynced: true },
{ name: "ocrMinConfidence", value: "0.55", isSynced: true },
];
/**

View File

@@ -0,0 +1,111 @@
import becca from "../../../becca/becca.js";
import optionService from "../../options.js";
import sql from "../../sql.js";
import Expression from "./expression.js";
import NoteSet from "../note_set.js";
import SearchContext from "../search_context.js";
/**
* Search expression for finding text within OCR-extracted content from images
*/
export default class OCRContentExpression extends Expression {
private searchText: string;
constructor(searchText: string) {
super();
this.searchText = searchText;
}
execute(inputNoteSet: NoteSet, executionContext: object, searchContext: SearchContext): NoteSet {
// Don't search OCR content if it's not enabled
if (!this.isOCRSearchEnabled()) {
return new NoteSet();
}
const resultNoteSet = new NoteSet();
const ocrResults = this.searchOCRContent(this.searchText);
for (const ocrResult of ocrResults) {
// Find notes that use this blob
const notes = sql.getRows<{noteId: string}>(`
SELECT noteId FROM notes
WHERE blobId = ? AND isDeleted = 0
`, [ocrResult.blobId]);
for (const noteRow of notes) {
const note = becca.getNote(noteRow.noteId);
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
resultNoteSet.add(note);
}
}
// Find attachments that use this blob and their parent notes
const attachments = sql.getRows<{ownerId: string}>(`
SELECT ownerId FROM attachments
WHERE blobId = ? AND isDeleted = 0
`, [ocrResult.blobId]);
for (const attachmentRow of attachments) {
const note = becca.getNote(attachmentRow.ownerId);
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
resultNoteSet.add(note);
}
}
}
// Add highlight tokens for OCR matches
if (ocrResults.length > 0) {
const tokens = this.extractHighlightTokens(this.searchText);
searchContext.highlightedTokens.push(...tokens);
}
return resultNoteSet;
}
private isOCRSearchEnabled(): boolean {
try {
const optionService = require('../../options.js').default;
return optionService.getOptionBool('ocrEnabled');
} catch {
return false;
}
}
private searchOCRContent(searchText: string): Array<{
blobId: string;
textRepresentation: string;
}> {
try {
// Search in blobs table for OCR text
const query = `
SELECT blobId, textRepresentation
FROM blobs
WHERE textRepresentation LIKE ?
AND textRepresentation IS NOT NULL
AND textRepresentation != ''
LIMIT 50
`;
const params = [`%${searchText}%`];
return sql.getRows<{
blobId: string;
textRepresentation: string;
}>(query, params);
} catch (error) {
console.error('Error searching OCR content:', error);
return [];
}
}
private extractHighlightTokens(searchText: string): string[] {
// Split search text into words and return them as highlight tokens
return searchText
.split(/\s+/)
.filter(token => token.length > 2)
.map(token => token.toLowerCase());
}
toString(): string {
return `OCRContent('${this.searchText}')`;
}
}

View File

@@ -1,12 +1,11 @@
"use strict";
import beccaService from "../../becca/becca_service.js";
import becca from "../../becca/becca.js";
import {
normalizeSearchText,
calculateOptimizedEditDistance,
FUZZY_SEARCH_CONFIG
} from "./utils/text_utils.js";
import beccaService from "../../becca/becca_service.js";
import options from "../options.js";
import sql from "../sql.js";
import {
calculateOptimizedEditDistance,
FUZZY_SEARCH_CONFIG,
normalizeSearchText} from "./utils/text_utils.js";
// Scoring constants for better maintainability
const SCORE_WEIGHTS = {
@@ -85,6 +84,9 @@ class SearchResult {
this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR, enableFuzzyMatching);
this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR, enableFuzzyMatching);
// Add OCR scoring - weight between title and content matches
this.addOCRScore(tokens, 1.5);
if (note.isInHiddenSubtree()) {
this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
}
@@ -98,7 +100,7 @@ class SearchResult {
for (const chunk of chunks) {
for (const token of tokens) {
const normalizedToken = normalizeSearchText(token.toLowerCase());
if (chunk === normalizedToken) {
tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
} else if (chunk.startsWith(normalizedToken)) {
@@ -108,10 +110,10 @@ class SearchResult {
} else {
// Try fuzzy matching for individual tokens with caps applied
const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
// Apply caps: limit token length multiplier and per-token contribution
const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);
@@ -119,7 +121,7 @@ class SearchResult {
fuzzyWeight * cappedTokenLength * factor,
SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN
);
tokenScore += fuzzyTokenScore;
this.fuzzyScore += fuzzyTokenScore;
}
@@ -129,13 +131,43 @@ class SearchResult {
this.score += tokenScore;
}
/**
 * Adds OCR-derived text matches to this result's score.
 *
 * Reads `blobs.textRepresentation` for the note's own blob and for blobs of
 * its non-deleted attachments, then scores the tokens against each text with
 * the given weight factor.
 *
 * NOTE(review): the stored OCR confidence is not consulted here — every match
 * is weighted only by `factor`. Confirm whether confidence weighting is wanted
 * (the spec file appears to expect it).
 */
addOCRScore(tokens: string[], factor: number) {
    try {
        // Check if OCR is enabled
        if (!options.getOptionBool('ocrEnabled')) {
            return;
        }
        // Search for OCR results for this note and its attachments
        const ocrResults = sql.getRows(`
            SELECT b.textRepresentation
            FROM blobs b
            WHERE b.textRepresentation IS NOT NULL
            AND b.textRepresentation != ''
            AND (
                b.blobId = (SELECT blobId FROM notes WHERE noteId = ? AND isDeleted = 0)
                OR b.blobId IN (
                    SELECT blobId FROM attachments WHERE ownerId = ? AND isDeleted = 0
                )
            )
        `, [this.noteId, this.noteId]);
        for (const ocrResult of ocrResults as Array<{textRepresentation: string}>) {
            // Add score for OCR text matches
            this.addScoreForStrings(tokens, ocrResult.textRepresentation, factor);
        }
    } catch (error) {
        // Silently fail if OCR service is not available
        // NOTE(review): uses console.debug rather than the project logger — confirm intent.
        console.debug('OCR scoring failed:', error);
    }
}
/**
* Checks if the query matches as a complete word in the text
*/
private isWordMatch(text: string, query: string): boolean {
return text.includes(` ${query} `) ||
text.startsWith(`${query} `) ||
return text.includes(` ${query} `) ||
text.startsWith(`${query} `) ||
text.endsWith(` ${query}`);
}
@@ -147,21 +179,21 @@ class SearchResult {
if (this.fuzzyScore >= SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
return 0;
}
const editDistance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
const maxLen = Math.max(title.length, query.length);
// Only apply fuzzy matching if the query is reasonably long and edit distance is small
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
editDistance / maxLen <= 0.3) {
const similarity = 1 - (editDistance / maxLen);
const baseFuzzyScore = SCORE_WEIGHTS.TITLE_WORD_MATCH * similarity * 0.7; // Reduced weight for fuzzy matches
// Apply cap to ensure fuzzy title matches don't exceed reasonable bounds
return Math.min(baseFuzzyScore, SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE * 0.3);
}
return 0;
}

View File

@@ -0,0 +1,337 @@
// Unit tests for SearchResult scoring, including the OCR scoring integration.
//
// NOTE(review): the OCR fixtures below use rows shaped { extracted_text, confidence }
// and expect a query containing "FROM ocr_results", while the implementation under
// test reads `blobs.textRepresentation`. These expectations look out of date with
// the schema rename — re-verify them against search_result.ts.
import { describe, it, expect, vi, beforeEach } from 'vitest';

// Mock dependencies
const mockSql = {
    getRows: vi.fn()
};
const mockOptions = {
    getOptionBool: vi.fn()
};
const mockBecca = {
    notes: {},
    getNote: vi.fn()
};
const mockBeccaService = {
    getNoteTitleForPath: vi.fn()
};
vi.mock('../sql.js', () => ({
    default: mockSql
}));
vi.mock('../options.js', () => ({
    default: mockOptions
}));
// The SearchResult now uses proper ES imports which are mocked above
vi.mock('../../becca/becca.js', () => ({
    default: mockBecca
}));
vi.mock('../../becca/becca_service.js', () => ({
    default: mockBeccaService
}));
// Import SearchResult after mocking
let SearchResult: any;
beforeEach(async () => {
    vi.clearAllMocks();
    // Reset mock implementations
    mockOptions.getOptionBool.mockReturnValue(true);
    mockSql.getRows.mockReturnValue([]);
    mockBeccaService.getNoteTitleForPath.mockReturnValue('Test Note Title');
    // Setup mock note
    const mockNote = {
        noteId: 'test123',
        title: 'Test Note',
        isInHiddenSubtree: vi.fn().mockReturnValue(false)
    };
    mockBecca.notes['test123'] = mockNote;
    // Dynamically import SearchResult (must happen after vi.mock registration)
    const module = await import('./search_result.js');
    SearchResult = module.default;
});
describe('SearchResult', () => {
    describe('constructor', () => {
        it('should initialize with note path array', () => {
            const searchResult = new SearchResult(['root', 'folder', 'test123']);
            expect(searchResult.notePathArray).toEqual(['root', 'folder', 'test123']);
            expect(searchResult.noteId).toBe('test123');
            expect(searchResult.notePath).toBe('root/folder/test123');
            expect(searchResult.score).toBe(0);
            expect(mockBeccaService.getNoteTitleForPath).toHaveBeenCalledWith(['root', 'folder', 'test123']);
        });
    });
    describe('computeScore', () => {
        let searchResult: any;
        beforeEach(() => {
            searchResult = new SearchResult(['root', 'test123']);
        });
        describe('basic scoring', () => {
            it('should give highest score for exact note ID match', () => {
                searchResult.computeScore('test123', ['test123']);
                expect(searchResult.score).toBeGreaterThanOrEqual(1000);
            });
            it('should give high score for exact title match', () => {
                searchResult.computeScore('test note', ['test', 'note']);
                expect(searchResult.score).toBeGreaterThan(2000);
            });
            it('should give medium score for title prefix match', () => {
                searchResult.computeScore('test', ['test']);
                expect(searchResult.score).toBeGreaterThan(500);
            });
            it('should give lower score for title word match', () => {
                mockBecca.notes['test123'].title = 'This is a test note';
                searchResult.computeScore('test', ['test']);
                expect(searchResult.score).toBeGreaterThan(300);
            });
        });
        describe('OCR scoring integration', () => {
            beforeEach(() => {
                // Mock OCR-enabled
                mockOptions.getOptionBool.mockReturnValue(true);
            });
            it('should add OCR score when OCR results exist', () => {
                // NOTE(review): field names and table don't match the current schema (see header).
                const mockOCRResults = [
                    {
                        extracted_text: 'sample text from image',
                        confidence: 0.95
                    }
                ];
                mockSql.getRows.mockReturnValue(mockOCRResults);
                searchResult.computeScore('sample', ['sample']);
                expect(mockSql.getRows).toHaveBeenCalledWith(
                    expect.stringContaining('FROM ocr_results'),
                    ['test123', 'test123']
                );
                expect(searchResult.score).toBeGreaterThan(0);
            });
            it('should apply confidence weighting to OCR scores', () => {
                // NOTE(review): addOCRScore does not read `confidence` — this test's
                // premise may no longer hold against the current implementation.
                const highConfidenceResult = [
                    {
                        extracted_text: 'sample text',
                        confidence: 0.95
                    }
                ];
                const lowConfidenceResult = [
                    {
                        extracted_text: 'sample text',
                        confidence: 0.30
                    }
                ];
                // Test high confidence
                mockSql.getRows.mockReturnValue(highConfidenceResult);
                searchResult.computeScore('sample', ['sample']);
                const highConfidenceScore = searchResult.score;
                // Reset and test low confidence
                searchResult.score = 0;
                mockSql.getRows.mockReturnValue(lowConfidenceResult);
                searchResult.computeScore('sample', ['sample']);
                const lowConfidenceScore = searchResult.score;
                expect(highConfidenceScore).toBeGreaterThan(lowConfidenceScore);
            });
            it('should handle multiple OCR results', () => {
                const multipleResults = [
                    {
                        extracted_text: 'first sample text',
                        confidence: 0.90
                    },
                    {
                        extracted_text: 'second sample document',
                        confidence: 0.85
                    }
                ];
                mockSql.getRows.mockReturnValue(multipleResults);
                searchResult.computeScore('sample', ['sample']);
                expect(searchResult.score).toBeGreaterThan(0);
                // Score should account for multiple matches
            });
            it('should skip OCR scoring when OCR is disabled', () => {
                mockOptions.getOptionBool.mockReturnValue(false);
                searchResult.computeScore('sample', ['sample']);
                expect(mockSql.getRows).not.toHaveBeenCalled();
            });
            it('should handle OCR scoring errors gracefully', () => {
                mockSql.getRows.mockImplementation(() => {
                    throw new Error('Database error');
                });
                expect(() => {
                    searchResult.computeScore('sample', ['sample']);
                }).not.toThrow();
                // Score should still be calculated from other factors
                expect(searchResult.score).toBeGreaterThanOrEqual(0);
            });
        });
        describe('hidden notes penalty', () => {
            it('should apply penalty for hidden notes', () => {
                mockBecca.notes['test123'].isInHiddenSubtree.mockReturnValue(true);
                searchResult.computeScore('test', ['test']);
                const hiddenScore = searchResult.score;
                // Reset and test non-hidden
                mockBecca.notes['test123'].isInHiddenSubtree.mockReturnValue(false);
                searchResult.score = 0;
                searchResult.computeScore('test', ['test']);
                const normalScore = searchResult.score;
                expect(normalScore).toBeGreaterThan(hiddenScore);
                // Ties the penalty to SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY === 3.
                expect(hiddenScore).toBe(normalScore / 3);
            });
        });
    });
    describe('addScoreForStrings', () => {
        let searchResult: any;
        beforeEach(() => {
            searchResult = new SearchResult(['root', 'test123']);
        });
        it('should give highest score for exact token match', () => {
            searchResult.addScoreForStrings(['sample'], 'sample text', 1.0);
            const exactScore = searchResult.score;
            searchResult.score = 0;
            searchResult.addScoreForStrings(['sample'], 'sampling text', 1.0);
            const prefixScore = searchResult.score;
            searchResult.score = 0;
            searchResult.addScoreForStrings(['sample'], 'text sample text', 1.0);
            const partialScore = searchResult.score;
            expect(exactScore).toBeGreaterThan(prefixScore);
            expect(exactScore).toBeGreaterThanOrEqual(partialScore);
        });
        it('should apply factor multiplier correctly', () => {
            searchResult.addScoreForStrings(['sample'], 'sample text', 2.0);
            const doubleFactorScore = searchResult.score;
            searchResult.score = 0;
            searchResult.addScoreForStrings(['sample'], 'sample text', 1.0);
            const singleFactorScore = searchResult.score;
            expect(doubleFactorScore).toBe(singleFactorScore * 2);
        });
        it('should handle multiple tokens', () => {
            searchResult.addScoreForStrings(['hello', 'world'], 'hello world test', 1.0);
            expect(searchResult.score).toBeGreaterThan(0);
        });
        it('should be case insensitive', () => {
            searchResult.addScoreForStrings(['sample'], 'sample text', 1.0);
            const lowerCaseScore = searchResult.score;
            searchResult.score = 0;
            searchResult.addScoreForStrings(['sample'], 'SAMPLE text', 1.0);
            const upperCaseScore = searchResult.score;
            expect(upperCaseScore).toEqual(lowerCaseScore);
            expect(upperCaseScore).toBeGreaterThan(0);
        });
    });
    describe('addOCRScore', () => {
        let searchResult: any;
        beforeEach(() => {
            searchResult = new SearchResult(['root', 'test123']);
        });
        it('should query for both note and attachment OCR results', () => {
            mockOptions.getOptionBool.mockReturnValue(true);
            mockSql.getRows.mockReturnValue([]);
            searchResult.addOCRScore(['sample'], 1.5);
            // NOTE(review): table name expectation predates the blobs/textRepresentation rename.
            expect(mockSql.getRows).toHaveBeenCalledWith(
                expect.stringContaining('FROM ocr_results'),
                ['test123', 'test123']
            );
        });
        it('should apply minimum confidence multiplier', () => {
            mockOptions.getOptionBool.mockReturnValue(true);
            const lowConfidenceResult = [
                {
                    extracted_text: 'sample text',
                    confidence: 0.1 // Very low confidence
                }
            ];
            mockSql.getRows.mockReturnValue(lowConfidenceResult);
            searchResult.addOCRScore(['sample'], 1.0);
            // Should still get some score due to minimum 0.5x multiplier
            expect(searchResult.score).toBeGreaterThan(0);
        });
        it('should handle database query errors', () => {
            mockOptions.getOptionBool.mockReturnValue(true);
            mockSql.getRows.mockImplementation(() => {
                throw new Error('Database connection failed');
            });
            // Should not throw error
            expect(() => {
                searchResult.addOCRScore(['sample'], 1.5);
            }).not.toThrow();
        });
        it('should skip when OCR is disabled', () => {
            mockOptions.getOptionBool.mockReturnValue(false);
            searchResult.addOCRScore(['sample'], 1.5);
            expect(mockSql.getRows).not.toHaveBeenCalled();
        });
        it('should handle options service errors', () => {
            mockOptions.getOptionBool.mockImplementation(() => {
                throw new Error('Options service unavailable');
            });
            expect(() => {
                searchResult.addOCRScore(['sample'], 1.5);
            }).not.toThrow();
            expect(mockSql.getRows).not.toHaveBeenCalled();
        });
    });
});

View File

@@ -1,28 +1,30 @@
"use strict";
import { dayjs } from "@triliumnext/commons";
import { removeDiacritic } from "../../utils.js";
import AncestorExp from "../expressions/ancestor.js";
import AndExp from "../expressions/and.js";
import OrExp from "../expressions/or.js";
import NotExp from "../expressions/not.js";
import AttributeExistsExp from "../expressions/attribute_exists.js";
import ChildOfExp from "../expressions/child_of.js";
import DescendantOfExp from "../expressions/descendant_of.js";
import ParentOfExp from "../expressions/parent_of.js";
import RelationWhereExp from "../expressions/relation_where.js";
import PropertyComparisonExp from "../expressions/property_comparison.js";
import AttributeExistsExp from "../expressions/attribute_exists.js";
import LabelComparisonExp from "../expressions/label_comparison.js";
import NoteFlatTextExp from "../expressions/note_flat_text.js";
import NoteContentFulltextExp from "../expressions/note_content_fulltext.js";
import OrderByAndLimitExp from "../expressions/order_by_and_limit.js";
import AncestorExp from "../expressions/ancestor.js";
import buildComparator from "./build_comparator.js";
import ValueExtractor from "../value_extractor.js";
import { removeDiacritic } from "../../utils.js";
import TrueExp from "../expressions/true.js";
import IsHiddenExp from "../expressions/is_hidden.js";
import type SearchContext from "../search_context.js";
import type { TokenData, TokenStructure } from "./types.js";
import type Expression from "../expressions/expression.js";
import IsHiddenExp from "../expressions/is_hidden.js";
import LabelComparisonExp from "../expressions/label_comparison.js";
import NotExp from "../expressions/not.js";
import NoteContentFulltextExp from "../expressions/note_content_fulltext.js";
import NoteFlatTextExp from "../expressions/note_flat_text.js";
import OCRContentExpression from "../expressions/ocr_content.js";
import OrExp from "../expressions/or.js";
import OrderByAndLimitExp from "../expressions/order_by_and_limit.js";
import ParentOfExp from "../expressions/parent_of.js";
import PropertyComparisonExp from "../expressions/property_comparison.js";
import RelationWhereExp from "../expressions/relation_where.js";
import TrueExp from "../expressions/true.js";
import type SearchContext from "../search_context.js";
import ValueExtractor from "../value_extractor.js";
import buildComparator from "./build_comparator.js";
import type { TokenData, TokenStructure } from "./types.js";
function getFulltext(_tokens: TokenData[], searchContext: SearchContext, leadingOperator?: string) {
const tokens: string[] = _tokens.map((t) => removeDiacritic(t.token));
@@ -42,16 +44,33 @@ function getFulltext(_tokens: TokenData[], searchContext: SearchContext, leading
// Exact match on title OR exact match on content OR exact match in flat text (includes attributes)
// For multi-word, join tokens with space to form exact phrase
const titleSearchValue = tokens.join(" ");
return new OrExp([
const exactMatchExpressions: Expression[] = [
new PropertyComparisonExp(searchContext, "title", "=", titleSearchValue),
new NoteContentFulltextExp("=", { tokens, flatText: false }),
new NoteContentFulltextExp("=", { tokens, flatText: true })
]);
];
// Add OCR content search for each token
for (const token of tokens) {
exactMatchExpressions.push(new OCRContentExpression(token));
}
return new OrExp(exactMatchExpressions);
}
return new OrExp([new NoteFlatTextExp(tokens), new NoteContentFulltextExp(operator, { tokens, flatText: true })]);
} else {
return new NoteFlatTextExp(tokens);
const searchExpressions: Expression[] = [
new NoteFlatTextExp(tokens),
new NoteContentFulltextExp(operator, { tokens, flatText: true })
];
// Add OCR content search for each token
for (const token of tokens) {
searchExpressions.push(new OCRContentExpression(token));
}
return new OrExp(searchExpressions);
}
return new NoteFlatTextExp(tokens);
}
const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%=", "~=", "~*"]);
@@ -298,9 +317,9 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
searchContext.addError(`Relation can be compared only with property, e.g. ~relation.title=hello in ${context(i)}`);
return null;
} else {
return new AttributeExistsExp("relation", relationName, searchContext.fuzzyAttributeSearch);
}
return new AttributeExistsExp("relation", relationName, searchContext.fuzzyAttributeSearch);
}
function parseOrderByAndLimit() {
@@ -308,7 +327,7 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
valueExtractor: ValueExtractor;
direction: string;
}[] = [];
let limit: number | undefined = undefined;
let limit: number | undefined;
if (tokens[i].token === "orderby") {
do {
@@ -354,9 +373,9 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
return AndExp.of(expressions);
} else if (op === "or") {
return OrExp.of(expressions);
} else {
throw new Error(`Unrecognized op=${op}`);
}
throw new Error(`Unrecognized op=${op}`);
}
for (i = 0; i < tokens.length; i++) {
@@ -423,7 +442,7 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
} else if (op !== token) {
searchContext.addError("Mixed usage of AND/OR - always use parenthesis to group AND/OR expressions.");
}
} else if (isOperator({ token: token })) {
} else if (isOperator({ token })) {
searchContext.addError(`Misplaced or incomplete expression "${token}"`);
} else {
searchContext.addError(`Unrecognized expression "${token}"`);
@@ -493,9 +512,9 @@ function getAncestorExp({ ancestorNoteId, ancestorDepth, includeHiddenNotes }: S
return new AncestorExp(ancestorNoteId, ancestorDepth);
} else if (!includeHiddenNotes) {
return new NotExp(new IsHiddenExp());
} else {
return null;
}
return null;
}
export default parse;

View File

@@ -9,7 +9,7 @@
"preview": "pnpm build && vite preview"
},
"dependencies": {
"i18next": "25.10.10",
"i18next": "26.0.1",
"i18next-http-backend": "3.0.2",
"preact": "10.29.0",
"preact-iso": "2.11.1",

View File

@@ -27,8 +27,7 @@ export function initTranslations(lng: string) {
initAsync: false,
react: {
useSuspense: false
},
showSupportNotice: false
}
});
}

BIN
eng.traineddata Normal file

Binary file not shown.

View File

@@ -36,7 +36,7 @@
"test:all": "pnpm test:parallel && pnpm test:sequential",
"test:parallel": "pnpm --filter=!server --filter=!ckeditor5-mermaid --filter=!ckeditor5-math --parallel test",
"test:sequential": "pnpm --filter=server --filter=ckeditor5-mermaid --filter=ckeditor5-math --sequential test",
"typecheck": "tsc --build",
"typecheck": "tsx scripts/filter-tsc-output.mts",
"dev:format-check": "eslint -c eslint.format.config.mjs .",
"dev:format-fix": "eslint -c eslint.format.config.mjs . --fix",
"dev:linter-check": "cross-env NODE_OPTIONS=--max_old_space_size=4096 eslint .",

View File

@@ -144,6 +144,12 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions<KeyboardActi
// AI / LLM
/** JSON array of configured LLM providers with their API keys */
llmProviders: string;
// OCR options
ocrEnabled: boolean;
ocrLanguage: string;
ocrAutoProcessImages: boolean;
ocrMinConfidence: string;
}
export type OptionNames = keyof OptionDefinitions;

View File

@@ -72,6 +72,7 @@ export interface BlobRow {
blobId: string;
content: string | Buffer;
contentLength: number;
textRepresentation?: string | null;
dateModified: string;
utcDateModified: string;
}

623
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,56 @@
/**
 * Runs `tsc --build` and filters out noisy cascade errors (TS6305).
 * Numbers each remaining error and prints a summary at the end.
 */
import { execSync } from "child_process";

const SUPPRESSED_CODES = [ "TS6305" ];
const ERROR_LINE_PATTERN = /^.+\(\d+,\d+\): error TS\d+:/;

/** Invokes the compiler and returns its combined diagnostic output. */
function runTsc(): string {
    try {
        return execSync("tsc --build", {
            encoding: "utf-8",
            stdio: [ "inherit", "pipe", "pipe" ]
        });
    } catch (err: unknown) {
        // tsc exits non-zero when diagnostics exist; they are attached to the
        // thrown error's stdout/stderr.
        const execErr = err as { stdout?: string; stderr?: string };
        return (execErr.stdout ?? "") + (execErr.stderr ?? "");
    }
}

const relevantLines = runTsc()
    .split(/\r?\n/)
    .filter((line) => !SUPPRESSED_CODES.some((code) => line.includes(code)));

const reported: string[] = [];
const alreadySeen = new Set<string>();
let errorCount = 0;
let droppingContinuation = false;

for (const line of relevantLines) {
    if (ERROR_LINE_PATTERN.test(line)) {
        // Duplicate diagnostic: drop it together with its context lines.
        if (alreadySeen.has(line)) {
            droppingContinuation = true;
            continue;
        }
        alreadySeen.add(line);
        droppingContinuation = false;
        errorCount++;
        reported.push(`[${errorCount}] ${line}`);
    } else if (line.trim()) {
        // Indented continuation context for the preceding multi-line error.
        if (!droppingContinuation) {
            reported.push(line);
        }
    }
}

if (errorCount > 0) {
    console.log(reported.join("\n"));
    console.log(`\n${errorCount} error(s) found.`);
    process.exit(1);
} else {
    console.log("No errors found.");
}