mirror of
https://github.com/zadam/trilium.git
synced 2026-04-01 17:50:26 +02:00
Compare commits
50 Commits
feature/mc
...
feat/add-o
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6393d2c188 | ||
|
|
d9f0a163cf | ||
|
|
6534beec14 | ||
|
|
6d050340ee | ||
|
|
0e7f7fa208 | ||
|
|
287be0bd25 | ||
|
|
18cf2ff873 | ||
|
|
b626fb448b | ||
|
|
38f6fb5a7f | ||
|
|
5846df7d02 | ||
|
|
9462d6109c | ||
|
|
0d805a01c1 | ||
|
|
7f1e4c0969 | ||
|
|
e55cd7841f | ||
|
|
b9cef158d8 | ||
|
|
5ec6141369 | ||
|
|
55ac1e01f2 | ||
|
|
65b58c3668 | ||
|
|
2cb4e5e8dc | ||
|
|
72cea245f1 | ||
|
|
08ca86c68a | ||
|
|
925c9c1e7b | ||
|
|
6212ea0304 | ||
|
|
f295592134 | ||
|
|
69b0973e6d | ||
|
|
422d318dac | ||
|
|
c55aa6ee88 | ||
|
|
090b175152 | ||
|
|
11e9b097a2 | ||
|
|
2adfc1d32b | ||
|
|
99fa5d89e7 | ||
|
|
ca8cbf8ccf | ||
|
|
6722d2d266 | ||
|
|
508cbeaa1b | ||
|
|
e040865905 | ||
|
|
a7878dd2c6 | ||
|
|
02980834ad | ||
|
|
2a8c8871c4 | ||
|
|
893be24c1d | ||
|
|
9029f59410 | ||
|
|
4b5e8d33a6 | ||
|
|
09196c045f | ||
|
|
7868ebec1e | ||
|
|
80a9182f05 | ||
|
|
d20b3d854f | ||
|
|
f1356228a3 | ||
|
|
a4adc51e50 | ||
|
|
864543e4f9 | ||
|
|
33a549202b | ||
|
|
c4a0219b18 |
@@ -54,7 +54,7 @@
|
||||
"draggabilly": "3.0.0",
|
||||
"force-graph": "1.51.2",
|
||||
"globals": "17.4.0",
|
||||
"i18next": "25.10.10",
|
||||
"i18next": "26.0.1",
|
||||
"i18next-http-backend": "3.0.2",
|
||||
"jquery": "4.0.0",
|
||||
"jquery.fancytree": "2.38.5",
|
||||
|
||||
@@ -302,6 +302,7 @@ export type CommandMappings = {
|
||||
ninthTab: CommandData;
|
||||
lastTab: CommandData;
|
||||
showNoteSource: CommandData;
|
||||
showNoteOCRText: CommandData;
|
||||
showSQLConsole: CommandData;
|
||||
showBackendLog: CommandData;
|
||||
showCheatsheet: CommandData;
|
||||
|
||||
@@ -148,6 +148,19 @@ export default class RootCommandExecutor extends Component {
|
||||
}
|
||||
}
|
||||
|
||||
async showNoteOCRTextCommand() {
|
||||
const notePath = appContext.tabManager.getActiveContextNotePath();
|
||||
|
||||
if (notePath) {
|
||||
await appContext.tabManager.openTabWithNoteWithHoisting(notePath, {
|
||||
activate: true,
|
||||
viewScope: {
|
||||
viewMode: "ocr"
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async showAttachmentsCommand() {
|
||||
const notePath = appContext.tabManager.getActiveContextNotePath();
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ export interface RenderOptions {
|
||||
includeArchivedNotes?: boolean;
|
||||
/** Set of note IDs that have already been seen during rendering to prevent infinite recursion. */
|
||||
seenNoteIds?: Set<string>;
|
||||
showTextRepresentation?: boolean;
|
||||
}
|
||||
|
||||
const CODE_MIME_TYPES = new Set(["application/json"]);
|
||||
@@ -55,7 +56,7 @@ export async function getRenderedContent(this: {} | { ctx: string }, entity: FNo
|
||||
} else if (type === "code") {
|
||||
await renderCode(entity, $renderedContent);
|
||||
} else if (["image", "canvas", "mindMap", "spreadsheet"].includes(type)) {
|
||||
renderImage(entity, $renderedContent, options);
|
||||
await renderImage(entity, $renderedContent, options);
|
||||
} else if (!options.tooltip && ["file", "pdf", "audio", "video"].includes(type)) {
|
||||
await renderFile(entity, type, $renderedContent);
|
||||
} else if (type === "mermaid") {
|
||||
@@ -138,7 +139,7 @@ async function renderCode(note: FNote | FAttachment, $renderedContent: JQuery<HT
|
||||
await applySingleBlockSyntaxHighlight($codeBlock, normalizeMimeTypeForCKEditor(note.mime));
|
||||
}
|
||||
|
||||
function renderImage(entity: FNote | FAttachment, $renderedContent: JQuery<HTMLElement>, options: RenderOptions = {}) {
|
||||
async function renderImage(entity: FNote | FAttachment, $renderedContent: JQuery<HTMLElement>, options: RenderOptions = {}) {
|
||||
const encodedTitle = encodeURIComponent(entity.title);
|
||||
|
||||
let url;
|
||||
@@ -178,9 +179,39 @@ function renderImage(entity: FNote | FAttachment, $renderedContent: JQuery<HTMLE
|
||||
}
|
||||
|
||||
imageContextMenuService.setupContextMenu($img);
|
||||
|
||||
// Add OCR text display for image notes
|
||||
if (entity instanceof FNote && options.showTextRepresentation) {
|
||||
await addOCRTextIfAvailable(entity, $renderedContent);
|
||||
}
|
||||
}
|
||||
|
||||
async function renderFile(entity: FNote | FAttachment, type: string, $renderedContent: JQuery<HTMLElement>) {
|
||||
async function addOCRTextIfAvailable(note: FNote, $content: JQuery<HTMLElement>) {
|
||||
try {
|
||||
const response = await fetch(`api/ocr/notes/${note.noteId}/text`);
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
if (data.success && data.hasOcr && data.text) {
|
||||
const $ocrSection = $(`
|
||||
<div class="ocr-text-section">
|
||||
<div class="ocr-header">
|
||||
<span class="bx bx-text"></span> ${t("ocr.extracted_text")}
|
||||
</div>
|
||||
<div class="ocr-content"></div>
|
||||
</div>
|
||||
`);
|
||||
|
||||
$ocrSection.find('.ocr-content').text(data.text);
|
||||
$content.append($ocrSection);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
// Silently fail if OCR API is not available
|
||||
console.debug('Failed to fetch OCR text:', error);
|
||||
}
|
||||
}
|
||||
|
||||
async function renderFile(entity: FNote | FAttachment, type: string, $renderedContent: JQuery<HTMLElement>, options: RenderOptions = {}) {
|
||||
let entityType, entityId;
|
||||
|
||||
if (entity instanceof FNote) {
|
||||
@@ -220,6 +251,11 @@ async function renderFile(entity: FNote | FAttachment, type: string, $renderedCo
|
||||
$content.append($videoPreview);
|
||||
}
|
||||
|
||||
// Add OCR text display for file notes
|
||||
if (entity instanceof FNote && options.showTextRepresentation) {
|
||||
await addOCRTextIfAvailable(entity, $content);
|
||||
}
|
||||
|
||||
if (entityType === "notes" && "noteId" in entity) {
|
||||
// TODO: we should make this available also for attachments, but there's a problem with "Open externally" support
|
||||
// in attachment list
|
||||
|
||||
@@ -24,8 +24,7 @@ export async function initLocale() {
|
||||
backend: {
|
||||
loadPath: `${window.glob.assetPath}/translations/{{lng}}/{{ns}}.json`
|
||||
},
|
||||
returnEmptyString: false,
|
||||
showSupportNotice: false
|
||||
returnEmptyString: false
|
||||
});
|
||||
|
||||
await setDayjsLocale(locale);
|
||||
|
||||
@@ -28,7 +28,7 @@ async function getLinkIcon(noteId: string, viewMode: ViewMode | undefined) {
|
||||
return icon;
|
||||
}
|
||||
|
||||
export type ViewMode = "default" | "source" | "attachments" | "contextual-help" | "note-map";
|
||||
export type ViewMode = "default" | "source" | "attachments" | "contextual-help" | "note-map" | "ocr";
|
||||
|
||||
export interface ViewScope {
|
||||
/**
|
||||
|
||||
@@ -270,7 +270,11 @@ function ajax(url: string, method: string, data: unknown, headers: Headers, opts
|
||||
} else if (opts.silentInternalServerError && jqXhr.status === 500) {
|
||||
// report nothing
|
||||
} else {
|
||||
await reportError(method, url, jqXhr.status, jqXhr.responseText);
|
||||
try {
|
||||
await reportError(method, url, jqXhr.status, jqXhr.responseText);
|
||||
} catch {
|
||||
// reportError may throw (e.g. ValidationError); ensure rej() is still called below.
|
||||
}
|
||||
}
|
||||
|
||||
rej(jqXhr.responseText);
|
||||
|
||||
@@ -2641,3 +2641,26 @@ iframe.print-iframe {
|
||||
min-height: 50px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.ocr-text-section {
|
||||
margin: 10px 0;
|
||||
padding: 10px;
|
||||
background: var(--accented-background-color);
|
||||
border-left: 3px solid var(--main-border-color);
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
.ocr-header {
|
||||
font-weight: bold;
|
||||
margin-bottom: 8px;
|
||||
font-size: 0.9em;
|
||||
color: var(--muted-text-color);
|
||||
}
|
||||
|
||||
.ocr-content {
|
||||
max-height: 150px;
|
||||
overflow-y: auto;
|
||||
font-size: 0.9em;
|
||||
line-height: 1.4;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
@@ -691,6 +691,7 @@
|
||||
"search_in_note": "Search in note",
|
||||
"note_source": "Note source",
|
||||
"note_attachments": "Note attachments",
|
||||
"view_ocr_text": "View OCR text",
|
||||
"open_note_externally": "Open note externally",
|
||||
"open_note_externally_title": "File will be open in an external application and watched for changes. You'll then be able to upload the modified version back to Trilium.",
|
||||
"open_note_custom": "Open note custom",
|
||||
@@ -1259,7 +1260,22 @@
|
||||
"enable_image_compression": "Enable image compression",
|
||||
"max_image_dimensions": "Max width / height of an image (image will be resized if it exceeds this setting).",
|
||||
"max_image_dimensions_unit": "pixels",
|
||||
"jpeg_quality_description": "JPEG quality (10 - worst quality, 100 - best quality, 50 - 85 is recommended)"
|
||||
"jpeg_quality_description": "JPEG quality (10 - worst quality, 100 - best quality, 50 - 85 is recommended)",
|
||||
"ocr_section_title": "Optical Character Recognition (OCR)",
|
||||
"enable_ocr": "Enable OCR for images",
|
||||
"ocr_description": "Automatically extract text from images using OCR technology. This makes image content searchable within your notes.",
|
||||
"ocr_auto_process": "Automatically process new images with OCR",
|
||||
"ocr_language": "OCR Language",
|
||||
"ocr_min_confidence": "Minimum confidence threshold",
|
||||
"ocr_confidence_unit": "(0.0-1.0)",
|
||||
"ocr_confidence_description": "Only extract text with confidence above this threshold. Lower values include more text but may be less accurate.",
|
||||
"batch_ocr_title": "Process Existing Images",
|
||||
"batch_ocr_description": "Process all existing images in your notes with OCR. This may take some time depending on the number of images.",
|
||||
"batch_ocr_start": "Start Batch OCR Processing",
|
||||
"batch_ocr_starting": "Starting batch OCR processing...",
|
||||
"batch_ocr_progress": "Processing {{processed}} of {{total}} images...",
|
||||
"batch_ocr_completed": "Batch OCR completed! Processed {{processed}} images.",
|
||||
"batch_ocr_error": "Error during batch OCR: {{error}}"
|
||||
},
|
||||
"attachment_erasure_timeout": {
|
||||
"attachment_erasure_timeout": "Attachment Erasure Timeout",
|
||||
@@ -2067,6 +2083,20 @@
|
||||
"calendar_view": {
|
||||
"delete_note": "Delete note..."
|
||||
},
|
||||
"ocr": {
|
||||
"extracted_text": "Extracted Text (OCR)",
|
||||
"extracted_text_title": "Extracted Text (OCR)",
|
||||
"loading_text": "Loading OCR text...",
|
||||
"no_text_available": "No OCR text available",
|
||||
"no_text_explanation": "This note has not been processed for OCR text extraction or no text was found.",
|
||||
"failed_to_load": "Failed to load OCR text",
|
||||
"extracted_on": "Extracted on: {{date}}",
|
||||
"unknown_date": "Unknown",
|
||||
"process_now": "Process OCR",
|
||||
"processing": "Processing...",
|
||||
"processing_started": "OCR processing has been started. Please wait a moment and refresh.",
|
||||
"processing_failed": "Failed to start OCR processing"
|
||||
},
|
||||
"command_palette": {
|
||||
"tree-action-name": "Tree: {{name}}",
|
||||
"export_note_title": "Export Note",
|
||||
|
||||
@@ -336,6 +336,8 @@ export async function getExtendedWidgetType(note: FNote | null | undefined, note
|
||||
|
||||
if (noteContext?.viewScope?.viewMode === "source") {
|
||||
resultingType = "readOnlyCode";
|
||||
} else if (noteContext.viewScope?.viewMode === "ocr") {
|
||||
resultingType = "readOnlyOCRText";
|
||||
} else if (noteContext.viewScope?.viewMode === "attachments") {
|
||||
resultingType = noteContext.viewScope.attachmentId ? "attachmentDetail" : "attachmentList";
|
||||
} else if (noteContext.viewScope?.viewMode === "note-map") {
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import { it, describe, expect } from "vitest";
|
||||
import { buildNote } from "../../../test/easy-froca";
|
||||
import { getBoardData } from "./data";
|
||||
import { describe, expect,it } from "vitest";
|
||||
|
||||
import FBranch from "../../../entities/fbranch";
|
||||
import froca from "../../../services/froca";
|
||||
import { buildNote } from "../../../test/easy-froca";
|
||||
import { getBoardData } from "./data";
|
||||
|
||||
describe("Board data", () => {
|
||||
it("deduplicates cloned notes", async () => {
|
||||
|
||||
@@ -27,6 +27,7 @@ const VIEW_MODE_ICON_MAPPINGS: Record<Exclude<ViewMode, "default">, string> = {
|
||||
"contextual-help": "bx bx-help-circle",
|
||||
"note-map": "bx bxs-network-chart",
|
||||
attachments: "bx bx-paperclip",
|
||||
ocr: "bx bx-text"
|
||||
};
|
||||
|
||||
export default function TabSwitcher() {
|
||||
|
||||
@@ -12,7 +12,7 @@ import { TypeWidgetProps } from "./type_widgets/type_widget";
|
||||
* A `NoteType` altered by the note detail widget, taking into consideration whether the note is editable or not and adding special note types such as an empty one,
|
||||
* for protected session or attachment information.
|
||||
*/
|
||||
export type ExtendedNoteType = Exclude<NoteType, "launcher" | "text" | "code" | "llmChat"> | "empty" | "readOnlyCode" | "readOnlyText" | "editableText" | "editableCode" | "attachmentDetail" | "attachmentList" | "protectedSession" | "sqlConsole" | "llmChat";
|
||||
export type ExtendedNoteType = Exclude<NoteType, "launcher" | "text" | "code" | "llmChat"> | "empty" | "readOnlyCode" | "readOnlyText" | "readOnlyOCRText" | "editableText" | "editableCode" | "attachmentDetail" | "attachmentList" | "protectedSession" | "sqlConsole" | "llmChat";
|
||||
|
||||
export type TypeWidget = ((props: TypeWidgetProps) => VNode | JSX.Element | undefined);
|
||||
type NoteTypeView = () => (Promise<{ default: TypeWidget } | TypeWidget> | TypeWidget);
|
||||
@@ -78,6 +78,11 @@ export const TYPE_MAPPINGS: Record<ExtendedNoteType, NoteTypeMapping> = {
|
||||
className: "note-detail-readonly-code",
|
||||
printable: true
|
||||
},
|
||||
readOnlyOCRText: {
|
||||
view: () => import("./type_widgets/ReadOnlyTextRepresentation"),
|
||||
className: "note-detail-ocr-text",
|
||||
printable: true
|
||||
},
|
||||
editableCode: {
|
||||
view: async () => (await import("./type_widgets/code/Code")).EditableCode,
|
||||
className: "note-detail-code",
|
||||
|
||||
@@ -162,6 +162,7 @@ export function NoteContextMenu({ note, noteContext, itemsAtStart, itemsNearNote
|
||||
<CommandItem command="openNoteExternally" icon="bx bx-file-find" disabled={isSearchOrBook || !isElectron} text={t("note_actions.open_note_externally")} title={t("note_actions.open_note_externally_title")} />
|
||||
<CommandItem command="openNoteCustom" icon="bx bx-customize" disabled={isSearchOrBook || isMac || !isElectron} text={t("note_actions.open_note_custom")} />
|
||||
<CommandItem command="showNoteSource" icon="bx bx-code" disabled={!hasSource} text={t("note_actions.note_source")} />
|
||||
<CommandItem command="showNoteOCRText" icon="bx bx-text" disabled={!["image", "file"].includes(noteType)} text={t("note_actions.view_ocr_text")} />
|
||||
{(syncServerHost && isElectron) &&
|
||||
<CommandItem command="openNoteOnServer" icon="bx bx-world" disabled={!syncServerHost} text={t("note_actions.open_note_on_server")} />
|
||||
}
|
||||
|
||||
@@ -0,0 +1,145 @@
|
||||
import { useEffect, useState } from "preact/hooks";
|
||||
|
||||
import { t } from "../../services/i18n";
|
||||
import server from "../../services/server";
|
||||
import toast from "../../services/toast";
|
||||
import { TypeWidgetProps } from "./type_widget";
|
||||
|
||||
interface TextRepresentationResponse {
|
||||
success: boolean;
|
||||
text: string;
|
||||
hasOcr: boolean;
|
||||
extractedAt: string | null;
|
||||
message?: string;
|
||||
}
|
||||
|
||||
type State =
|
||||
| { kind: "loading" }
|
||||
| { kind: "loaded"; text: string; extractedAt: string | null }
|
||||
| { kind: "empty" }
|
||||
| { kind: "error"; message: string };
|
||||
|
||||
export default function ReadOnlyTextRepresentation({ note }: TypeWidgetProps) {
|
||||
const [ state, setState ] = useState<State>({ kind: "loading" });
|
||||
const [ processing, setProcessing ] = useState(false);
|
||||
|
||||
async function fetchText() {
|
||||
setState({ kind: "loading" });
|
||||
|
||||
try {
|
||||
const response = await server.get<TextRepresentationResponse>(`ocr/notes/${note.noteId}/text`);
|
||||
|
||||
if (!response.success) {
|
||||
setState({ kind: "error", message: response.message || t("ocr.failed_to_load") });
|
||||
return;
|
||||
}
|
||||
|
||||
if (!response.hasOcr || !response.text) {
|
||||
setState({ kind: "empty" });
|
||||
return;
|
||||
}
|
||||
|
||||
setState({ kind: "loaded", text: response.text, extractedAt: response.extractedAt });
|
||||
} catch (error: any) {
|
||||
console.error("Error loading text representation:", error);
|
||||
setState({ kind: "error", message: error.message || t("ocr.failed_to_load") });
|
||||
}
|
||||
}
|
||||
|
||||
useEffect(() => { fetchText(); }, [ note.noteId ]);
|
||||
|
||||
async function processOCR() {
|
||||
setProcessing(true);
|
||||
try {
|
||||
const response = await server.post<{ success: boolean; message?: string }>(`ocr/process-note/${note.noteId}`);
|
||||
if (response.success) {
|
||||
toast.showMessage(t("ocr.processing_started"));
|
||||
setTimeout(fetchText, 2000);
|
||||
} else {
|
||||
toast.showError(response.message || t("ocr.processing_failed"));
|
||||
}
|
||||
} catch {
|
||||
// Server errors (4xx/5xx) are already shown as toasts by server.ts.
|
||||
} finally {
|
||||
setProcessing(false);
|
||||
}
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="note-detail-printable" style={{ padding: "10px" }}>
|
||||
<div style={{
|
||||
marginBottom: "10px",
|
||||
padding: "8px 12px",
|
||||
backgroundColor: "var(--main-background-color)",
|
||||
border: "1px solid var(--main-border-color)",
|
||||
borderRadius: "4px",
|
||||
fontWeight: 500
|
||||
}}>
|
||||
<span className="bx bx-text" />{" "}{t("ocr.extracted_text_title")}
|
||||
</div>
|
||||
|
||||
{state.kind === "loading" && (
|
||||
<div style={{ textAlign: "center", padding: "30px", color: "var(--muted-text-color)" }}>
|
||||
<span className="bx bx-loader-alt bx-spin" />{" "}{t("ocr.loading_text")}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{state.kind === "loaded" && (
|
||||
<>
|
||||
<pre style={{
|
||||
whiteSpace: "pre-wrap",
|
||||
fontFamily: "var(--detail-text-font-family)",
|
||||
fontSize: "var(--detail-text-font-size)",
|
||||
lineHeight: 1.6,
|
||||
border: "1px solid var(--main-border-color)",
|
||||
borderRadius: "4px",
|
||||
padding: "15px",
|
||||
backgroundColor: "var(--accented-background-color)",
|
||||
minHeight: "100px"
|
||||
}}>
|
||||
{state.text}
|
||||
</pre>
|
||||
<div style={{ fontSize: "0.9em", color: "var(--muted-text-color)", marginTop: "10px", fontStyle: "italic" }}>
|
||||
{t("ocr.extracted_on", { date: state.extractedAt ? new Date(state.extractedAt).toLocaleString() : t("ocr.unknown_date") })}
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
|
||||
{state.kind === "empty" && (
|
||||
<>
|
||||
<div style={{ color: "var(--muted-text-color)", fontStyle: "italic", textAlign: "center", padding: "30px" }}>
|
||||
<span className="bx bx-info-circle" />{" "}{t("ocr.no_text_available")}
|
||||
</div>
|
||||
<button
|
||||
type="button"
|
||||
className="btn btn-secondary"
|
||||
style={{ marginTop: "15px" }}
|
||||
disabled={processing}
|
||||
onClick={processOCR}
|
||||
>
|
||||
{processing
|
||||
? <><span className="bx bx-loader-alt bx-spin" />{" "}{t("ocr.processing")}</>
|
||||
: <><span className="bx bx-play" />{" "}{t("ocr.process_now")}</>
|
||||
}
|
||||
</button>
|
||||
<div style={{ fontSize: "0.9em", color: "var(--muted-text-color)", marginTop: "10px", fontStyle: "italic" }}>
|
||||
{t("ocr.no_text_explanation")}
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
|
||||
{state.kind === "error" && (
|
||||
<div style={{
|
||||
color: "var(--error-color)",
|
||||
backgroundColor: "var(--error-background-color)",
|
||||
border: "1px solid var(--error-border-color)",
|
||||
padding: "10px",
|
||||
borderRadius: "4px",
|
||||
marginTop: "10px"
|
||||
}}>
|
||||
<span className="bx bx-error" />{" "}{state.message}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -68,6 +68,7 @@
|
||||
"@types/serve-static": "2.2.0",
|
||||
"@types/stream-throttle": "0.1.4",
|
||||
"@types/supertest": "7.2.0",
|
||||
"@types/tesseract.js": "2.0.0",
|
||||
"@types/tmp": "0.2.6",
|
||||
"@types/turndown": "5.0.6",
|
||||
"@types/ws": "8.18.1",
|
||||
@@ -115,16 +116,20 @@
|
||||
"mime-types": "3.0.2",
|
||||
"multer": "2.1.1",
|
||||
"normalize-strings": "1.1.1",
|
||||
"officeparser": "5.2.0",
|
||||
"pdf-parse": "1.1.1",
|
||||
"rand-token": "1.0.1",
|
||||
"safe-compare": "1.1.4",
|
||||
"sanitize-filename": "1.6.4",
|
||||
"sanitize-html": "2.17.2",
|
||||
"sax": "1.6.0",
|
||||
"serve-favicon": "2.5.1",
|
||||
"sharp": "0.34.3",
|
||||
"stream-throttle": "0.1.3",
|
||||
"strip-bom": "5.0.0",
|
||||
"striptags": "3.2.0",
|
||||
"supertest": "7.2.2",
|
||||
"tesseract.js": "6.0.1",
|
||||
"swagger-jsdoc": "6.2.8",
|
||||
"time2fa": "1.4.2",
|
||||
"tmp": "0.2.5",
|
||||
|
||||
@@ -107,6 +107,8 @@ CREATE TABLE IF NOT EXISTS "recent_notes"
|
||||
CREATE TABLE IF NOT EXISTS "blobs" (
|
||||
`blobId` TEXT NOT NULL,
|
||||
`content` TEXT NULL DEFAULT NULL,
|
||||
`textRepresentation` TEXT DEFAULT NULL,
|
||||
`textExtractionLastProcessed` TEXT DEFAULT NULL,
|
||||
`dateModified` TEXT NOT NULL,
|
||||
`utcDateModified` TEXT NOT NULL,
|
||||
PRIMARY KEY(`blobId`)
|
||||
|
||||
@@ -10,11 +10,12 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
|
||||
return "blobId";
|
||||
}
|
||||
static get hashedProperties() {
|
||||
return ["blobId", "content"];
|
||||
return ["blobId", "content", "textRepresentation"];
|
||||
}
|
||||
|
||||
content!: string | Buffer;
|
||||
contentLength!: number;
|
||||
textRepresentation?: string | null;
|
||||
|
||||
constructor(row: BlobRow) {
|
||||
super();
|
||||
@@ -25,6 +26,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
|
||||
this.blobId = row.blobId;
|
||||
this.content = row.content;
|
||||
this.contentLength = row.contentLength;
|
||||
this.textRepresentation = row.textRepresentation;
|
||||
this.dateModified = row.dateModified;
|
||||
this.utcDateModified = row.utcDateModified;
|
||||
}
|
||||
@@ -34,6 +36,7 @@ class BBlob extends AbstractBeccaEntity<BBlob> {
|
||||
blobId: this.blobId,
|
||||
content: this.content || null,
|
||||
contentLength: this.contentLength,
|
||||
textRepresentation: this.textRepresentation || null,
|
||||
dateModified: this.dateModified,
|
||||
utcDateModified: this.utcDateModified
|
||||
};
|
||||
|
||||
@@ -6,6 +6,25 @@
|
||||
|
||||
// Migrations should be kept in descending order, so the latest migration is first.
|
||||
const MIGRATIONS: (SqlMigration | JsMigration)[] = [
|
||||
// Add text representation column and last processed timestamp to blobs table
|
||||
{
|
||||
version: 236,
|
||||
sql: /*sql*/`\
|
||||
-- Add text representation column to blobs table
|
||||
ALTER TABLE blobs ADD COLUMN textRepresentation TEXT DEFAULT NULL;
|
||||
|
||||
-- Add OCR last processed timestamp to blobs table
|
||||
ALTER TABLE blobs ADD COLUMN textExtractionLastProcessed TEXT DEFAULT NULL;
|
||||
|
||||
-- Create index for text representation searches
|
||||
CREATE INDEX IF NOT EXISTS idx_blobs_textRepresentation
|
||||
ON blobs (textRepresentation);
|
||||
|
||||
-- Create index for OCR last processed timestamp
|
||||
CREATE INDEX IF NOT EXISTS idx_blobs_textExtractionLastProcessed
|
||||
ON blobs (textExtractionLastProcessed);
|
||||
`
|
||||
},
|
||||
// Add missing database indices for query performance
|
||||
{
|
||||
version: 235,
|
||||
|
||||
75
apps/server/src/routes/api/ocr.spec.ts
Normal file
75
apps/server/src/routes/api/ocr.spec.ts
Normal file
@@ -0,0 +1,75 @@
|
||||
import { describe, expect, it, vi, beforeEach } from "vitest";
|
||||
import ocrRoutes from "./ocr.js";
|
||||
|
||||
// Mock the OCR service
|
||||
vi.mock("../../services/ocr/ocr_service.js", () => ({
|
||||
default: {
|
||||
isOCREnabled: vi.fn(() => true),
|
||||
startBatchProcessing: vi.fn(() => Promise.resolve({ success: true })),
|
||||
getBatchProgress: vi.fn(() => ({ inProgress: false, total: 0, processed: 0 }))
|
||||
}
|
||||
}));
|
||||
|
||||
// Mock becca
|
||||
vi.mock("../../becca/becca.js", () => ({
|
||||
default: {}
|
||||
}));
|
||||
|
||||
// Mock log
|
||||
vi.mock("../../services/log.js", () => ({
|
||||
default: {
|
||||
error: vi.fn()
|
||||
}
|
||||
}));
|
||||
|
||||
describe("OCR API", () => {
|
||||
let mockRequest: any;
|
||||
let mockResponse: any;
|
||||
|
||||
beforeEach(() => {
|
||||
mockRequest = {
|
||||
params: {},
|
||||
body: {},
|
||||
query: {}
|
||||
};
|
||||
|
||||
mockResponse = {
|
||||
status: vi.fn().mockReturnThis(),
|
||||
json: vi.fn().mockReturnThis(),
|
||||
triliumResponseHandled: false
|
||||
};
|
||||
});
|
||||
|
||||
it("should set triliumResponseHandled flag in batch processing", async () => {
|
||||
await ocrRoutes.batchProcessOCR(mockRequest, mockResponse);
|
||||
|
||||
expect(mockResponse.json).toHaveBeenCalledWith({ success: true });
|
||||
expect(mockResponse.triliumResponseHandled).toBe(true);
|
||||
});
|
||||
|
||||
it("should set triliumResponseHandled flag in get batch progress", async () => {
|
||||
await ocrRoutes.getBatchProgress(mockRequest, mockResponse);
|
||||
|
||||
expect(mockResponse.json).toHaveBeenCalledWith({
|
||||
inProgress: false,
|
||||
total: 0,
|
||||
processed: 0
|
||||
});
|
||||
expect(mockResponse.triliumResponseHandled).toBe(true);
|
||||
});
|
||||
|
||||
it("should handle errors and set triliumResponseHandled flag", async () => {
|
||||
// Mock service to throw error
|
||||
const ocrService = await import("../../services/ocr/ocr_service.js");
|
||||
vi.mocked(ocrService.default.startBatchProcessing).mockRejectedValueOnce(new Error("Test error"));
|
||||
|
||||
await ocrRoutes.batchProcessOCR(mockRequest, mockResponse);
|
||||
|
||||
expect(mockResponse.status).toHaveBeenCalledWith(500);
|
||||
expect(mockResponse.json).toHaveBeenCalledWith({
|
||||
success: false,
|
||||
error: "Test error"
|
||||
});
|
||||
expect(mockResponse.triliumResponseHandled).toBe(true);
|
||||
});
|
||||
});
|
||||
324
apps/server/src/routes/api/ocr.ts
Normal file
324
apps/server/src/routes/api/ocr.ts
Normal file
@@ -0,0 +1,324 @@
|
||||
import type { Request } from "express";
|
||||
|
||||
import becca from "../../becca/becca.js";
|
||||
import ocrService from "../../services/ocr/ocr_service.js";
|
||||
import sql from "../../services/sql.js";
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/process-note/{noteId}:
|
||||
* post:
|
||||
* summary: Process OCR for a specific note
|
||||
* operationId: ocr-process-note
|
||||
* parameters:
|
||||
* - name: noteId
|
||||
* in: path
|
||||
* required: true
|
||||
* schema:
|
||||
* type: string
|
||||
* description: ID of the note to process
|
||||
* requestBody:
|
||||
* required: false
|
||||
* content:
|
||||
* application/json:
|
||||
* schema:
|
||||
* type: object
|
||||
* properties:
|
||||
* language:
|
||||
* type: string
|
||||
* description: OCR language code (e.g. 'eng', 'fra', 'deu')
|
||||
* default: 'eng'
|
||||
* forceReprocess:
|
||||
* type: boolean
|
||||
* description: Force reprocessing even if OCR already exists
|
||||
* default: false
|
||||
* responses:
|
||||
* '200':
|
||||
* description: OCR processing completed successfully
|
||||
* '400':
|
||||
* description: Bad request - OCR disabled or unsupported file type
|
||||
* '404':
|
||||
* description: Note not found
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function processNoteOCR(req: Request<{ noteId: string }>) {
|
||||
const { noteId } = req.params;
|
||||
const { language = 'eng', forceReprocess = false } = req.body || {};
|
||||
|
||||
if (!ocrService.isOCREnabled()) {
|
||||
return [400, { success: false, message: 'OCR is not enabled in settings' }];
|
||||
}
|
||||
|
||||
const note = becca.getNote(noteId);
|
||||
if (!note) {
|
||||
return [404, { success: false, message: 'Note not found' }];
|
||||
}
|
||||
|
||||
const result = await ocrService.processNoteOCR(noteId, { language, forceReprocess });
|
||||
if (!result) {
|
||||
return [400, { success: false, message: 'Note is not an image or has unsupported format' }];
|
||||
}
|
||||
|
||||
return { success: true, result };
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/process-attachment/{attachmentId}:
|
||||
* post:
|
||||
* summary: Process OCR for a specific attachment
|
||||
* operationId: ocr-process-attachment
|
||||
* parameters:
|
||||
* - name: attachmentId
|
||||
* in: path
|
||||
* required: true
|
||||
* schema:
|
||||
* type: string
|
||||
* description: ID of the attachment to process
|
||||
* requestBody:
|
||||
* required: false
|
||||
* content:
|
||||
* application/json:
|
||||
* schema:
|
||||
* type: object
|
||||
* properties:
|
||||
* language:
|
||||
* type: string
|
||||
* description: OCR language code (e.g. 'eng', 'fra', 'deu')
|
||||
* default: 'eng'
|
||||
* forceReprocess:
|
||||
* type: boolean
|
||||
* description: Force reprocessing even if OCR already exists
|
||||
* default: false
|
||||
* responses:
|
||||
* '200':
|
||||
* description: OCR processing completed successfully
|
||||
* '400':
|
||||
* description: Bad request - OCR disabled or unsupported file type
|
||||
* '404':
|
||||
* description: Attachment not found
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function processAttachmentOCR(req: Request<{ attachmentId: string }>) {
|
||||
const { attachmentId } = req.params;
|
||||
const { language = 'eng', forceReprocess = false } = req.body || {};
|
||||
|
||||
if (!ocrService.isOCREnabled()) {
|
||||
return [400, { success: false, message: 'OCR is not enabled in settings' }];
|
||||
}
|
||||
|
||||
const attachment = becca.getAttachment(attachmentId);
|
||||
if (!attachment) {
|
||||
return [404, { success: false, message: 'Attachment not found' }];
|
||||
}
|
||||
|
||||
const result = await ocrService.processAttachmentOCR(attachmentId, { language, forceReprocess });
|
||||
if (!result) {
|
||||
return [400, { success: false, message: 'Attachment is not an image or has unsupported format' }];
|
||||
}
|
||||
|
||||
return { success: true, result };
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/search:
|
||||
* get:
|
||||
* summary: Search for text in OCR results
|
||||
* operationId: ocr-search
|
||||
* parameters:
|
||||
* - name: q
|
||||
* in: query
|
||||
* required: true
|
||||
* schema:
|
||||
* type: string
|
||||
* description: Search query text
|
||||
* responses:
|
||||
* '200':
|
||||
* description: Search results
|
||||
* '400':
|
||||
* description: Bad request - missing search query
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function searchOCR(req: Request) {
|
||||
const { q: searchText } = req.query;
|
||||
|
||||
if (!searchText || typeof searchText !== 'string') {
|
||||
return [400, { success: false, message: 'Search query is required' }];
|
||||
}
|
||||
|
||||
const results = ocrService.searchOCRResults(searchText);
|
||||
return { success: true, results };
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/batch-process:
|
||||
* post:
|
||||
* summary: Process OCR for all images without existing OCR results
|
||||
* operationId: ocr-batch-process
|
||||
* responses:
|
||||
* '200':
|
||||
* description: Batch processing initiated successfully
|
||||
* '400':
|
||||
* description: Bad request - OCR disabled or already processing
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function batchProcessOCR() {
|
||||
const result = await ocrService.startBatchProcessing();
|
||||
if (!result.success) {
|
||||
return [400, result];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/batch-progress:
|
||||
* get:
|
||||
* summary: Get batch OCR processing progress
|
||||
* operationId: ocr-batch-progress
|
||||
* responses:
|
||||
* '200':
|
||||
* description: Batch processing progress information
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function getBatchProgress() {
|
||||
return ocrService.getBatchProgress();
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/stats:
|
||||
* get:
|
||||
* summary: Get OCR processing statistics
|
||||
* operationId: ocr-get-stats
|
||||
* responses:
|
||||
* '200':
|
||||
* description: OCR statistics
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function getOCRStats() {
|
||||
return { success: true, stats: ocrService.getOCRStats() };
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/delete/{blobId}:
|
||||
* delete:
|
||||
* summary: Delete OCR results for a specific blob
|
||||
* operationId: ocr-delete-results
|
||||
* parameters:
|
||||
* - name: blobId
|
||||
* in: path
|
||||
* required: true
|
||||
* schema:
|
||||
* type: string
|
||||
* description: ID of the blob
|
||||
* responses:
|
||||
* '200':
|
||||
* description: OCR results deleted successfully
|
||||
* '400':
|
||||
* description: Bad request - invalid parameters
|
||||
* '500':
|
||||
* description: Internal server error
|
||||
* security:
|
||||
* - session: []
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function deleteOCRResults(req: Request<{ blobId: string }>) {
|
||||
const { blobId } = req.params;
|
||||
|
||||
ocrService.deleteOCRResult(blobId);
|
||||
return { success: true, message: `OCR results deleted for blob ${blobId}` };
|
||||
}
|
||||
|
||||
/**
|
||||
* @swagger
|
||||
* /api/ocr/notes/{noteId}/text:
|
||||
* get:
|
||||
* summary: Get OCR text for a specific note
|
||||
* operationId: ocr-get-note-text
|
||||
* parameters:
|
||||
* - name: noteId
|
||||
* in: path
|
||||
* required: true
|
||||
* schema:
|
||||
* type: string
|
||||
* description: Note ID to get OCR text for
|
||||
* responses:
|
||||
* 200:
|
||||
* description: OCR text retrieved successfully
|
||||
* 404:
|
||||
* description: Note not found
|
||||
* tags: ["ocr"]
|
||||
*/
|
||||
async function getNoteOCRText(req: Request<{ noteId: string }>) {
|
||||
const { noteId } = req.params;
|
||||
|
||||
const note = becca.getNote(noteId);
|
||||
if (!note) {
|
||||
return [404, { success: false, message: 'Note not found' }];
|
||||
}
|
||||
|
||||
let ocrText: string | null = null;
|
||||
let extractedAt: string | null = null;
|
||||
|
||||
if (note.blobId) {
|
||||
const result = sql.getRow<{
|
||||
textRepresentation: string | null;
|
||||
textExtractionLastProcessed: string | null;
|
||||
}>(`
|
||||
SELECT textRepresentation, textExtractionLastProcessed
|
||||
FROM blobs
|
||||
WHERE blobId = ?
|
||||
`, [note.blobId]);
|
||||
|
||||
if (result) {
|
||||
ocrText = result.textRepresentation;
|
||||
extractedAt = result.textExtractionLastProcessed;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
text: ocrText || '',
|
||||
hasOcr: !!ocrText,
|
||||
extractedAt
|
||||
};
|
||||
}
|
||||
|
||||
export default {
|
||||
processNoteOCR,
|
||||
processAttachmentOCR,
|
||||
searchOCR,
|
||||
batchProcessOCR,
|
||||
getBatchProgress,
|
||||
getOCRStats,
|
||||
deleteOCRResults,
|
||||
getNoteOCRText
|
||||
};
|
||||
@@ -105,7 +105,13 @@ const ALLOWED_OPTIONS = new Set<OptionNames>([
|
||||
"newLayout",
|
||||
"mfaEnabled",
|
||||
"mfaMethod",
|
||||
"llmProviders"
|
||||
"llmProviders",
|
||||
|
||||
// OCR options
|
||||
"ocrEnabled",
|
||||
"ocrLanguage",
|
||||
"ocrAutoProcessImages",
|
||||
"ocrMinConfidence"
|
||||
]);
|
||||
|
||||
function getOptions() {
|
||||
|
||||
@@ -39,6 +39,7 @@ import loginApiRoute from "./api/login.js";
|
||||
import metricsRoute from "./api/metrics.js";
|
||||
import noteMapRoute from "./api/note_map.js";
|
||||
import notesApiRoute from "./api/notes.js";
|
||||
import ocrRoute from "./api/ocr.js";
|
||||
import optionsApiRoute from "./api/options.js";
|
||||
import otherRoute from "./api/other.js";
|
||||
import passwordApiRoute from "./api/password.js";
|
||||
@@ -376,6 +377,16 @@ function register(app: express.Application) {
|
||||
etapiBackupRoute.register(router);
|
||||
etapiMetricsRoute.register(router);
|
||||
|
||||
// OCR API
|
||||
asyncApiRoute(PST, "/api/ocr/process-note/:noteId", ocrRoute.processNoteOCR);
|
||||
asyncApiRoute(PST, "/api/ocr/process-attachment/:attachmentId", ocrRoute.processAttachmentOCR);
|
||||
asyncApiRoute(GET, "/api/ocr/search", ocrRoute.searchOCR);
|
||||
asyncApiRoute(PST, "/api/ocr/batch-process", ocrRoute.batchProcessOCR);
|
||||
asyncApiRoute(GET, "/api/ocr/batch-progress", ocrRoute.getBatchProgress);
|
||||
asyncApiRoute(GET, "/api/ocr/stats", ocrRoute.getOCRStats);
|
||||
asyncApiRoute(DEL, "/api/ocr/delete/:blobId", ocrRoute.deleteOCRResults);
|
||||
asyncApiRoute(GET, "/api/ocr/notes/:noteId/text", ocrRoute.getNoteOCRText);
|
||||
|
||||
app.use("", router);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ import packageJson from "../../package.json" with { type: "json" };
|
||||
import build from "./build.js";
|
||||
import dataDir from "./data_dir.js";
|
||||
|
||||
const APP_DB_VERSION = 235;
|
||||
const APP_DB_VERSION = 236;
|
||||
const SYNC_VERSION = 37;
|
||||
const CLIPPER_PROTOCOL_VERSION = "1.0";
|
||||
|
||||
|
||||
@@ -6,6 +6,9 @@ import becca from "../becca/becca.js";
|
||||
import BAttribute from "../becca/entities/battribute.js";
|
||||
import hiddenSubtreeService from "./hidden_subtree.js";
|
||||
import oneTimeTimer from "./one_time_timer.js";
|
||||
import ocrService from "./ocr/ocr_service.js";
|
||||
import optionService from "./options.js";
|
||||
import log from "./log.js";
|
||||
import type BNote from "../becca/entities/bnote.js";
|
||||
import type AbstractBeccaEntity from "../becca/entities/abstract_becca_entity.js";
|
||||
import type { DefinitionObject } from "./promoted_attribute_definition_interface.js";
|
||||
@@ -137,6 +140,25 @@ eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) =>
|
||||
}
|
||||
} else if (entityName === "notes") {
|
||||
runAttachedRelations(entity, "runOnNoteCreation", entity);
|
||||
|
||||
// Note: OCR processing for images is now handled in image.ts during image processing
|
||||
// OCR processing for files remains here since they don't go through image processing
|
||||
// Only auto-process if both OCR is enabled and auto-processing is enabled
|
||||
if (entity.type === 'file' && ocrService.isOCREnabled() && optionService.getOptionBool("ocrAutoProcessImages")) {
|
||||
// Check if the file MIME type is supported by any OCR processor
|
||||
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
|
||||
|
||||
if (entity.mime && supportedMimeTypes.includes(entity.mime)) {
|
||||
// Process OCR asynchronously to avoid blocking note creation
|
||||
ocrService.processNoteOCR(entity.noteId).then(result => {
|
||||
if (result) {
|
||||
log.info(`Automatically processed OCR for file note ${entity.noteId} with MIME type ${entity.mime}`);
|
||||
}
|
||||
}).catch(error => {
|
||||
log.error(`Failed to automatically process OCR for file note ${entity.noteId}: ${error}`);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -18,8 +18,7 @@ export async function initializeTranslations() {
|
||||
ns: "server",
|
||||
backend: {
|
||||
loadPath: join(resourceDir, "assets/translations/{{lng}}/{{ns}}.json")
|
||||
},
|
||||
showSupportNotice: false
|
||||
}
|
||||
});
|
||||
|
||||
// Initialize dayjs locale.
|
||||
|
||||
@@ -12,8 +12,9 @@ import sanitizeFilename from "sanitize-filename";
|
||||
import isSvg from "is-svg";
|
||||
import isAnimated from "is-animated";
|
||||
import htmlSanitizer from "./html_sanitizer.js";
|
||||
import ocrService, { type OCRResult } from "./ocr/ocr_service.js";
|
||||
|
||||
async function processImage(uploadBuffer: Buffer, originalName: string, shrinkImageSwitch: boolean) {
|
||||
async function processImage(uploadBuffer: Buffer, originalName: string, shrinkImageSwitch: boolean, noteId?: string) {
|
||||
const compressImages = optionService.getOptionBool("compressImages");
|
||||
const origImageFormat = await getImageType(uploadBuffer);
|
||||
|
||||
@@ -24,6 +25,42 @@ async function processImage(uploadBuffer: Buffer, originalName: string, shrinkIm
|
||||
shrinkImageSwitch = false;
|
||||
}
|
||||
|
||||
// Schedule OCR processing in the background for best quality
|
||||
// Only auto-process if both OCR is enabled and auto-processing is enabled
|
||||
if (noteId && ocrService.isOCREnabled() && optionService.getOptionBool("ocrAutoProcessImages") && origImageFormat) {
|
||||
const imageMime = getImageMimeFromExtension(origImageFormat.ext);
|
||||
const supportedMimeTypes = ocrService.getAllSupportedMimeTypes();
|
||||
|
||||
if (supportedMimeTypes.includes(imageMime)) {
|
||||
// Process OCR asynchronously without blocking image creation
|
||||
setImmediate(async () => {
|
||||
try {
|
||||
const ocrResult = await ocrService.extractTextFromFile(uploadBuffer, imageMime);
|
||||
if (ocrResult) {
|
||||
// We need to get the entity again to get its blobId after it's been saved
|
||||
// noteId could be either a note ID or attachment ID
|
||||
const note = becca.getNote(noteId);
|
||||
const attachment = becca.getAttachment(noteId);
|
||||
|
||||
let blobId: string | undefined;
|
||||
if (note && note.blobId) {
|
||||
blobId = note.blobId;
|
||||
} else if (attachment && attachment.blobId) {
|
||||
blobId = attachment.blobId;
|
||||
}
|
||||
|
||||
if (blobId) {
|
||||
await ocrService.storeOCRResult(blobId, ocrResult);
|
||||
log.info(`Successfully processed OCR for image ${noteId} (${originalName})`);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for image ${noteId}: ${error}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let finalImageBuffer;
|
||||
let imageFormat;
|
||||
|
||||
@@ -72,7 +109,7 @@ function updateImage(noteId: string, uploadBuffer: Buffer, originalName: string)
|
||||
note.setLabel("originalFileName", originalName);
|
||||
|
||||
// resizing images asynchronously since JIMP does not support sync operation
|
||||
processImage(uploadBuffer, originalName, true).then(({ buffer, imageFormat }) => {
|
||||
processImage(uploadBuffer, originalName, true, noteId).then(({ buffer, imageFormat }) => {
|
||||
sql.transactional(() => {
|
||||
note.mime = getImageMimeFromExtension(imageFormat.ext);
|
||||
note.save();
|
||||
@@ -108,7 +145,7 @@ function saveImage(parentNoteId: string, uploadBuffer: Buffer, originalName: str
|
||||
note.addLabel("originalFileName", originalName);
|
||||
|
||||
// resizing images asynchronously since JIMP does not support sync operation
|
||||
processImage(uploadBuffer, originalName, shrinkImageSwitch).then(({ buffer, imageFormat }) => {
|
||||
processImage(uploadBuffer, originalName, shrinkImageSwitch, note.noteId).then(({ buffer, imageFormat }) => {
|
||||
sql.transactional(() => {
|
||||
note.mime = getImageMimeFromExtension(imageFormat.ext);
|
||||
|
||||
@@ -159,7 +196,7 @@ function saveImageToAttachment(noteId: string, uploadBuffer: Buffer, originalNam
|
||||
}, 5000);
|
||||
|
||||
// resizing images asynchronously since JIMP does not support sync operation
|
||||
processImage(uploadBuffer, originalName, !!shrinkImageSwitch).then(({ buffer, imageFormat }) => {
|
||||
processImage(uploadBuffer, originalName, !!shrinkImageSwitch, attachment.attachmentId).then(({ buffer, imageFormat }) => {
|
||||
sql.transactional(() => {
|
||||
// re-read, might be changed in the meantime
|
||||
if (!attachment.attachmentId) {
|
||||
|
||||
823
apps/server/src/services/ocr/ocr_service.spec.ts
Normal file
823
apps/server/src/services/ocr/ocr_service.spec.ts
Normal file
@@ -0,0 +1,823 @@
|
||||
import { afterEach,beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
// Mock Tesseract.js
|
||||
const mockWorker = {
|
||||
recognize: vi.fn(),
|
||||
terminate: vi.fn(),
|
||||
reinitialize: vi.fn()
|
||||
};
|
||||
|
||||
const mockTesseract = {
|
||||
createWorker: vi.fn().mockResolvedValue(mockWorker)
|
||||
};
|
||||
|
||||
vi.mock('tesseract.js', () => ({
|
||||
default: mockTesseract
|
||||
}));
|
||||
|
||||
// Mock dependencies
|
||||
const mockOptions = {
|
||||
getOptionBool: vi.fn(),
|
||||
getOption: vi.fn()
|
||||
};
|
||||
|
||||
const mockLog = {
|
||||
info: vi.fn(),
|
||||
error: vi.fn()
|
||||
};
|
||||
|
||||
const mockSql = {
|
||||
execute: vi.fn(),
|
||||
getRow: vi.fn(),
|
||||
getRows: vi.fn()
|
||||
};
|
||||
|
||||
const mockBecca = {
|
||||
getNote: vi.fn(),
|
||||
getAttachment: vi.fn()
|
||||
};
|
||||
|
||||
vi.mock('../options.js', () => ({
|
||||
default: mockOptions
|
||||
}));
|
||||
|
||||
vi.mock('../log.js', () => ({
|
||||
default: mockLog
|
||||
}));
|
||||
|
||||
vi.mock('../sql.js', () => ({
|
||||
default: mockSql
|
||||
}));
|
||||
|
||||
vi.mock('../../becca/becca.js', () => ({
|
||||
default: mockBecca
|
||||
}));
|
||||
|
||||
// Import the service after mocking
|
||||
let ocrService: typeof import('./ocr_service.js').default;
|
||||
|
||||
beforeEach(async () => {
|
||||
// Clear all mocks
|
||||
vi.clearAllMocks();
|
||||
|
||||
// Reset mock implementations
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
mockOptions.getOption.mockReturnValue('eng');
|
||||
mockSql.execute.mockImplementation(() => ({ lastInsertRowid: 1 }));
|
||||
mockSql.getRow.mockReturnValue(null);
|
||||
mockSql.getRows.mockReturnValue([]);
|
||||
|
||||
// Set up createWorker to properly set the worker on the service
|
||||
mockTesseract.createWorker.mockImplementation(async () => {
|
||||
return mockWorker;
|
||||
});
|
||||
|
||||
// Dynamically import the service to ensure mocks are applied
|
||||
const module = await import('./ocr_service.js');
|
||||
ocrService = module.default; // It's an instance, not a class
|
||||
|
||||
// Reset the OCR service state
|
||||
(ocrService as any).isInitialized = false;
|
||||
(ocrService as any).worker = null;
|
||||
(ocrService as any).isProcessing = false;
|
||||
(ocrService as any).batchProcessingState = {
|
||||
inProgress: false,
|
||||
total: 0,
|
||||
processed: 0
|
||||
};
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
describe('OCRService', () => {
|
||||
describe('isOCREnabled', () => {
|
||||
it('should return true when OCR is enabled in options', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
|
||||
expect(ocrService.isOCREnabled()).toBe(true);
|
||||
expect(mockOptions.getOptionBool).toHaveBeenCalledWith('ocrEnabled');
|
||||
});
|
||||
|
||||
it('should return false when OCR is disabled in options', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(false);
|
||||
|
||||
expect(ocrService.isOCREnabled()).toBe(false);
|
||||
expect(mockOptions.getOptionBool).toHaveBeenCalledWith('ocrEnabled');
|
||||
});
|
||||
|
||||
it('should return false when options throws an error', () => {
|
||||
mockOptions.getOptionBool.mockImplementation(() => {
|
||||
throw new Error('Options not available');
|
||||
});
|
||||
|
||||
expect(ocrService.isOCREnabled()).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('isSupportedMimeType', () => {
|
||||
it('should return true for supported image MIME types', () => {
|
||||
expect(ocrService.isSupportedMimeType('image/jpeg')).toBe(true);
|
||||
expect(ocrService.isSupportedMimeType('image/jpg')).toBe(true);
|
||||
expect(ocrService.isSupportedMimeType('image/png')).toBe(true);
|
||||
expect(ocrService.isSupportedMimeType('image/gif')).toBe(true);
|
||||
expect(ocrService.isSupportedMimeType('image/bmp')).toBe(true);
|
||||
expect(ocrService.isSupportedMimeType('image/tiff')).toBe(true);
|
||||
});
|
||||
|
||||
it('should return false for unsupported MIME types', () => {
|
||||
expect(ocrService.isSupportedMimeType('text/plain')).toBe(false);
|
||||
expect(ocrService.isSupportedMimeType('application/pdf')).toBe(false);
|
||||
expect(ocrService.isSupportedMimeType('video/mp4')).toBe(false);
|
||||
expect(ocrService.isSupportedMimeType('audio/mp3')).toBe(false);
|
||||
});
|
||||
|
||||
it('should handle null/undefined MIME types', () => {
|
||||
expect(ocrService.isSupportedMimeType(null as any)).toBe(false);
|
||||
expect(ocrService.isSupportedMimeType(undefined as any)).toBe(false);
|
||||
expect(ocrService.isSupportedMimeType('')).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('extractTextFromFile', () => {
|
||||
const mockImageBuffer = Buffer.from('fake-image-data');
|
||||
|
||||
it('should extract text successfully with default options', async () => {
|
||||
const mockResult = {
|
||||
data: {
|
||||
text: 'Extracted text from image',
|
||||
confidence: 95
|
||||
}
|
||||
};
|
||||
mockWorker.recognize.mockResolvedValue(mockResult);
|
||||
|
||||
const result = await ocrService.extractTextFromFile(mockImageBuffer, 'image/jpeg');
|
||||
|
||||
expect(result).toBeDefined();
|
||||
expect(result.text).toBe('Extracted text from image');
|
||||
expect(result.extractedAt).toEqual(expect.any(String));
|
||||
});
|
||||
|
||||
it('should handle OCR recognition errors', async () => {
|
||||
const error = new Error('OCR recognition failed');
|
||||
mockWorker.recognize.mockRejectedValue(error);
|
||||
|
||||
await expect(ocrService.extractTextFromFile(mockImageBuffer, 'image/jpeg')).rejects.toThrow('OCR recognition failed');
|
||||
expect(mockLog.error).toHaveBeenCalledWith('OCR text extraction failed: Error: OCR recognition failed');
|
||||
});
|
||||
});
|
||||
|
||||
describe('storeOCRResult', () => {
|
||||
it('should store OCR result in blob successfully', async () => {
|
||||
const ocrResult = {
|
||||
text: 'Sample text',
|
||||
confidence: 0.95,
|
||||
extractedAt: '2025-06-10T10:00:00.000Z',
|
||||
language: 'eng'
|
||||
};
|
||||
|
||||
await ocrService.storeOCRResult('blob123', ocrResult);
|
||||
|
||||
expect(mockSql.execute).toHaveBeenCalledWith(
|
||||
expect.stringContaining('UPDATE blobs SET textRepresentation = ?'),
|
||||
['Sample text', 'blob123']
|
||||
);
|
||||
});
|
||||
|
||||
it('should handle undefined blobId gracefully', async () => {
|
||||
const ocrResult = {
|
||||
text: 'Sample text',
|
||||
confidence: 0.95,
|
||||
extractedAt: '2025-06-10T10:00:00.000Z',
|
||||
language: 'eng'
|
||||
};
|
||||
|
||||
await ocrService.storeOCRResult(undefined, ocrResult);
|
||||
|
||||
expect(mockSql.execute).not.toHaveBeenCalled();
|
||||
expect(mockLog.error).toHaveBeenCalledWith('Cannot store OCR result: blobId is undefined');
|
||||
});
|
||||
|
||||
it('should handle database update errors', async () => {
|
||||
const error = new Error('Database error');
|
||||
mockSql.execute.mockImplementation(() => {
|
||||
throw error;
|
||||
});
|
||||
|
||||
const ocrResult = {
|
||||
text: 'Sample text',
|
||||
confidence: 0.95,
|
||||
extractedAt: '2025-06-10T10:00:00.000Z',
|
||||
language: 'eng'
|
||||
};
|
||||
|
||||
await expect(ocrService.storeOCRResult('blob123', ocrResult)).rejects.toThrow('Database error');
|
||||
expect(mockLog.error).toHaveBeenCalledWith('Failed to store OCR result for blob blob123: Error: Database error');
|
||||
});
|
||||
});
|
||||
|
||||
describe('processNoteOCR', () => {
|
||||
const mockNote = {
|
||||
noteId: 'note123',
|
||||
type: 'image',
|
||||
mime: 'image/jpeg',
|
||||
blobId: 'blob123',
|
||||
getContent: vi.fn()
|
||||
};
|
||||
|
||||
beforeEach(() => {
|
||||
mockBecca.getNote.mockReturnValue(mockNote);
|
||||
mockNote.getContent.mockReturnValue(Buffer.from('fake-image-data'));
|
||||
});
|
||||
|
||||
it('should process note OCR successfully', async () => {
|
||||
// Ensure getRow returns null for all calls in this test
|
||||
mockSql.getRow.mockImplementation(() => null);
|
||||
|
||||
const mockOCRResult = {
|
||||
data: {
|
||||
text: 'Note image text',
|
||||
confidence: 90
|
||||
}
|
||||
};
|
||||
mockWorker.recognize.mockResolvedValue(mockOCRResult);
|
||||
|
||||
const result = await ocrService.processNoteOCR('note123');
|
||||
|
||||
expect(result).toEqual({
|
||||
text: 'Note image text',
|
||||
confidence: 0.9,
|
||||
extractedAt: expect.any(String),
|
||||
language: 'eng'
|
||||
});
|
||||
expect(mockBecca.getNote).toHaveBeenCalledWith('note123');
|
||||
expect(mockNote.getContent).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should return existing OCR result if forceReprocess is false', async () => {
|
||||
const existingResult = {
|
||||
textRepresentation: 'Existing text'
|
||||
};
|
||||
mockSql.getRow.mockReturnValue(existingResult);
|
||||
|
||||
const result = await ocrService.processNoteOCR('note123');
|
||||
|
||||
expect(result).toEqual({
|
||||
text: 'Existing text',
|
||||
confidence: 0.95,
|
||||
language: 'eng',
|
||||
extractedAt: expect.any(String)
|
||||
});
|
||||
expect(mockNote.getContent).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should reprocess if forceReprocess is true', async () => {
|
||||
const existingResult = {
|
||||
textRepresentation: 'Existing text'
|
||||
};
|
||||
mockSql.getRow.mockResolvedValue(existingResult);
|
||||
|
||||
|
||||
const mockOCRResult = {
|
||||
data: {
|
||||
text: 'New processed text',
|
||||
confidence: 95
|
||||
}
|
||||
};
|
||||
mockWorker.recognize.mockResolvedValue(mockOCRResult);
|
||||
|
||||
const result = await ocrService.processNoteOCR('note123', { forceReprocess: true });
|
||||
|
||||
expect(result?.text).toBe('New processed text');
|
||||
expect(mockNote.getContent).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should return null for non-existent note', async () => {
|
||||
mockBecca.getNote.mockReturnValue(null);
|
||||
|
||||
const result = await ocrService.processNoteOCR('nonexistent');
|
||||
|
||||
expect(result).toBe(null);
|
||||
expect(mockLog.error).toHaveBeenCalledWith('Note nonexistent not found');
|
||||
});
|
||||
|
||||
it('should return null for unsupported MIME type', async () => {
|
||||
mockNote.mime = 'text/plain';
|
||||
|
||||
const result = await ocrService.processNoteOCR('note123');
|
||||
|
||||
expect(result).toBe(null);
|
||||
expect(mockLog.info).toHaveBeenCalledWith('Note note123 has unsupported MIME type text/plain, skipping OCR');
|
||||
});
|
||||
});
|
||||
|
||||
describe('processAttachmentOCR', () => {
|
||||
const mockAttachment = {
|
||||
attachmentId: 'attach123',
|
||||
role: 'image',
|
||||
mime: 'image/png',
|
||||
blobId: 'blob456',
|
||||
getContent: vi.fn()
|
||||
};
|
||||
|
||||
beforeEach(() => {
|
||||
mockBecca.getAttachment.mockReturnValue(mockAttachment);
|
||||
mockAttachment.getContent.mockReturnValue(Buffer.from('fake-image-data'));
|
||||
});
|
||||
|
||||
it('should process attachment OCR successfully', async () => {
|
||||
// Ensure getRow returns null for all calls in this test
|
||||
mockSql.getRow.mockImplementation(() => null);
|
||||
|
||||
|
||||
const mockOCRResult = {
|
||||
data: {
|
||||
text: 'Attachment image text',
|
||||
confidence: 92
|
||||
}
|
||||
};
|
||||
mockWorker.recognize.mockResolvedValue(mockOCRResult);
|
||||
|
||||
const result = await ocrService.processAttachmentOCR('attach123');
|
||||
|
||||
expect(result).toEqual({
|
||||
text: 'Attachment image text',
|
||||
confidence: 0.92,
|
||||
extractedAt: expect.any(String),
|
||||
language: 'eng'
|
||||
});
|
||||
expect(mockBecca.getAttachment).toHaveBeenCalledWith('attach123');
|
||||
});
|
||||
|
||||
it('should return null for non-existent attachment', async () => {
|
||||
mockBecca.getAttachment.mockReturnValue(null);
|
||||
|
||||
const result = await ocrService.processAttachmentOCR('nonexistent');
|
||||
|
||||
expect(result).toBe(null);
|
||||
expect(mockLog.error).toHaveBeenCalledWith('Attachment nonexistent not found');
|
||||
});
|
||||
});
|
||||
|
||||
describe('searchOCRResults', () => {
|
||||
it('should search OCR results successfully', () => {
|
||||
const mockResults = [
|
||||
{
|
||||
blobId: 'blob1',
|
||||
textRepresentation: 'Sample search text'
|
||||
}
|
||||
];
|
||||
mockSql.getRows.mockReturnValue(mockResults);
|
||||
|
||||
const results = ocrService.searchOCRResults('search');
|
||||
|
||||
expect(results).toEqual([{
|
||||
blobId: 'blob1',
|
||||
text: 'Sample search text'
|
||||
}]);
|
||||
expect(mockSql.getRows).toHaveBeenCalledWith(
|
||||
expect.stringContaining('WHERE textRepresentation LIKE ?'),
|
||||
['%search%']
|
||||
);
|
||||
});
|
||||
|
||||
it('should handle search errors gracefully', () => {
|
||||
mockSql.getRows.mockImplementation(() => {
|
||||
throw new Error('Database error');
|
||||
});
|
||||
|
||||
const results = ocrService.searchOCRResults('search');
|
||||
|
||||
expect(results).toEqual([]);
|
||||
expect(mockLog.error).toHaveBeenCalledWith('Failed to search OCR results: Error: Database error');
|
||||
});
|
||||
});
|
||||
|
||||
describe('getOCRStats', () => {
|
||||
it('should return OCR statistics successfully', () => {
|
||||
const mockStats = {
|
||||
total_processed: 150
|
||||
};
|
||||
const mockNoteStats = {
|
||||
count: 100
|
||||
};
|
||||
const mockAttachmentStats = {
|
||||
count: 50
|
||||
};
|
||||
|
||||
mockSql.getRow.mockReturnValueOnce(mockStats);
|
||||
mockSql.getRow.mockReturnValueOnce(mockNoteStats);
|
||||
mockSql.getRow.mockReturnValueOnce(mockAttachmentStats);
|
||||
|
||||
const stats = ocrService.getOCRStats();
|
||||
|
||||
expect(stats).toEqual({
|
||||
totalProcessed: 150,
|
||||
imageNotes: 100,
|
||||
imageAttachments: 50
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle missing statistics gracefully', () => {
|
||||
mockSql.getRow.mockReturnValue(null);
|
||||
|
||||
const stats = ocrService.getOCRStats();
|
||||
|
||||
expect(stats).toEqual({
|
||||
totalProcessed: 0,
|
||||
imageNotes: 0,
|
||||
imageAttachments: 0
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('Batch Processing', () => {
|
||||
describe('startBatchProcessing', () => {
|
||||
beforeEach(() => {
|
||||
// Reset batch processing state
|
||||
ocrService.cancelBatchProcessing();
|
||||
});
|
||||
|
||||
it('should start batch processing when images are available', async () => {
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 5 }); // image notes
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 3 }); // image attachments
|
||||
|
||||
const result = await ocrService.startBatchProcessing();
|
||||
|
||||
expect(result).toEqual({ success: true });
|
||||
expect(mockSql.getRow).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it('should return error if batch processing already in progress', async () => {
|
||||
// Start first batch
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 5 });
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 3 });
|
||||
|
||||
// Mock background processing queries
|
||||
const mockImageNotes = Array.from({length: 5}, (_, i) => ({
|
||||
noteId: `note${i}`,
|
||||
mime: 'image/jpeg'
|
||||
}));
|
||||
mockSql.getRows.mockReturnValueOnce(mockImageNotes);
|
||||
mockSql.getRows.mockReturnValueOnce([]);
|
||||
|
||||
// Start without awaiting to keep it in progress
|
||||
const firstStart = ocrService.startBatchProcessing();
|
||||
|
||||
// Try to start second batch immediately
|
||||
const result = await ocrService.startBatchProcessing();
|
||||
|
||||
// Clean up by awaiting the first one
|
||||
await firstStart;
|
||||
|
||||
expect(result).toEqual({
|
||||
success: false,
|
||||
message: 'Batch processing already in progress'
|
||||
});
|
||||
});
|
||||
|
||||
it('should return error if OCR is disabled', async () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(false);
|
||||
|
||||
const result = await ocrService.startBatchProcessing();
|
||||
|
||||
expect(result).toEqual({
|
||||
success: false,
|
||||
message: 'OCR is disabled'
|
||||
});
|
||||
});
|
||||
|
||||
it('should return error if no images need processing', async () => {
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 0 }); // image notes
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 0 }); // image attachments
|
||||
|
||||
const result = await ocrService.startBatchProcessing();
|
||||
|
||||
expect(result).toEqual({
|
||||
success: false,
|
||||
message: 'No images found that need OCR processing'
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle database errors gracefully', async () => {
|
||||
const error = new Error('Database connection failed');
|
||||
mockSql.getRow.mockImplementation(() => {
|
||||
throw error;
|
||||
});
|
||||
|
||||
const result = await ocrService.startBatchProcessing();
|
||||
|
||||
expect(result).toEqual({
|
||||
success: false,
|
||||
message: 'Database connection failed'
|
||||
});
|
||||
expect(mockLog.error).toHaveBeenCalledWith(
|
||||
'Failed to start batch processing: Database connection failed'
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getBatchProgress', () => {
|
||||
it('should return initial progress state', () => {
|
||||
const progress = ocrService.getBatchProgress();
|
||||
|
||||
expect(progress.inProgress).toBe(false);
|
||||
expect(progress.total).toBe(0);
|
||||
expect(progress.processed).toBe(0);
|
||||
});
|
||||
|
||||
it('should return progress with percentage when total > 0', async () => {
|
||||
// Start batch processing
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 10 });
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 0 });
|
||||
|
||||
// Mock the background processing queries to return items that will take time to process
|
||||
const mockImageNotes = Array.from({length: 10}, (_, i) => ({
|
||||
noteId: `note${i}`,
|
||||
mime: 'image/jpeg'
|
||||
}));
|
||||
mockSql.getRows.mockReturnValueOnce(mockImageNotes); // image notes query
|
||||
mockSql.getRows.mockReturnValueOnce([]); // image attachments query
|
||||
|
||||
const startPromise = ocrService.startBatchProcessing();
|
||||
|
||||
// Check progress immediately after starting (before awaiting)
|
||||
const progress = ocrService.getBatchProgress();
|
||||
|
||||
await startPromise;
|
||||
|
||||
expect(progress.inProgress).toBe(true);
|
||||
expect(progress.total).toBe(10);
|
||||
expect(progress.processed).toBe(0);
|
||||
expect(progress.percentage).toBe(0);
|
||||
expect(progress.startTime).toBeInstanceOf(Date);
|
||||
});
|
||||
});
|
||||
|
||||
describe('cancelBatchProcessing', () => {
|
||||
it('should cancel ongoing batch processing', async () => {
|
||||
// Start batch processing
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 5 });
|
||||
mockSql.getRow.mockReturnValueOnce({ count: 0 });
|
||||
|
||||
// Mock background processing queries
|
||||
const mockImageNotes = Array.from({length: 5}, (_, i) => ({
|
||||
noteId: `note${i}`,
|
||||
mime: 'image/jpeg'
|
||||
}));
|
||||
mockSql.getRows.mockReturnValueOnce(mockImageNotes);
|
||||
mockSql.getRows.mockReturnValueOnce([]);
|
||||
|
||||
const startPromise = ocrService.startBatchProcessing();
|
||||
|
||||
expect(ocrService.getBatchProgress().inProgress).toBe(true);
|
||||
|
||||
await startPromise;
|
||||
|
||||
ocrService.cancelBatchProcessing();
|
||||
|
||||
expect(ocrService.getBatchProgress().inProgress).toBe(false);
|
||||
expect(mockLog.info).toHaveBeenCalledWith('Batch OCR processing cancelled');
|
||||
});
|
||||
|
||||
it('should do nothing if no batch processing is running', () => {
|
||||
ocrService.cancelBatchProcessing();
|
||||
|
||||
expect(mockLog.info).not.toHaveBeenCalledWith('Batch OCR processing cancelled');
|
||||
});
|
||||
});
|
||||
|
||||
// Tests for the detached background loop started by startBatchProcessing().
// NOTE(review): these tests synchronize with the background loop via real
// setTimeout sleeps (up to 2000 ms below), which makes the suite slow and
// potentially flaky — consider vitest fake timers. Left as-is here because
// the mockReturnValueOnce queues are consumed in a fixed order and any
// restructuring risks desynchronizing them.
describe('processBatchInBackground', () => {
    it('should process image notes and attachments in sequence', async () => {
        // Clear all mocks at the start of this test to ensure clean state
        vi.clearAllMocks();

        // Mock data for batch processing
        const imageNotes = [
            { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
            { noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
        ];
        const imageAttachments = [
            { attachmentId: 'attach1', mime: 'image/gif', blobId: 'blob3' }
        ];

        // Setup mocks for startBatchProcessing
        mockSql.getRow.mockReturnValueOnce({ count: 2 }); // image notes count
        mockSql.getRow.mockReturnValueOnce({ count: 1 }); // image attachments count

        // Setup mocks for background processing
        mockSql.getRows.mockReturnValueOnce(imageNotes); // image notes query
        mockSql.getRows.mockReturnValueOnce(imageAttachments); // image attachments query

        // Mock successful OCR processing
        mockWorker.recognize.mockResolvedValue({
            data: { text: 'Test text', confidence: 95 }
        });

        // Mock notes and attachments
        const mockNote1 = {
            noteId: 'note1',
            type: 'image',
            mime: 'image/jpeg',
            blobId: 'blob1',
            getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
        };
        const mockNote2 = {
            noteId: 'note2',
            type: 'image',
            mime: 'image/png',
            blobId: 'blob2',
            getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
        };
        const mockAttachment = {
            attachmentId: 'attach1',
            role: 'image',
            mime: 'image/gif',
            blobId: 'blob3',
            getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
        };

        mockBecca.getNote.mockImplementation((noteId) => {
            if (noteId === 'note1') return mockNote1;
            if (noteId === 'note2') return mockNote2;
            return null;
        });
        mockBecca.getAttachment.mockReturnValue(mockAttachment);
        mockSql.getRow.mockReturnValue(null); // No existing OCR results

        // Start batch processing
        await ocrService.startBatchProcessing();

        // Wait for background processing to complete
        // Need to wait longer since there's a 500ms delay between each item in batch processing
        await new Promise(resolve => setTimeout(resolve, 2000));

        // Verify notes and attachments were processed
        expect(mockBecca.getNote).toHaveBeenCalledWith('note1');
        expect(mockBecca.getNote).toHaveBeenCalledWith('note2');
        expect(mockBecca.getAttachment).toHaveBeenCalledWith('attach1');
    });

    it('should handle processing errors gracefully', async () => {
        const imageNotes = [
            { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' }
        ];

        // Setup mocks for startBatchProcessing
        mockSql.getRow.mockReturnValueOnce({ count: 1 });
        mockSql.getRow.mockReturnValueOnce({ count: 0 });

        // Setup mocks for background processing
        mockSql.getRows.mockReturnValueOnce(imageNotes);
        mockSql.getRows.mockReturnValueOnce([]);

        // Mock note that will cause an error
        // getContent throws, so processNoteOCR should log and continue.
        const mockNote = {
            noteId: 'note1',
            type: 'image',
            mime: 'image/jpeg',
            blobId: 'blob1',
            getContent: vi.fn().mockImplementation(() => { throw new Error('Failed to get content'); })
        };
        mockBecca.getNote.mockReturnValue(mockNote);
        mockSql.getRow.mockReturnValue(null);

        // Start batch processing
        await ocrService.startBatchProcessing();

        // Wait for background processing to complete
        await new Promise(resolve => setTimeout(resolve, 100));

        // Verify error was logged but processing continued
        expect(mockLog.error).toHaveBeenCalledWith(
            expect.stringContaining('Failed to process OCR for note note1')
        );
    });

    it('should stop processing when cancelled', async () => {
        const imageNotes = [
            { noteId: 'note1', mime: 'image/jpeg', blobId: 'blob1' },
            { noteId: 'note2', mime: 'image/png', blobId: 'blob2' }
        ];

        // Setup mocks
        mockSql.getRow.mockReturnValueOnce({ count: 2 });
        mockSql.getRow.mockReturnValueOnce({ count: 0 });
        mockSql.getRows.mockReturnValueOnce(imageNotes);
        mockSql.getRows.mockReturnValueOnce([]);

        // Start batch processing
        await ocrService.startBatchProcessing();

        // Cancel immediately
        // NOTE(review): the background loop is already detached at this point;
        // items processed before the cancel flag is observed are not rolled back.
        ocrService.cancelBatchProcessing();

        // Wait for background processing to complete
        await new Promise(resolve => setTimeout(resolve, 100));

        // Verify processing was stopped early
        expect(ocrService.getBatchProgress().inProgress).toBe(false);
    });

    it('should skip unsupported MIME types', async () => {
        const imageNotes = [
            { noteId: 'note1', mime: 'text/plain', blobId: 'blob1' }, // unsupported
            { noteId: 'note2', mime: 'image/jpeg', blobId: 'blob2' } // supported
        ];

        // Setup mocks
        mockSql.getRow.mockReturnValueOnce({ count: 2 });
        mockSql.getRow.mockReturnValueOnce({ count: 0 });
        mockSql.getRows.mockReturnValueOnce(imageNotes);
        mockSql.getRows.mockReturnValueOnce([]);

        const mockNote = {
            noteId: 'note2',
            type: 'image',
            mime: 'image/jpeg',
            blobId: 'blob2',
            getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
        };
        mockBecca.getNote.mockReturnValue(mockNote);
        mockSql.getRow.mockReturnValue(null);
        mockWorker.recognize.mockResolvedValue({
            data: { text: 'Test text', confidence: 95 }
        });

        // Start batch processing
        await ocrService.startBatchProcessing();

        // Wait for background processing to complete
        await new Promise(resolve => setTimeout(resolve, 100));

        // Verify only supported MIME type was processed
        expect(mockBecca.getNote).toHaveBeenCalledWith('note2');
        expect(mockBecca.getNote).not.toHaveBeenCalledWith('note1');
    });
});
|
||||
});
|
||||
|
||||
describe('deleteOCRResult', () => {
|
||||
it('should delete OCR result successfully', () => {
|
||||
ocrService.deleteOCRResult('blob123');
|
||||
|
||||
expect(mockSql.execute).toHaveBeenCalledWith(
|
||||
expect.stringContaining('UPDATE blobs SET textRepresentation = NULL'),
|
||||
['blob123']
|
||||
);
|
||||
expect(mockLog.info).toHaveBeenCalledWith('Deleted OCR result for blob blob123');
|
||||
});
|
||||
|
||||
it('should handle deletion errors', () => {
|
||||
mockSql.execute.mockImplementation(() => {
|
||||
throw new Error('Database error');
|
||||
});
|
||||
|
||||
expect(() => ocrService.deleteOCRResult('blob123')).toThrow('Database error');
|
||||
expect(mockLog.error).toHaveBeenCalledWith('Failed to delete OCR result for blob blob123: Error: Database error');
|
||||
});
|
||||
});
|
||||
|
||||
// Tests for the isProcessing flag exposed via isCurrentlyProcessing().
describe('isCurrentlyProcessing', () => {
    it('should return false initially', () => {
        expect(ocrService.isCurrentlyProcessing()).toBe(false);
    });

    it('should return true during processing', async () => {
        // NOTE(review): this mock note has no `type` property, yet
        // processNoteOCR() branches on note.type === 'image' / 'file' and
        // skips OCR otherwise — so the mid-flight assertion inside
        // recognize() below may never execute. Verify the mock matches the
        // service's expectations.
        mockBecca.getNote.mockReturnValue({
            noteId: 'note123',
            mime: 'image/jpeg',
            blobId: 'blob123',
            getContent: vi.fn().mockReturnValue(Buffer.from('fake-image-data'))
        });
        // NOTE(review): sql.getRow is synchronous in the service;
        // mockResolvedValue returns a Promise — presumably harmless here
        // (a Promise has no textRepresentation), but mockReturnValue(null)
        // would be more accurate.
        mockSql.getRow.mockResolvedValue(null);

        // Assert the flag is raised while recognition is in flight.
        mockWorker.recognize.mockImplementation(() => {
            expect(ocrService.isCurrentlyProcessing()).toBe(true);
            return Promise.resolve({
                data: { text: 'test', confidence: 90 }
            });
        });

        await ocrService.processNoteOCR('note123');
        // Flag must be cleared again once processing finishes.
        expect(ocrService.isCurrentlyProcessing()).toBe(false);
    });
});
|
||||
|
||||
// Tests for OCRService.cleanup(). NOTE(review): both tests call cleanup()
// with no visible setup difference — presumably a beforeEach outside this
// chunk initializes the worker for the first case and not the second;
// confirm, otherwise one of these expectations cannot hold.
describe('cleanup', () => {
    it('should terminate worker on cleanup', async () => {

        await ocrService.cleanup();

        // An initialized worker must be terminated and the cleanup logged.
        expect(mockWorker.terminate).toHaveBeenCalled();
        expect(mockLog.info).toHaveBeenCalledWith('OCR service cleaned up');
    });

    it('should handle cleanup when worker is not initialized', async () => {
        await ocrService.cleanup();

        // With no worker present, terminate is skipped but cleanup still logs.
        expect(mockWorker.terminate).not.toHaveBeenCalled();
        expect(mockLog.info).toHaveBeenCalledWith('OCR service cleaned up');
    });
});
|
||||
});
|
||||
752
apps/server/src/services/ocr/ocr_service.ts
Normal file
752
apps/server/src/services/ocr/ocr_service.ts
Normal file
@@ -0,0 +1,752 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import log from '../log.js';
|
||||
import sql from '../sql.js';
|
||||
import becca from '../../becca/becca.js';
|
||||
import options from '../options.js';
|
||||
import { ImageProcessor } from './processors/image_processor.js';
|
||||
import { PDFProcessor } from './processors/pdf_processor.js';
|
||||
import { TIFFProcessor } from './processors/tiff_processor.js';
|
||||
import { OfficeProcessor } from './processors/office_processor.js';
|
||||
import { FileProcessor } from './processors/file_processor.js';
|
||||
|
||||
/** Result of a single OCR / text-extraction run. */
export interface OCRResult {
    /** The extracted plain text. */
    text: string;
    /**
     * Recognition confidence reported by the processor.
     * NOTE(review): Tesseract reports 0-100, but getStoredOCRResult()
     * synthesizes 0.95 for cached results — the scale is inconsistent;
     * confirm the intended convention.
     */
    confidence: number;
    /** ISO-8601 timestamp of when the text was extracted. */
    extractedAt: string;
    /** Recognition language code (e.g. 'eng'), when known. */
    language?: string;
    /** Number of pages processed, for multi-page formats such as PDF. */
    pageCount?: number;
}

/** Options accepted by the OCR processing entry points. */
export interface OCRProcessingOptions {
    /** Recognition language to use (processor-dependent). */
    language?: string;
    /** Re-run OCR even when an up-to-date cached result exists. */
    forceReprocess?: boolean;
    /** Minimum confidence threshold (processor-dependent). */
    confidence?: number;
    /** Allow direct text-layer extraction from PDFs instead of rasterizing. */
    enablePDFTextExtraction?: boolean;
}

/** Row shape returned by OCR-related queries against the `blobs` table. */
interface OCRBlobRow {
    blobId: string;
    textRepresentation: string;
    textExtractionLastProcessed?: string;
}
|
||||
|
||||
/**
|
||||
* OCR Service for extracting text from images and other OCR-able objects
|
||||
* Uses Tesseract.js for text recognition
|
||||
*/
|
||||
class OCRService {
|
||||
private worker: Tesseract.Worker | null = null;
|
||||
private isProcessing = false;
|
||||
private processors: Map<string, FileProcessor> = new Map();
|
||||
|
||||
constructor() {
|
||||
// Initialize file processors
|
||||
this.processors.set('image', new ImageProcessor());
|
||||
this.processors.set('pdf', new PDFProcessor());
|
||||
this.processors.set('tiff', new TIFFProcessor());
|
||||
this.processors.set('office', new OfficeProcessor());
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if OCR is enabled in settings
|
||||
*/
|
||||
isOCREnabled(): boolean {
|
||||
try {
|
||||
return options.getOptionBool('ocrEnabled');
|
||||
} catch (error) {
|
||||
log.error(`Failed to check OCR enabled status: ${error}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a MIME type is supported for OCR
|
||||
*/
|
||||
isSupportedMimeType(mimeType: string): boolean {
|
||||
if (!mimeType || typeof mimeType !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
const supportedTypes = [
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
];
|
||||
return supportedTypes.includes(mimeType.toLowerCase());
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from file buffer using appropriate processor
|
||||
*/
|
||||
async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info(`Starting OCR text extraction for MIME type: ${mimeType}`);
|
||||
this.isProcessing = true;
|
||||
|
||||
// Find appropriate processor
|
||||
const processor = this.getProcessorForMimeType(mimeType);
|
||||
if (!processor) {
|
||||
throw new Error(`No processor found for MIME type: ${mimeType}`);
|
||||
}
|
||||
|
||||
const result = await processor.extractText(fileBuffer, options);
|
||||
|
||||
log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
log.error(`OCR text extraction failed: ${error}`);
|
||||
throw error;
|
||||
} finally {
|
||||
this.isProcessing = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process OCR for a note (image type)
|
||||
*/
|
||||
async processNoteOCR(noteId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
|
||||
if (!this.isOCREnabled()) {
|
||||
log.info('OCR is disabled in settings');
|
||||
return null;
|
||||
}
|
||||
|
||||
const note = becca.getNote(noteId);
|
||||
if (!note) {
|
||||
log.error(`Note ${noteId} not found`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if note type and MIME type are supported for OCR
|
||||
if (note.type === 'image') {
|
||||
if (!this.isSupportedMimeType(note.mime)) {
|
||||
log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
|
||||
return null;
|
||||
}
|
||||
} else if (note.type === 'file') {
|
||||
// Check if file MIME type is supported by any processor
|
||||
const processor = this.getProcessorForMimeType(note.mime);
|
||||
if (!processor) {
|
||||
log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if OCR already exists and is up-to-date
|
||||
const existingOCR = this.getStoredOCRResult(note.blobId);
|
||||
if (existingOCR && !options.forceReprocess && note.blobId && !this.needsReprocessing(note.blobId)) {
|
||||
log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
|
||||
return existingOCR;
|
||||
}
|
||||
|
||||
try {
|
||||
const content = note.getContent();
|
||||
if (!content || !(content instanceof Buffer)) {
|
||||
throw new Error(`Cannot get image content for note ${noteId}`);
|
||||
}
|
||||
|
||||
const ocrResult = await this.extractTextFromFile(content, note.mime, options);
|
||||
|
||||
// Store OCR result in blob
|
||||
await this.storeOCRResult(note.blobId, ocrResult);
|
||||
|
||||
return ocrResult;
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for note ${noteId}: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process OCR for an attachment
|
||||
*/
|
||||
async processAttachmentOCR(attachmentId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
|
||||
if (!this.isOCREnabled()) {
|
||||
log.info('OCR is disabled in settings');
|
||||
return null;
|
||||
}
|
||||
|
||||
const attachment = becca.getAttachment(attachmentId);
|
||||
if (!attachment) {
|
||||
log.error(`Attachment ${attachmentId} not found`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if attachment role and MIME type are supported for OCR
|
||||
if (attachment.role === 'image') {
|
||||
if (!this.isSupportedMimeType(attachment.mime)) {
|
||||
log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
|
||||
return null;
|
||||
}
|
||||
} else if (attachment.role === 'file') {
|
||||
// Check if file MIME type is supported by any processor
|
||||
const processor = this.getProcessorForMimeType(attachment.mime);
|
||||
if (!processor) {
|
||||
log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if OCR already exists and is up-to-date
|
||||
const existingOCR = this.getStoredOCRResult(attachment.blobId);
|
||||
if (existingOCR && !options.forceReprocess && attachment.blobId && !this.needsReprocessing(attachment.blobId)) {
|
||||
log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
|
||||
return existingOCR;
|
||||
}
|
||||
|
||||
try {
|
||||
const content = attachment.getContent();
|
||||
if (!content || !(content instanceof Buffer)) {
|
||||
throw new Error(`Cannot get image content for attachment ${attachmentId}`);
|
||||
}
|
||||
|
||||
const ocrResult = await this.extractTextFromFile(content, attachment.mime, options);
|
||||
|
||||
// Store OCR result in blob
|
||||
await this.storeOCRResult(attachment.blobId, ocrResult);
|
||||
|
||||
return ocrResult;
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Store OCR result in blob
|
||||
*/
|
||||
async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise<void> {
|
||||
if (!blobId) {
|
||||
log.error('Cannot store OCR result: blobId is undefined');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Store OCR text and timestamp in blobs table
|
||||
sql.execute(`
|
||||
UPDATE blobs SET
|
||||
textRepresentation = ?,
|
||||
textExtractionLastProcessed = ?
|
||||
WHERE blobId = ?
|
||||
`, [
|
||||
ocrResult.text,
|
||||
new Date().toISOString(),
|
||||
blobId
|
||||
]);
|
||||
|
||||
log.info(`Stored OCR result for blob ${blobId}`);
|
||||
} catch (error) {
|
||||
log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get stored OCR result from blob
|
||||
*/
|
||||
private getStoredOCRResult(blobId: string | undefined): OCRResult | null {
|
||||
if (!blobId) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
const row = sql.getRow<{
|
||||
textRepresentation: string | null;
|
||||
}>(`
|
||||
SELECT textRepresentation
|
||||
FROM blobs
|
||||
WHERE blobId = ?
|
||||
`, [blobId]);
|
||||
|
||||
if (!row || !row.textRepresentation) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Return basic OCR result from stored text
|
||||
// Note: we lose confidence, language, and extractedAt metadata
|
||||
// but gain simplicity by storing directly in blob
|
||||
return {
|
||||
text: row.textRepresentation,
|
||||
confidence: 0.95, // Default high confidence for existing OCR
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: 'eng'
|
||||
};
|
||||
} catch (error) {
|
||||
log.error(`Failed to get OCR result for blob ${blobId}: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search for text in OCR results
|
||||
*/
|
||||
searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> {
|
||||
try {
|
||||
const query = `
|
||||
SELECT blobId, textRepresentation
|
||||
FROM blobs
|
||||
WHERE textRepresentation LIKE ?
|
||||
AND textRepresentation IS NOT NULL
|
||||
`;
|
||||
const params = [`%${searchText}%`];
|
||||
|
||||
const rows = sql.getRows<OCRBlobRow>(query, params);
|
||||
|
||||
return rows.map(row => ({
|
||||
blobId: row.blobId,
|
||||
text: row.textRepresentation
|
||||
}));
|
||||
} catch (error) {
|
||||
log.error(`Failed to search OCR results: ${error}`);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete OCR results for a blob
|
||||
*/
|
||||
deleteOCRResult(blobId: string): void {
|
||||
try {
|
||||
sql.execute(`
|
||||
UPDATE blobs SET textRepresentation = NULL
|
||||
WHERE blobId = ?
|
||||
`, [blobId]);
|
||||
|
||||
log.info(`Deleted OCR result for blob ${blobId}`);
|
||||
} catch (error) {
|
||||
log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process OCR for all files that don't have OCR results yet or need reprocessing
|
||||
*/
|
||||
async processAllImages(): Promise<void> {
|
||||
return this.processAllBlobsNeedingOCR();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get OCR statistics
|
||||
*/
|
||||
getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } {
|
||||
try {
|
||||
const stats = sql.getRow<{
|
||||
total_processed: number;
|
||||
}>(`
|
||||
SELECT COUNT(*) as total_processed
|
||||
FROM blobs
|
||||
WHERE textRepresentation IS NOT NULL AND textRepresentation != ''
|
||||
`);
|
||||
|
||||
// Count image notes with OCR
|
||||
const noteStats = sql.getRow<{
|
||||
count: number;
|
||||
}>(`
|
||||
SELECT COUNT(*) as count
|
||||
FROM notes n
|
||||
JOIN blobs b ON n.blobId = b.blobId
|
||||
WHERE n.type = 'image'
|
||||
AND n.isDeleted = 0
|
||||
AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
|
||||
`);
|
||||
|
||||
// Count image attachments with OCR
|
||||
const attachmentStats = sql.getRow<{
|
||||
count: number;
|
||||
}>(`
|
||||
SELECT COUNT(*) as count
|
||||
FROM attachments a
|
||||
JOIN blobs b ON a.blobId = b.blobId
|
||||
WHERE a.role = 'image'
|
||||
AND a.isDeleted = 0
|
||||
AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
|
||||
`);
|
||||
|
||||
return {
|
||||
totalProcessed: stats?.total_processed || 0,
|
||||
imageNotes: noteStats?.count || 0,
|
||||
imageAttachments: attachmentStats?.count || 0
|
||||
};
|
||||
} catch (error) {
|
||||
log.error(`Failed to get OCR stats: ${error}`);
|
||||
return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up OCR service
|
||||
*/
|
||||
async cleanup(): Promise<void> {
|
||||
if (this.worker) {
|
||||
await this.worker.terminate();
|
||||
this.worker = null;
|
||||
}
|
||||
log.info('OCR service cleaned up');
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if currently processing
|
||||
*/
|
||||
isCurrentlyProcessing(): boolean {
|
||||
return this.isProcessing;
|
||||
}
|
||||
|
||||
// Batch processing state
|
||||
private batchProcessingState: {
|
||||
inProgress: boolean;
|
||||
total: number;
|
||||
processed: number;
|
||||
startTime?: Date;
|
||||
} = {
|
||||
inProgress: false,
|
||||
total: 0,
|
||||
processed: 0
|
||||
};
|
||||
|
||||
/**
|
||||
* Start batch OCR processing with progress tracking
|
||||
*/
|
||||
async startBatchProcessing(): Promise<{ success: boolean; message?: string }> {
|
||||
if (this.batchProcessingState.inProgress) {
|
||||
return { success: false, message: 'Batch processing already in progress' };
|
||||
}
|
||||
|
||||
if (!this.isOCREnabled()) {
|
||||
return { success: false, message: 'OCR is disabled' };
|
||||
}
|
||||
|
||||
try {
|
||||
// Count total blobs needing OCR processing
|
||||
const blobsNeedingOCR = this.getBlobsNeedingOCR();
|
||||
const totalCount = blobsNeedingOCR.length;
|
||||
|
||||
if (totalCount === 0) {
|
||||
return { success: false, message: 'No images found that need OCR processing' };
|
||||
}
|
||||
|
||||
// Initialize batch processing state
|
||||
this.batchProcessingState = {
|
||||
inProgress: true,
|
||||
total: totalCount,
|
||||
processed: 0,
|
||||
startTime: new Date()
|
||||
};
|
||||
|
||||
// Start processing in background
|
||||
this.processBatchInBackground(blobsNeedingOCR).catch(error => {
|
||||
log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
|
||||
this.batchProcessingState.inProgress = false;
|
||||
});
|
||||
|
||||
return { success: true };
|
||||
} catch (error) {
|
||||
log.error(`Failed to start batch processing: ${error instanceof Error ? error.message : String(error)}`);
|
||||
return { success: false, message: error instanceof Error ? error.message : String(error) };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get batch processing progress
|
||||
*/
|
||||
getBatchProgress(): { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } {
|
||||
const result: { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } = { ...this.batchProcessingState };
|
||||
if (result.total > 0) {
|
||||
result.percentage = (result.processed / result.total) * 100;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process batch OCR in background with progress tracking
|
||||
*/
|
||||
private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
|
||||
try {
|
||||
log.info('Starting batch OCR processing...');
|
||||
|
||||
for (const blobInfo of blobsToProcess) {
|
||||
if (!this.batchProcessingState.inProgress) {
|
||||
break; // Stop if processing was cancelled
|
||||
}
|
||||
|
||||
try {
|
||||
if (blobInfo.entityType === 'note') {
|
||||
await this.processNoteOCR(blobInfo.entityId);
|
||||
} else {
|
||||
await this.processAttachmentOCR(blobInfo.entityId);
|
||||
}
|
||||
this.batchProcessingState.processed++;
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
} catch (error) {
|
||||
log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
|
||||
this.batchProcessingState.processed++; // Count as processed even if failed
|
||||
}
|
||||
}
|
||||
|
||||
// Mark as completed
|
||||
this.batchProcessingState.inProgress = false;
|
||||
log.info(`Batch OCR processing completed. Processed ${this.batchProcessingState.processed} files.`);
|
||||
} catch (error) {
|
||||
log.error(`Batch OCR processing failed: ${error}`);
|
||||
this.batchProcessingState.inProgress = false;
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel batch processing
|
||||
*/
|
||||
cancelBatchProcessing(): void {
|
||||
if (this.batchProcessingState.inProgress) {
|
||||
this.batchProcessingState.inProgress = false;
|
||||
log.info('Batch OCR processing cancelled');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get processor for a given MIME type
|
||||
*/
|
||||
private getProcessorForMimeType(mimeType: string): FileProcessor | null {
|
||||
for (const processor of this.processors.values()) {
|
||||
if (processor.canProcess(mimeType)) {
|
||||
return processor;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all MIME types supported by all registered processors
|
||||
*/
|
||||
getAllSupportedMimeTypes(): string[] {
|
||||
const supportedTypes = new Set<string>();
|
||||
|
||||
// Gather MIME types from all registered processors
|
||||
for (const processor of this.processors.values()) {
|
||||
const processorTypes = processor.getSupportedMimeTypes();
|
||||
processorTypes.forEach(type => supportedTypes.add(type));
|
||||
}
|
||||
|
||||
return Array.from(supportedTypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a MIME type is supported by any processor
|
||||
*/
|
||||
isSupportedByAnyProcessor(mimeType: string): boolean {
|
||||
if (!mimeType) return false;
|
||||
|
||||
// Check if any processor can handle this MIME type
|
||||
const processor = this.getProcessorForMimeType(mimeType);
|
||||
return processor !== null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if blob needs OCR re-processing due to content changes
|
||||
*/
|
||||
needsReprocessing(blobId: string): boolean {
|
||||
if (!blobId) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
const blobInfo = sql.getRow<{
|
||||
utcDateModified: string;
|
||||
textExtractionLastProcessed: string | null;
|
||||
}>(`
|
||||
SELECT utcDateModified, textExtractionLastProcessed
|
||||
FROM blobs
|
||||
WHERE blobId = ?
|
||||
`, [blobId]);
|
||||
|
||||
if (!blobInfo) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If OCR was never processed, it needs processing
|
||||
if (!blobInfo.textExtractionLastProcessed) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If blob was modified after last OCR processing, it needs re-processing
|
||||
const blobModified = new Date(blobInfo.utcDateModified);
|
||||
const lastOcrProcessed = new Date(blobInfo.textExtractionLastProcessed);
|
||||
|
||||
return blobModified > lastOcrProcessed;
|
||||
} catch (error) {
|
||||
log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Invalidate OCR results for a blob (clear textRepresentation and textExtractionLastProcessed)
|
||||
*/
|
||||
invalidateOCRResult(blobId: string): void {
|
||||
if (!blobId) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
sql.execute(`
|
||||
UPDATE blobs SET
|
||||
textRepresentation = NULL,
|
||||
textExtractionLastProcessed = NULL
|
||||
WHERE blobId = ?
|
||||
`, [blobId]);
|
||||
|
||||
log.info(`Invalidated OCR result for blob ${blobId}`);
|
||||
} catch (error) {
|
||||
log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get blobs that need OCR processing (modified after last OCR or never processed)
|
||||
*/
|
||||
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
|
||||
try {
|
||||
// Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
|
||||
const noteBlobs = sql.getRows<{
|
||||
blobId: string;
|
||||
mimeType: string;
|
||||
entityId: string;
|
||||
}>(`
|
||||
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
|
||||
FROM notes n
|
||||
JOIN blobs b ON n.blobId = b.blobId
|
||||
WHERE (
|
||||
n.type = 'image'
|
||||
OR (
|
||||
n.type = 'file'
|
||||
AND n.mime IN (
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/msword',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/rtf',
|
||||
'application/pdf',
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
)
|
||||
)
|
||||
)
|
||||
AND n.isDeleted = 0
|
||||
AND n.blobId IS NOT NULL
|
||||
AND (
|
||||
b.textExtractionLastProcessed IS NULL
|
||||
OR b.utcDateModified > b.textExtractionLastProcessed
|
||||
)
|
||||
`);
|
||||
|
||||
// Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
|
||||
const attachmentBlobs = sql.getRows<{
|
||||
blobId: string;
|
||||
mimeType: string;
|
||||
entityId: string;
|
||||
}>(`
|
||||
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
|
||||
FROM attachments a
|
||||
JOIN blobs b ON a.blobId = b.blobId
|
||||
WHERE (
|
||||
a.role = 'image'
|
||||
OR (
|
||||
a.role = 'file'
|
||||
AND a.mime IN (
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/msword',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/rtf',
|
||||
'application/pdf',
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
)
|
||||
)
|
||||
)
|
||||
AND a.isDeleted = 0
|
||||
AND a.blobId IS NOT NULL
|
||||
AND (
|
||||
b.textExtractionLastProcessed IS NULL
|
||||
OR b.utcDateModified > b.textExtractionLastProcessed
|
||||
)
|
||||
`);
|
||||
|
||||
// Combine results
|
||||
const result = [
|
||||
...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
|
||||
...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
|
||||
];
|
||||
|
||||
// Return all results (no need to filter by MIME type as we already did in the query)
|
||||
return result;
|
||||
} catch (error) {
|
||||
log.error(`Failed to get blobs needing OCR: ${error}`);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process OCR for all blobs that need it (auto-processing)
|
||||
*/
|
||||
async processAllBlobsNeedingOCR(): Promise<void> {
|
||||
if (!this.isOCREnabled()) {
|
||||
log.info('OCR is disabled, skipping auto-processing');
|
||||
return;
|
||||
}
|
||||
|
||||
const blobsNeedingOCR = this.getBlobsNeedingOCR();
|
||||
if (blobsNeedingOCR.length === 0) {
|
||||
log.info('No blobs need OCR processing');
|
||||
return;
|
||||
}
|
||||
|
||||
log.info(`Auto-processing OCR for ${blobsNeedingOCR.length} blobs...`);
|
||||
|
||||
for (const blobInfo of blobsNeedingOCR) {
|
||||
try {
|
||||
if (blobInfo.entityType === 'note') {
|
||||
await this.processNoteOCR(blobInfo.entityId);
|
||||
} else {
|
||||
await this.processAttachmentOCR(blobInfo.entityId);
|
||||
}
|
||||
|
||||
// Add small delay to prevent overwhelming the system
|
||||
await new Promise(resolve => setTimeout(resolve, 100));
|
||||
} catch (error) {
|
||||
log.error(`Failed to auto-process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
|
||||
// Continue with other blobs
|
||||
}
|
||||
}
|
||||
|
||||
log.info('Auto-processing OCR completed');
|
||||
}
|
||||
}
|
||||
|
||||
export default new OCRService();
|
||||
33
apps/server/src/services/ocr/processors/file_processor.ts
Normal file
33
apps/server/src/services/ocr/processors/file_processor.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
|
||||
/**
|
||||
* Base class for file processors that extract text from different file types
|
||||
*/
|
||||
export abstract class FileProcessor {
|
||||
/**
|
||||
* Check if this processor can handle the given MIME type
|
||||
*/
|
||||
abstract canProcess(mimeType: string): boolean;
|
||||
|
||||
/**
|
||||
* Extract text from the given file buffer
|
||||
*/
|
||||
abstract extractText(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult>;
|
||||
|
||||
/**
|
||||
* Get the processing type identifier
|
||||
*/
|
||||
abstract getProcessingType(): string;
|
||||
|
||||
/**
|
||||
* Get list of MIME types supported by this processor
|
||||
*/
|
||||
abstract getSupportedMimeTypes(): string[];
|
||||
|
||||
/**
|
||||
* Clean up any resources
|
||||
*/
|
||||
cleanup(): Promise<void> {
|
||||
return Promise.resolve();
|
||||
}
|
||||
}
|
||||
236
apps/server/src/services/ocr/processors/image_processor.ts
Normal file
236
apps/server/src/services/ocr/processors/image_processor.ts
Normal file
@@ -0,0 +1,236 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
|
||||
/**
|
||||
* Image processor for extracting text from image files using Tesseract
|
||||
*/
|
||||
export class ImageProcessor extends FileProcessor {
|
||||
private worker: Tesseract.Worker | null = null;
|
||||
private isInitialized = false;
|
||||
private readonly supportedTypes = [
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
];
|
||||
|
||||
canProcess(mimeType: string): boolean {
|
||||
return this.supportedTypes.includes(mimeType.toLowerCase());
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
|
||||
return [...this.supportedTypes];
|
||||
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
if (!this.isInitialized) {
|
||||
await this.initialize();
|
||||
}
|
||||
|
||||
if (!this.worker) {
|
||||
throw new Error('Image processor worker not initialized');
|
||||
}
|
||||
|
||||
try {
|
||||
log.info('Starting image OCR text extraction...');
|
||||
|
||||
// Set language if specified and different from current
|
||||
// Support multi-language format like 'ron+eng'
|
||||
const language = options.language || this.getDefaultOCRLanguage();
|
||||
|
||||
// Validate language format
|
||||
if (!this.isValidLanguageFormat(language)) {
|
||||
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
|
||||
}
|
||||
|
||||
if (language !== 'eng') {
|
||||
// For different languages, create a new worker
|
||||
await this.worker.terminate();
|
||||
log.info(`Initializing Tesseract worker for language(s): ${language}`);
|
||||
this.worker = await Tesseract.createWorker(language, 1, {
|
||||
logger: (m: { status: string; progress: number }) => {
|
||||
if (m.status === 'recognizing text') {
|
||||
log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
const result = await this.worker.recognize(buffer);
|
||||
|
||||
// Filter text based on minimum confidence threshold
|
||||
const { filteredText, overallConfidence } = this.filterTextByConfidence(result.data, options);
|
||||
|
||||
const ocrResult: OCRResult = {
|
||||
text: filteredText,
|
||||
confidence: overallConfidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount: 1
|
||||
};
|
||||
|
||||
log.info(`Image OCR extraction completed. Confidence: ${ocrResult.confidence}%, Text length: ${ocrResult.text.length}`);
|
||||
return ocrResult;
|
||||
|
||||
} catch (error) {
|
||||
log.error(`Image OCR text extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
getProcessingType(): string {
|
||||
return 'image';
|
||||
}
|
||||
|
||||
private async initialize(): Promise<void> {
|
||||
if (this.isInitialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
log.info('Initializing image OCR processor with Tesseract.js...');
|
||||
|
||||
// Configure proper paths for Node.js environment
|
||||
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
|
||||
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
|
||||
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
|
||||
|
||||
log.info(`Using worker path: ${workerPath}`);
|
||||
log.info(`Using core path: ${corePath}`);
|
||||
|
||||
this.worker = await Tesseract.createWorker(this.getDefaultOCRLanguage(), 1, {
|
||||
workerPath,
|
||||
corePath,
|
||||
logger: (m: { status: string; progress: number }) => {
|
||||
if (m.status === 'recognizing text') {
|
||||
log.info(`Image OCR progress: ${Math.round(m.progress * 100)}%`);
|
||||
}
|
||||
}
|
||||
});
|
||||
this.isInitialized = true;
|
||||
log.info('Image OCR processor initialized successfully');
|
||||
} catch (error) {
|
||||
log.error(`Failed to initialize image OCR processor: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async cleanup(): Promise<void> {
|
||||
if (this.worker) {
|
||||
await this.worker.terminate();
|
||||
this.worker = null;
|
||||
}
|
||||
this.isInitialized = false;
|
||||
log.info('Image OCR processor cleaned up');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get default OCR language from options
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
}
|
||||
return ocrLanguage;
|
||||
} catch (error) {
|
||||
log.error(`Failed to get default OCR language: ${error}`);
|
||||
throw new Error('OCR language must be configured in settings before processing');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter text based on minimum confidence threshold
|
||||
*/
|
||||
private filterTextByConfidence(data: any, options: OCRProcessingOptions): { filteredText: string; overallConfidence: number } {
|
||||
const minConfidence = this.getMinConfidenceThreshold();
|
||||
|
||||
// If no minimum confidence set, return original text
|
||||
if (minConfidence <= 0) {
|
||||
return {
|
||||
filteredText: data.text.trim(),
|
||||
overallConfidence: data.confidence / 100
|
||||
};
|
||||
}
|
||||
|
||||
const filteredWords: string[] = [];
|
||||
const validConfidences: number[] = [];
|
||||
|
||||
// Tesseract provides word-level data
|
||||
if (data.words && Array.isArray(data.words)) {
|
||||
for (const word of data.words) {
|
||||
const wordConfidence = word.confidence / 100; // Convert to decimal
|
||||
|
||||
if (wordConfidence >= minConfidence) {
|
||||
filteredWords.push(word.text);
|
||||
validConfidences.push(wordConfidence);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Fallback: if word-level data not available, use overall confidence
|
||||
const overallConfidence = data.confidence / 100;
|
||||
if (overallConfidence >= minConfidence) {
|
||||
return {
|
||||
filteredText: data.text.trim(),
|
||||
overallConfidence
|
||||
};
|
||||
}
|
||||
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
|
||||
return {
|
||||
filteredText: '',
|
||||
overallConfidence
|
||||
};
|
||||
}
|
||||
|
||||
// Calculate average confidence of accepted words
|
||||
const averageConfidence = validConfidences.length > 0
|
||||
? validConfidences.reduce((sum, conf) => sum + conf, 0) / validConfidences.length
|
||||
: 0;
|
||||
|
||||
const filteredText = filteredWords.join(' ').trim();
|
||||
|
||||
log.info(`Filtered OCR text: ${filteredWords.length} words kept out of ${data.words?.length || 0} total words (min confidence: ${minConfidence})`);
|
||||
|
||||
return {
|
||||
filteredText,
|
||||
overallConfidence: averageConfidence
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get minimum confidence threshold from options
|
||||
*/
|
||||
private getMinConfidenceThreshold(): number {
|
||||
const minConfidence = options.getOption('ocrMinConfidence') ?? 0;
|
||||
return parseFloat(minConfidence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate OCR language format
|
||||
* Supports single language (eng) or multi-language (ron+eng)
|
||||
*/
|
||||
private isValidLanguageFormat(language: string): boolean {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
133
apps/server/src/services/ocr/processors/office_processor.ts
Normal file
133
apps/server/src/services/ocr/processors/office_processor.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
import * as officeParser from 'officeparser';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
/**
|
||||
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
|
||||
*/
|
||||
export class OfficeProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = [
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
|
||||
'application/msword', // DOC
|
||||
'application/vnd.ms-excel', // XLS
|
||||
'application/vnd.ms-powerpoint', // PPT
|
||||
'application/rtf' // RTF
|
||||
];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.imageProcessor = new ImageProcessor();
|
||||
}
|
||||
|
||||
canProcess(mimeType: string): boolean {
|
||||
return this.supportedTypes.includes(mimeType);
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
|
||||
return [...this.supportedTypes];
|
||||
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info('Starting Office document text extraction...');
|
||||
|
||||
// Validate language format
|
||||
const language = options.language || this.getDefaultOCRLanguage();
|
||||
if (!this.isValidLanguageFormat(language)) {
|
||||
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
|
||||
}
|
||||
|
||||
// Extract text from Office document
|
||||
const data = await this.parseOfficeDocument(buffer);
|
||||
|
||||
// Extract text from Office document
|
||||
const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
|
||||
const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
|
||||
|
||||
const result: OCRResult = {
|
||||
text: combinedText,
|
||||
confidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language,
|
||||
pageCount: 1 // Office documents are treated as single logical document
|
||||
};
|
||||
|
||||
log.info(`Office document text extraction completed. Confidence: ${confidence}%, Text length: ${result.text.length}`);
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
log.error(`Office document text extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
|
||||
try {
|
||||
// Use promise-based API directly
|
||||
const data = await officeParser.parseOfficeAsync(buffer, {
|
||||
outputErrorToConsole: false,
|
||||
newlineDelimiter: '\n',
|
||||
ignoreNotes: false,
|
||||
putNotesAtLast: false
|
||||
});
|
||||
|
||||
return {
|
||||
data: data || ''
|
||||
};
|
||||
} catch (error) {
|
||||
throw new Error(`Office document parsing failed: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
getProcessingType(): string {
|
||||
return 'office';
|
||||
}
|
||||
|
||||
async cleanup(): Promise<void> {
|
||||
await this.imageProcessor.cleanup();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get default OCR language from options
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
}
|
||||
return ocrLanguage;
|
||||
} catch (error) {
|
||||
log.error(`Failed to get default OCR language: ${error}`);
|
||||
throw new Error('OCR language must be configured in settings before processing');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate OCR language format
|
||||
* Supports single language (eng) or multi-language (ron+eng)
|
||||
*/
|
||||
private isValidLanguageFormat(language: string): boolean {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
147
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
147
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
@@ -0,0 +1,147 @@
|
||||
import * as pdfParse from 'pdf-parse';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
/**
|
||||
* PDF processor for extracting text from PDF files
|
||||
* First tries to extract existing text, then falls back to OCR on images
|
||||
*/
|
||||
export class PDFProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = ['application/pdf'];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.imageProcessor = new ImageProcessor();
|
||||
}
|
||||
|
||||
canProcess(mimeType: string): boolean {
|
||||
return mimeType.toLowerCase() === 'application/pdf';
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
|
||||
return [...this.supportedTypes];
|
||||
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info('Starting PDF text extraction...');
|
||||
|
||||
// Validate language format
|
||||
const language = options.language || this.getDefaultOCRLanguage();
|
||||
if (!this.isValidLanguageFormat(language)) {
|
||||
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
|
||||
}
|
||||
|
||||
// First try to extract existing text from PDF
|
||||
if (options.enablePDFTextExtraction !== false) {
|
||||
const textResult = await this.extractTextFromPDF(buffer, options);
|
||||
if (textResult.text.trim().length > 0) {
|
||||
log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
|
||||
return textResult;
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to OCR if no text found or PDF text extraction is disabled
|
||||
log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
|
||||
return await this.extractTextViaOCR(buffer, options);
|
||||
|
||||
} catch (error) {
|
||||
log.error(`PDF text extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
|
||||
try {
|
||||
const data = await pdfParse(buffer);
|
||||
|
||||
return {
|
||||
text: data.text.trim(),
|
||||
confidence: 0.99, // High confidence for direct text extraction
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount: data.numpages
|
||||
};
|
||||
} catch (error) {
|
||||
log.error(`PDF text extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async extractTextViaOCR(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
|
||||
try {
|
||||
// Convert PDF to images and OCR each page
|
||||
// For now, we'll use a simple approach - convert first page to image
|
||||
// In a full implementation, we'd convert all pages
|
||||
|
||||
// This is a simplified implementation
|
||||
// In practice, you might want to use pdf2pic or similar library
|
||||
// to convert PDF pages to images for OCR
|
||||
|
||||
// For now, we'll return a placeholder result
|
||||
// indicating that OCR on PDF is not fully implemented
|
||||
log.info('PDF to image conversion not fully implemented, returning placeholder');
|
||||
|
||||
return {
|
||||
text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
|
||||
confidence: 0.0,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount: 1
|
||||
};
|
||||
} catch (error) {
|
||||
log.error(`PDF OCR extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
getProcessingType(): string {
|
||||
return 'pdf';
|
||||
}
|
||||
|
||||
async cleanup(): Promise<void> {
|
||||
await this.imageProcessor.cleanup();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get default OCR language from options
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
}
|
||||
return ocrLanguage;
|
||||
} catch (error) {
|
||||
log.error(`Failed to get default OCR language: ${error}`);
|
||||
throw new Error('OCR language must be configured in settings before processing');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate OCR language format
|
||||
* Supports single language (eng) or multi-language (ron+eng)
|
||||
*/
|
||||
private isValidLanguageFormat(language: string): boolean {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
135
apps/server/src/services/ocr/processors/tiff_processor.ts
Normal file
135
apps/server/src/services/ocr/processors/tiff_processor.ts
Normal file
@@ -0,0 +1,135 @@
|
||||
import sharp from 'sharp';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
/**
|
||||
* TIFF processor for extracting text from multi-page TIFF files
|
||||
*/
|
||||
export class TIFFProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = ['image/tiff', 'image/tif'];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.imageProcessor = new ImageProcessor();
|
||||
}
|
||||
|
||||
canProcess(mimeType: string): boolean {
|
||||
return mimeType.toLowerCase() === 'image/tiff' || mimeType.toLowerCase() === 'image/tif';
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
|
||||
return [...this.supportedTypes];
|
||||
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info('Starting TIFF text extraction...');
|
||||
|
||||
// Validate language format
|
||||
const language = options.language || this.getDefaultOCRLanguage();
|
||||
if (!this.isValidLanguageFormat(language)) {
|
||||
throw new Error(`Invalid OCR language format: ${language}. Use format like 'eng' or 'ron+eng'`);
|
||||
}
|
||||
|
||||
// Check if this is a multi-page TIFF
|
||||
const metadata = await sharp(buffer).metadata();
|
||||
const pageCount = metadata.pages || 1;
|
||||
|
||||
let combinedText = '';
|
||||
let totalConfidence = 0;
|
||||
|
||||
// Process each page
|
||||
for (let page = 0; page < pageCount; page++) {
|
||||
try {
|
||||
log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);
|
||||
|
||||
// Extract page as PNG buffer
|
||||
const pageBuffer = await sharp(buffer, { page })
|
||||
.png()
|
||||
.toBuffer();
|
||||
|
||||
// OCR the page
|
||||
const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
|
||||
|
||||
if (pageResult.text.trim().length > 0) {
|
||||
if (combinedText.length > 0) {
|
||||
combinedText += `\n\n--- Page ${page + 1} ---\n`;
|
||||
}
|
||||
combinedText += pageResult.text;
|
||||
totalConfidence += pageResult.confidence;
|
||||
}
|
||||
} catch (error) {
|
||||
log.error(`Failed to process TIFF page ${page + 1}: ${error}`);
|
||||
// Continue with other pages
|
||||
}
|
||||
}
|
||||
|
||||
const averageConfidence = pageCount > 0 ? totalConfidence / pageCount : 0;
|
||||
|
||||
const result: OCRResult = {
|
||||
text: combinedText.trim(),
|
||||
confidence: averageConfidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount
|
||||
};
|
||||
|
||||
log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
log.error(`TIFF text extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
getProcessingType(): string {
|
||||
return 'tiff';
|
||||
}
|
||||
|
||||
async cleanup(): Promise<void> {
|
||||
await this.imageProcessor.cleanup();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get default OCR language from options
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
}
|
||||
return ocrLanguage;
|
||||
} catch (error) {
|
||||
log.error(`Failed to get default OCR language: ${error}`);
|
||||
throw new Error('OCR language must be configured in settings before processing');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate OCR language format
|
||||
* Supports single language (eng) or multi-language (ron+eng)
|
||||
*/
|
||||
private isValidLanguageFormat(language: string): boolean {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
// Check each language code (should be 2-7 characters, alphanumeric with underscores)
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -212,7 +212,13 @@ const defaultOptions: DefaultOption[] = [
|
||||
{ name: "experimentalFeatures", value: "[]", isSynced: true },
|
||||
|
||||
// AI / LLM
|
||||
{ name: "llmProviders", value: "[]", isSynced: false }
|
||||
{ name: "llmProviders", value: "[]", isSynced: false },
|
||||
|
||||
// OCR options
|
||||
{ name: "ocrEnabled", value: "false", isSynced: true },
|
||||
{ name: "ocrLanguage", value: "eng", isSynced: true },
|
||||
{ name: "ocrAutoProcessImages", value: "true", isSynced: true },
|
||||
{ name: "ocrMinConfidence", value: "0.55", isSynced: true },
|
||||
];
|
||||
|
||||
/**
|
||||
|
||||
111
apps/server/src/services/search/expressions/ocr_content.ts
Normal file
111
apps/server/src/services/search/expressions/ocr_content.ts
Normal file
@@ -0,0 +1,111 @@
|
||||
import Expression from "./expression.js";
|
||||
import SearchContext from "../search_context.js";
|
||||
import NoteSet from "../note_set.js";
|
||||
import sql from "../../sql.js";
|
||||
import becca from "../../../becca/becca.js";
|
||||
|
||||
/**
|
||||
* Search expression for finding text within OCR-extracted content from images
|
||||
*/
|
||||
export default class OCRContentExpression extends Expression {
|
||||
private searchText: string;
|
||||
|
||||
constructor(searchText: string) {
|
||||
super();
|
||||
this.searchText = searchText;
|
||||
}
|
||||
|
||||
execute(inputNoteSet: NoteSet, executionContext: object, searchContext: SearchContext): NoteSet {
|
||||
// Don't search OCR content if it's not enabled
|
||||
if (!this.isOCRSearchEnabled()) {
|
||||
return new NoteSet();
|
||||
}
|
||||
|
||||
const resultNoteSet = new NoteSet();
|
||||
const ocrResults = this.searchOCRContent(this.searchText);
|
||||
|
||||
for (const ocrResult of ocrResults) {
|
||||
// Find notes that use this blob
|
||||
const notes = sql.getRows<{noteId: string}>(`
|
||||
SELECT noteId FROM notes
|
||||
WHERE blobId = ? AND isDeleted = 0
|
||||
`, [ocrResult.blobId]);
|
||||
|
||||
for (const noteRow of notes) {
|
||||
const note = becca.getNote(noteRow.noteId);
|
||||
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
|
||||
resultNoteSet.add(note);
|
||||
}
|
||||
}
|
||||
|
||||
// Find attachments that use this blob and their parent notes
|
||||
const attachments = sql.getRows<{ownerId: string}>(`
|
||||
SELECT ownerId FROM attachments
|
||||
WHERE blobId = ? AND isDeleted = 0
|
||||
`, [ocrResult.blobId]);
|
||||
|
||||
for (const attachmentRow of attachments) {
|
||||
const note = becca.getNote(attachmentRow.ownerId);
|
||||
if (note && !note.isDeleted && inputNoteSet.hasNoteId(note.noteId)) {
|
||||
resultNoteSet.add(note);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add highlight tokens for OCR matches
|
||||
if (ocrResults.length > 0) {
|
||||
const tokens = this.extractHighlightTokens(this.searchText);
|
||||
searchContext.highlightedTokens.push(...tokens);
|
||||
}
|
||||
|
||||
return resultNoteSet;
|
||||
}
|
||||
|
||||
private isOCRSearchEnabled(): boolean {
|
||||
try {
|
||||
const optionService = require('../../options.js').default;
|
||||
return optionService.getOptionBool('ocrEnabled');
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private searchOCRContent(searchText: string): Array<{
|
||||
blobId: string;
|
||||
textRepresentation: string;
|
||||
}> {
|
||||
try {
|
||||
// Search in blobs table for OCR text
|
||||
const query = `
|
||||
SELECT blobId, textRepresentation
|
||||
FROM blobs
|
||||
WHERE textRepresentation LIKE ?
|
||||
AND textRepresentation IS NOT NULL
|
||||
AND textRepresentation != ''
|
||||
LIMIT 50
|
||||
`;
|
||||
const params = [`%${searchText}%`];
|
||||
|
||||
return sql.getRows<{
|
||||
blobId: string;
|
||||
textRepresentation: string;
|
||||
}>(query, params);
|
||||
} catch (error) {
|
||||
console.error('Error searching OCR content:', error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private extractHighlightTokens(searchText: string): string[] {
|
||||
// Split search text into words and return them as highlight tokens
|
||||
return searchText
|
||||
.split(/\s+/)
|
||||
.filter(token => token.length > 2)
|
||||
.map(token => token.toLowerCase());
|
||||
}
|
||||
|
||||
toString(): string {
|
||||
return `OCRContent('${this.searchText}')`;
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,11 @@
|
||||
"use strict";
|
||||
|
||||
import beccaService from "../../becca/becca_service.js";
|
||||
import becca from "../../becca/becca.js";
|
||||
import {
|
||||
normalizeSearchText,
|
||||
calculateOptimizedEditDistance,
|
||||
FUZZY_SEARCH_CONFIG
|
||||
} from "./utils/text_utils.js";
|
||||
import beccaService from "../../becca/becca_service.js";
|
||||
import options from "../options.js";
|
||||
import sql from "../sql.js";
|
||||
import {
|
||||
calculateOptimizedEditDistance,
|
||||
FUZZY_SEARCH_CONFIG,
|
||||
normalizeSearchText} from "./utils/text_utils.js";
|
||||
|
||||
// Scoring constants for better maintainability
|
||||
const SCORE_WEIGHTS = {
|
||||
@@ -85,6 +84,9 @@ class SearchResult {
|
||||
this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR, enableFuzzyMatching);
|
||||
this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR, enableFuzzyMatching);
|
||||
|
||||
// Add OCR scoring - weight between title and content matches
|
||||
this.addOCRScore(tokens, 1.5);
|
||||
|
||||
if (note.isInHiddenSubtree()) {
|
||||
this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
|
||||
}
|
||||
@@ -98,7 +100,7 @@ class SearchResult {
|
||||
for (const chunk of chunks) {
|
||||
for (const token of tokens) {
|
||||
const normalizedToken = normalizeSearchText(token.toLowerCase());
|
||||
|
||||
|
||||
if (chunk === normalizedToken) {
|
||||
tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
|
||||
} else if (chunk.startsWith(normalizedToken)) {
|
||||
@@ -108,10 +110,10 @@ class SearchResult {
|
||||
} else {
|
||||
// Try fuzzy matching for individual tokens with caps applied
|
||||
const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||
if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||
normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
|
||||
this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
|
||||
|
||||
|
||||
const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
// Apply caps: limit token length multiplier and per-token contribution
|
||||
const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);
|
||||
@@ -119,7 +121,7 @@ class SearchResult {
|
||||
fuzzyWeight * cappedTokenLength * factor,
|
||||
SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN
|
||||
);
|
||||
|
||||
|
||||
tokenScore += fuzzyTokenScore;
|
||||
this.fuzzyScore += fuzzyTokenScore;
|
||||
}
|
||||
@@ -129,13 +131,43 @@ class SearchResult {
|
||||
this.score += tokenScore;
|
||||
}
|
||||
|
||||
addOCRScore(tokens: string[], factor: number) {
|
||||
try {
|
||||
// Check if OCR is enabled
|
||||
if (!options.getOptionBool('ocrEnabled')) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Search for OCR results for this note and its attachments
|
||||
const ocrResults = sql.getRows(`
|
||||
SELECT b.textRepresentation
|
||||
FROM blobs b
|
||||
WHERE b.textRepresentation IS NOT NULL
|
||||
AND b.textRepresentation != ''
|
||||
AND (
|
||||
b.blobId = (SELECT blobId FROM notes WHERE noteId = ? AND isDeleted = 0)
|
||||
OR b.blobId IN (
|
||||
SELECT blobId FROM attachments WHERE ownerId = ? AND isDeleted = 0
|
||||
)
|
||||
)
|
||||
`, [this.noteId, this.noteId]);
|
||||
|
||||
for (const ocrResult of ocrResults as Array<{textRepresentation: string}>) {
|
||||
// Add score for OCR text matches
|
||||
this.addScoreForStrings(tokens, ocrResult.textRepresentation, factor);
|
||||
}
|
||||
} catch (error) {
|
||||
// Silently fail if OCR service is not available
|
||||
console.debug('OCR scoring failed:', error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the query matches as a complete word in the text
|
||||
*/
|
||||
private isWordMatch(text: string, query: string): boolean {
|
||||
return text.includes(` ${query} `) ||
|
||||
text.startsWith(`${query} `) ||
|
||||
return text.includes(` ${query} `) ||
|
||||
text.startsWith(`${query} `) ||
|
||||
text.endsWith(` ${query}`);
|
||||
}
|
||||
|
||||
@@ -147,21 +179,21 @@ class SearchResult {
|
||||
if (this.fuzzyScore >= SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
const editDistance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||
const maxLen = Math.max(title.length, query.length);
|
||||
|
||||
|
||||
// Only apply fuzzy matching if the query is reasonably long and edit distance is small
|
||||
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
|
||||
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
|
||||
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||
editDistance / maxLen <= 0.3) {
|
||||
const similarity = 1 - (editDistance / maxLen);
|
||||
const baseFuzzyScore = SCORE_WEIGHTS.TITLE_WORD_MATCH * similarity * 0.7; // Reduced weight for fuzzy matches
|
||||
|
||||
|
||||
// Apply cap to ensure fuzzy title matches don't exceed reasonable bounds
|
||||
return Math.min(baseFuzzyScore, SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE * 0.3);
|
||||
}
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
337
apps/server/src/services/search/search_result_ocr.spec.ts
Normal file
337
apps/server/src/services/search/search_result_ocr.spec.ts
Normal file
@@ -0,0 +1,337 @@
|
||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
|
||||
// Mock dependencies
|
||||
const mockSql = {
|
||||
getRows: vi.fn()
|
||||
};
|
||||
|
||||
const mockOptions = {
|
||||
getOptionBool: vi.fn()
|
||||
};
|
||||
|
||||
const mockBecca = {
|
||||
notes: {},
|
||||
getNote: vi.fn()
|
||||
};
|
||||
|
||||
const mockBeccaService = {
|
||||
getNoteTitleForPath: vi.fn()
|
||||
};
|
||||
|
||||
vi.mock('../sql.js', () => ({
|
||||
default: mockSql
|
||||
}));
|
||||
|
||||
vi.mock('../options.js', () => ({
|
||||
default: mockOptions
|
||||
}));
|
||||
|
||||
// The SearchResult now uses proper ES imports which are mocked above
|
||||
|
||||
vi.mock('../../becca/becca.js', () => ({
|
||||
default: mockBecca
|
||||
}));
|
||||
|
||||
vi.mock('../../becca/becca_service.js', () => ({
|
||||
default: mockBeccaService
|
||||
}));
|
||||
|
||||
// Import SearchResult after mocking
|
||||
let SearchResult: any;
|
||||
|
||||
beforeEach(async () => {
|
||||
vi.clearAllMocks();
|
||||
|
||||
// Reset mock implementations
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
mockSql.getRows.mockReturnValue([]);
|
||||
mockBeccaService.getNoteTitleForPath.mockReturnValue('Test Note Title');
|
||||
|
||||
// Setup mock note
|
||||
const mockNote = {
|
||||
noteId: 'test123',
|
||||
title: 'Test Note',
|
||||
isInHiddenSubtree: vi.fn().mockReturnValue(false)
|
||||
};
|
||||
mockBecca.notes['test123'] = mockNote;
|
||||
|
||||
// Dynamically import SearchResult
|
||||
const module = await import('./search_result.js');
|
||||
SearchResult = module.default;
|
||||
});
|
||||
|
||||
describe('SearchResult', () => {
|
||||
describe('constructor', () => {
|
||||
it('should initialize with note path array', () => {
|
||||
const searchResult = new SearchResult(['root', 'folder', 'test123']);
|
||||
|
||||
expect(searchResult.notePathArray).toEqual(['root', 'folder', 'test123']);
|
||||
expect(searchResult.noteId).toBe('test123');
|
||||
expect(searchResult.notePath).toBe('root/folder/test123');
|
||||
expect(searchResult.score).toBe(0);
|
||||
expect(mockBeccaService.getNoteTitleForPath).toHaveBeenCalledWith(['root', 'folder', 'test123']);
|
||||
});
|
||||
});
|
||||
|
||||
describe('computeScore', () => {
|
||||
let searchResult: any;
|
||||
|
||||
beforeEach(() => {
|
||||
searchResult = new SearchResult(['root', 'test123']);
|
||||
});
|
||||
|
||||
describe('basic scoring', () => {
|
||||
it('should give highest score for exact note ID match', () => {
|
||||
searchResult.computeScore('test123', ['test123']);
|
||||
expect(searchResult.score).toBeGreaterThanOrEqual(1000);
|
||||
});
|
||||
|
||||
it('should give high score for exact title match', () => {
|
||||
searchResult.computeScore('test note', ['test', 'note']);
|
||||
expect(searchResult.score).toBeGreaterThan(2000);
|
||||
});
|
||||
|
||||
it('should give medium score for title prefix match', () => {
|
||||
searchResult.computeScore('test', ['test']);
|
||||
expect(searchResult.score).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
it('should give lower score for title word match', () => {
|
||||
mockBecca.notes['test123'].title = 'This is a test note';
|
||||
searchResult.computeScore('test', ['test']);
|
||||
expect(searchResult.score).toBeGreaterThan(300);
|
||||
});
|
||||
});
|
||||
|
||||
describe('OCR scoring integration', () => {
|
||||
beforeEach(() => {
|
||||
// Mock OCR-enabled
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
});
|
||||
|
||||
it('should add OCR score when OCR results exist', () => {
|
||||
const mockOCRResults = [
|
||||
{
|
||||
extracted_text: 'sample text from image',
|
||||
confidence: 0.95
|
||||
}
|
||||
];
|
||||
mockSql.getRows.mockReturnValue(mockOCRResults);
|
||||
|
||||
searchResult.computeScore('sample', ['sample']);
|
||||
|
||||
expect(mockSql.getRows).toHaveBeenCalledWith(
|
||||
expect.stringContaining('FROM ocr_results'),
|
||||
['test123', 'test123']
|
||||
);
|
||||
expect(searchResult.score).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('should apply confidence weighting to OCR scores', () => {
|
||||
const highConfidenceResult = [
|
||||
{
|
||||
extracted_text: 'sample text',
|
||||
confidence: 0.95
|
||||
}
|
||||
];
|
||||
const lowConfidenceResult = [
|
||||
{
|
||||
extracted_text: 'sample text',
|
||||
confidence: 0.30
|
||||
}
|
||||
];
|
||||
|
||||
// Test high confidence
|
||||
mockSql.getRows.mockReturnValue(highConfidenceResult);
|
||||
searchResult.computeScore('sample', ['sample']);
|
||||
const highConfidenceScore = searchResult.score;
|
||||
|
||||
// Reset and test low confidence
|
||||
searchResult.score = 0;
|
||||
mockSql.getRows.mockReturnValue(lowConfidenceResult);
|
||||
searchResult.computeScore('sample', ['sample']);
|
||||
const lowConfidenceScore = searchResult.score;
|
||||
|
||||
expect(highConfidenceScore).toBeGreaterThan(lowConfidenceScore);
|
||||
});
|
||||
|
||||
it('should handle multiple OCR results', () => {
|
||||
const multipleResults = [
|
||||
{
|
||||
extracted_text: 'first sample text',
|
||||
confidence: 0.90
|
||||
},
|
||||
{
|
||||
extracted_text: 'second sample document',
|
||||
confidence: 0.85
|
||||
}
|
||||
];
|
||||
mockSql.getRows.mockReturnValue(multipleResults);
|
||||
|
||||
searchResult.computeScore('sample', ['sample']);
|
||||
|
||||
expect(searchResult.score).toBeGreaterThan(0);
|
||||
// Score should account for multiple matches
|
||||
});
|
||||
|
||||
it('should skip OCR scoring when OCR is disabled', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(false);
|
||||
|
||||
searchResult.computeScore('sample', ['sample']);
|
||||
|
||||
expect(mockSql.getRows).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should handle OCR scoring errors gracefully', () => {
|
||||
mockSql.getRows.mockImplementation(() => {
|
||||
throw new Error('Database error');
|
||||
});
|
||||
|
||||
expect(() => {
|
||||
searchResult.computeScore('sample', ['sample']);
|
||||
}).not.toThrow();
|
||||
|
||||
// Score should still be calculated from other factors
|
||||
expect(searchResult.score).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('hidden notes penalty', () => {
|
||||
it('should apply penalty for hidden notes', () => {
|
||||
mockBecca.notes['test123'].isInHiddenSubtree.mockReturnValue(true);
|
||||
|
||||
searchResult.computeScore('test', ['test']);
|
||||
const hiddenScore = searchResult.score;
|
||||
|
||||
// Reset and test non-hidden
|
||||
mockBecca.notes['test123'].isInHiddenSubtree.mockReturnValue(false);
|
||||
searchResult.score = 0;
|
||||
searchResult.computeScore('test', ['test']);
|
||||
const normalScore = searchResult.score;
|
||||
|
||||
expect(normalScore).toBeGreaterThan(hiddenScore);
|
||||
expect(hiddenScore).toBe(normalScore / 3);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('addScoreForStrings', () => {
|
||||
let searchResult: any;
|
||||
|
||||
beforeEach(() => {
|
||||
searchResult = new SearchResult(['root', 'test123']);
|
||||
});
|
||||
|
||||
it('should give highest score for exact token match', () => {
|
||||
searchResult.addScoreForStrings(['sample'], 'sample text', 1.0);
|
||||
const exactScore = searchResult.score;
|
||||
|
||||
searchResult.score = 0;
|
||||
searchResult.addScoreForStrings(['sample'], 'sampling text', 1.0);
|
||||
const prefixScore = searchResult.score;
|
||||
|
||||
searchResult.score = 0;
|
||||
searchResult.addScoreForStrings(['sample'], 'text sample text', 1.0);
|
||||
const partialScore = searchResult.score;
|
||||
|
||||
expect(exactScore).toBeGreaterThan(prefixScore);
|
||||
expect(exactScore).toBeGreaterThanOrEqual(partialScore);
|
||||
});
|
||||
|
||||
it('should apply factor multiplier correctly', () => {
|
||||
searchResult.addScoreForStrings(['sample'], 'sample text', 2.0);
|
||||
const doubleFactorScore = searchResult.score;
|
||||
|
||||
searchResult.score = 0;
|
||||
searchResult.addScoreForStrings(['sample'], 'sample text', 1.0);
|
||||
const singleFactorScore = searchResult.score;
|
||||
|
||||
expect(doubleFactorScore).toBe(singleFactorScore * 2);
|
||||
});
|
||||
|
||||
it('should handle multiple tokens', () => {
|
||||
searchResult.addScoreForStrings(['hello', 'world'], 'hello world test', 1.0);
|
||||
expect(searchResult.score).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('should be case insensitive', () => {
|
||||
searchResult.addScoreForStrings(['sample'], 'sample text', 1.0);
|
||||
const lowerCaseScore = searchResult.score;
|
||||
|
||||
searchResult.score = 0;
|
||||
searchResult.addScoreForStrings(['sample'], 'SAMPLE text', 1.0);
|
||||
const upperCaseScore = searchResult.score;
|
||||
|
||||
expect(upperCaseScore).toEqual(lowerCaseScore);
|
||||
expect(upperCaseScore).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('addOCRScore', () => {
|
||||
let searchResult: any;
|
||||
|
||||
beforeEach(() => {
|
||||
searchResult = new SearchResult(['root', 'test123']);
|
||||
});
|
||||
|
||||
it('should query for both note and attachment OCR results', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
mockSql.getRows.mockReturnValue([]);
|
||||
|
||||
searchResult.addOCRScore(['sample'], 1.5);
|
||||
|
||||
expect(mockSql.getRows).toHaveBeenCalledWith(
|
||||
expect.stringContaining('FROM ocr_results'),
|
||||
['test123', 'test123']
|
||||
);
|
||||
});
|
||||
|
||||
it('should apply minimum confidence multiplier', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
const lowConfidenceResult = [
|
||||
{
|
||||
extracted_text: 'sample text',
|
||||
confidence: 0.1 // Very low confidence
|
||||
}
|
||||
];
|
||||
mockSql.getRows.mockReturnValue(lowConfidenceResult);
|
||||
|
||||
searchResult.addOCRScore(['sample'], 1.0);
|
||||
|
||||
// Should still get some score due to minimum 0.5x multiplier
|
||||
expect(searchResult.score).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('should handle database query errors', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(true);
|
||||
mockSql.getRows.mockImplementation(() => {
|
||||
throw new Error('Database connection failed');
|
||||
});
|
||||
|
||||
// Should not throw error
|
||||
expect(() => {
|
||||
searchResult.addOCRScore(['sample'], 1.5);
|
||||
}).not.toThrow();
|
||||
});
|
||||
|
||||
it('should skip when OCR is disabled', () => {
|
||||
mockOptions.getOptionBool.mockReturnValue(false);
|
||||
|
||||
searchResult.addOCRScore(['sample'], 1.5);
|
||||
|
||||
expect(mockSql.getRows).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should handle options service errors', () => {
|
||||
mockOptions.getOptionBool.mockImplementation(() => {
|
||||
throw new Error('Options service unavailable');
|
||||
});
|
||||
|
||||
expect(() => {
|
||||
searchResult.addOCRScore(['sample'], 1.5);
|
||||
}).not.toThrow();
|
||||
|
||||
expect(mockSql.getRows).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1,28 +1,30 @@
|
||||
"use strict";
|
||||
|
||||
|
||||
import { dayjs } from "@triliumnext/commons";
|
||||
|
||||
import { removeDiacritic } from "../../utils.js";
|
||||
import AncestorExp from "../expressions/ancestor.js";
|
||||
import AndExp from "../expressions/and.js";
|
||||
import OrExp from "../expressions/or.js";
|
||||
import NotExp from "../expressions/not.js";
|
||||
import AttributeExistsExp from "../expressions/attribute_exists.js";
|
||||
import ChildOfExp from "../expressions/child_of.js";
|
||||
import DescendantOfExp from "../expressions/descendant_of.js";
|
||||
import ParentOfExp from "../expressions/parent_of.js";
|
||||
import RelationWhereExp from "../expressions/relation_where.js";
|
||||
import PropertyComparisonExp from "../expressions/property_comparison.js";
|
||||
import AttributeExistsExp from "../expressions/attribute_exists.js";
|
||||
import LabelComparisonExp from "../expressions/label_comparison.js";
|
||||
import NoteFlatTextExp from "../expressions/note_flat_text.js";
|
||||
import NoteContentFulltextExp from "../expressions/note_content_fulltext.js";
|
||||
import OrderByAndLimitExp from "../expressions/order_by_and_limit.js";
|
||||
import AncestorExp from "../expressions/ancestor.js";
|
||||
import buildComparator from "./build_comparator.js";
|
||||
import ValueExtractor from "../value_extractor.js";
|
||||
import { removeDiacritic } from "../../utils.js";
|
||||
import TrueExp from "../expressions/true.js";
|
||||
import IsHiddenExp from "../expressions/is_hidden.js";
|
||||
import type SearchContext from "../search_context.js";
|
||||
import type { TokenData, TokenStructure } from "./types.js";
|
||||
import type Expression from "../expressions/expression.js";
|
||||
import IsHiddenExp from "../expressions/is_hidden.js";
|
||||
import LabelComparisonExp from "../expressions/label_comparison.js";
|
||||
import NotExp from "../expressions/not.js";
|
||||
import NoteContentFulltextExp from "../expressions/note_content_fulltext.js";
|
||||
import NoteFlatTextExp from "../expressions/note_flat_text.js";
|
||||
import OCRContentExpression from "../expressions/ocr_content.js";
|
||||
import OrExp from "../expressions/or.js";
|
||||
import OrderByAndLimitExp from "../expressions/order_by_and_limit.js";
|
||||
import ParentOfExp from "../expressions/parent_of.js";
|
||||
import PropertyComparisonExp from "../expressions/property_comparison.js";
|
||||
import RelationWhereExp from "../expressions/relation_where.js";
|
||||
import TrueExp from "../expressions/true.js";
|
||||
import type SearchContext from "../search_context.js";
|
||||
import ValueExtractor from "../value_extractor.js";
|
||||
import buildComparator from "./build_comparator.js";
|
||||
import type { TokenData, TokenStructure } from "./types.js";
|
||||
|
||||
function getFulltext(_tokens: TokenData[], searchContext: SearchContext, leadingOperator?: string) {
|
||||
const tokens: string[] = _tokens.map((t) => removeDiacritic(t.token));
|
||||
@@ -42,16 +44,33 @@ function getFulltext(_tokens: TokenData[], searchContext: SearchContext, leading
|
||||
// Exact match on title OR exact match on content OR exact match in flat text (includes attributes)
|
||||
// For multi-word, join tokens with space to form exact phrase
|
||||
const titleSearchValue = tokens.join(" ");
|
||||
return new OrExp([
|
||||
const exactMatchExpressions: Expression[] = [
|
||||
new PropertyComparisonExp(searchContext, "title", "=", titleSearchValue),
|
||||
new NoteContentFulltextExp("=", { tokens, flatText: false }),
|
||||
new NoteContentFulltextExp("=", { tokens, flatText: true })
|
||||
]);
|
||||
];
|
||||
|
||||
// Add OCR content search for each token
|
||||
for (const token of tokens) {
|
||||
exactMatchExpressions.push(new OCRContentExpression(token));
|
||||
}
|
||||
|
||||
return new OrExp(exactMatchExpressions);
|
||||
}
|
||||
return new OrExp([new NoteFlatTextExp(tokens), new NoteContentFulltextExp(operator, { tokens, flatText: true })]);
|
||||
} else {
|
||||
return new NoteFlatTextExp(tokens);
|
||||
|
||||
const searchExpressions: Expression[] = [
|
||||
new NoteFlatTextExp(tokens),
|
||||
new NoteContentFulltextExp(operator, { tokens, flatText: true })
|
||||
];
|
||||
|
||||
// Add OCR content search for each token
|
||||
for (const token of tokens) {
|
||||
searchExpressions.push(new OCRContentExpression(token));
|
||||
}
|
||||
|
||||
return new OrExp(searchExpressions);
|
||||
}
|
||||
return new NoteFlatTextExp(tokens);
|
||||
}
|
||||
|
||||
const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%=", "~=", "~*"]);
|
||||
@@ -298,9 +317,9 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
|
||||
searchContext.addError(`Relation can be compared only with property, e.g. ~relation.title=hello in ${context(i)}`);
|
||||
|
||||
return null;
|
||||
} else {
|
||||
return new AttributeExistsExp("relation", relationName, searchContext.fuzzyAttributeSearch);
|
||||
}
|
||||
return new AttributeExistsExp("relation", relationName, searchContext.fuzzyAttributeSearch);
|
||||
|
||||
}
|
||||
|
||||
function parseOrderByAndLimit() {
|
||||
@@ -308,7 +327,7 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
|
||||
valueExtractor: ValueExtractor;
|
||||
direction: string;
|
||||
}[] = [];
|
||||
let limit: number | undefined = undefined;
|
||||
let limit: number | undefined;
|
||||
|
||||
if (tokens[i].token === "orderby") {
|
||||
do {
|
||||
@@ -354,9 +373,9 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
|
||||
return AndExp.of(expressions);
|
||||
} else if (op === "or") {
|
||||
return OrExp.of(expressions);
|
||||
} else {
|
||||
throw new Error(`Unrecognized op=${op}`);
|
||||
}
|
||||
throw new Error(`Unrecognized op=${op}`);
|
||||
|
||||
}
|
||||
|
||||
for (i = 0; i < tokens.length; i++) {
|
||||
@@ -423,7 +442,7 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
|
||||
} else if (op !== token) {
|
||||
searchContext.addError("Mixed usage of AND/OR - always use parenthesis to group AND/OR expressions.");
|
||||
}
|
||||
} else if (isOperator({ token: token })) {
|
||||
} else if (isOperator({ token })) {
|
||||
searchContext.addError(`Misplaced or incomplete expression "${token}"`);
|
||||
} else {
|
||||
searchContext.addError(`Unrecognized expression "${token}"`);
|
||||
@@ -493,9 +512,9 @@ function getAncestorExp({ ancestorNoteId, ancestorDepth, includeHiddenNotes }: S
|
||||
return new AncestorExp(ancestorNoteId, ancestorDepth);
|
||||
} else if (!includeHiddenNotes) {
|
||||
return new NotExp(new IsHiddenExp());
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
export default parse;
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
"preview": "pnpm build && vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"i18next": "25.10.10",
|
||||
"i18next": "26.0.1",
|
||||
"i18next-http-backend": "3.0.2",
|
||||
"preact": "10.29.0",
|
||||
"preact-iso": "2.11.1",
|
||||
|
||||
@@ -27,8 +27,7 @@ export function initTranslations(lng: string) {
|
||||
initAsync: false,
|
||||
react: {
|
||||
useSuspense: false
|
||||
},
|
||||
showSupportNotice: false
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
BIN
eng.traineddata
Normal file
BIN
eng.traineddata
Normal file
Binary file not shown.
@@ -36,7 +36,7 @@
|
||||
"test:all": "pnpm test:parallel && pnpm test:sequential",
|
||||
"test:parallel": "pnpm --filter=!server --filter=!ckeditor5-mermaid --filter=!ckeditor5-math --parallel test",
|
||||
"test:sequential": "pnpm --filter=server --filter=ckeditor5-mermaid --filter=ckeditor5-math --sequential test",
|
||||
"typecheck": "tsc --build",
|
||||
"typecheck": "tsx scripts/filter-tsc-output.mts",
|
||||
"dev:format-check": "eslint -c eslint.format.config.mjs .",
|
||||
"dev:format-fix": "eslint -c eslint.format.config.mjs . --fix",
|
||||
"dev:linter-check": "cross-env NODE_OPTIONS=--max_old_space_size=4096 eslint .",
|
||||
|
||||
@@ -144,6 +144,12 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions<KeyboardActi
|
||||
// AI / LLM
|
||||
/** JSON array of configured LLM providers with their API keys */
|
||||
llmProviders: string;
|
||||
|
||||
// OCR options
|
||||
ocrEnabled: boolean;
|
||||
ocrLanguage: string;
|
||||
ocrAutoProcessImages: boolean;
|
||||
ocrMinConfidence: string;
|
||||
}
|
||||
|
||||
export type OptionNames = keyof OptionDefinitions;
|
||||
|
||||
@@ -72,6 +72,7 @@ export interface BlobRow {
|
||||
blobId: string;
|
||||
content: string | Buffer;
|
||||
contentLength: number;
|
||||
textRepresentation?: string | null;
|
||||
dateModified: string;
|
||||
utcDateModified: string;
|
||||
}
|
||||
|
||||
623
pnpm-lock.yaml
generated
623
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
56
scripts/filter-tsc-output.mts
Normal file
56
scripts/filter-tsc-output.mts
Normal file
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Runs `tsc --build` and filters out noisy cascade errors (TS6305).
|
||||
* Numbers each remaining error and prints a summary at the end.
|
||||
*/
|
||||
|
||||
import { execSync } from "child_process";
|
||||
|
||||
const SUPPRESSED_CODES = [ "TS6305" ];
|
||||
const ERROR_LINE_PATTERN = /^.+\(\d+,\d+\): error TS\d+:/;
|
||||
|
||||
let output: string;
|
||||
try {
|
||||
output = execSync("tsc --build", {
|
||||
encoding: "utf-8",
|
||||
stdio: [ "inherit", "pipe", "pipe" ]
|
||||
});
|
||||
} catch (err: unknown) {
|
||||
const execErr = err as { stdout?: string; stderr?: string };
|
||||
output = (execErr.stdout ?? "") + (execErr.stderr ?? "");
|
||||
}
|
||||
|
||||
const lines = output.split(/\r?\n/);
|
||||
const filtered = lines.filter(
|
||||
(line) => !SUPPRESSED_CODES.some((code) => line.includes(code))
|
||||
);
|
||||
|
||||
let errorIndex = 0;
|
||||
const numbered: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
let skipContinuation = false;
|
||||
|
||||
for (const line of filtered) {
|
||||
if (ERROR_LINE_PATTERN.test(line)) {
|
||||
if (seen.has(line)) {
|
||||
skipContinuation = true;
|
||||
continue;
|
||||
}
|
||||
seen.add(line);
|
||||
skipContinuation = false;
|
||||
errorIndex++;
|
||||
numbered.push(`[${errorIndex}] ${line}`);
|
||||
} else if (line.trim()) {
|
||||
// Continuation line (indented context for multi-line errors)
|
||||
if (!skipContinuation) {
|
||||
numbered.push(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (errorIndex > 0) {
|
||||
console.log(numbered.join("\n"));
|
||||
console.log(`\n${errorIndex} error(s) found.`);
|
||||
process.exit(1);
|
||||
} else {
|
||||
console.log("No errors found.");
|
||||
}
|
||||
Reference in New Issue
Block a user