mirror of
https://github.com/zadam/trilium.git
synced 2025-11-11 07:45:51 +01:00
514 lines
18 KiB
TypeScript
514 lines
18 KiB
TypeScript
|
|
/**
 * SQLite Custom Functions Service
 *
 * This service manages custom SQLite functions that enhance search capabilities.
 * Functions are registered with better-sqlite3 to provide native-speed operations
 * directly within SQL queries, enabling efficient search indexing and querying.
 *
 * These functions are used by:
 * - Database triggers for automatic search index maintenance
 * - Direct SQL queries for search operations
 * - Migration scripts for initial data population
 */
|
||
|
|
|
||
|
|
import type { Database } from "better-sqlite3";
|
||
|
|
import log from "../log.js";
|
||
|
|
import { normalize as utilsNormalize, stripTags } from "../utils.js";
|
||
|
|
|
||
|
|
/**
 * Configuration for fuzzy search operations.
 *
 * Declared `as const` so every value is a readonly literal.
 */
const FUZZY_CONFIG = {
    /** Edit-distance threshold applied when a SQL caller does not pass its own. */
    MAX_EDIT_DISTANCE: 2,
    /**
     * Minimum token length for fuzzy matching.
     * NOTE(review): not referenced by the code visible in this file - confirm whether it is still needed.
     */
    MIN_TOKEN_LENGTH: 3,
    /** Performance guard: longest string length for which an exact edit distance is computed. */
    MAX_STRING_LENGTH: 1000
} as const;
|
||
|
|
|
||
|
|
/**
 * Describes one custom SQL function to register on a database connection.
 */
interface SQLiteFunction {
    /** Name under which the function is callable from SQL. */
    name: string;
    /** JavaScript callback invoked with the SQL arguments; its return value becomes the SQL result. */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- SQL arguments and results are dynamically typed
    implementation: (...args: any[]) => any;
    /** Registration flags handed to the database driver together with the callback. */
    options?: {
        /** The same inputs always produce the same output. */
        deterministic?: boolean;
        /** The function accepts a variable number of arguments. */
        varargs?: boolean;
        /** Passed through unchanged to the driver's registration options. */
        directOnly?: boolean;
    };
}
|
||
|
|
|
||
|
|
/**
 * Manages registration and lifecycle of custom SQLite functions.
 *
 * The service is a singleton obtained through {@link SqliteFunctionsService.getInstance}.
 * It owns the list of function definitions (`normalize_text`, `edit_distance`,
 * `regex_match`, `tokenize_text`, `strip_html`, `fuzzy_match`) and registers them
 * on a better-sqlite3 connection on request.
 */
export class SqliteFunctionsService {
    private static instance: SqliteFunctionsService | null = null;

    /**
     * Replacement text for every HTML entity name decoded by `stripHtml`.
     * Keys are the text between the leading ampersand and the trailing semicolon.
     */
    private static readonly HTML_ENTITIES: Readonly<Record<string, string>> = {
        lt: "<",
        gt: ">",
        amp: "&",
        quot: '"',
        apos: "'",
        "#39": "'",
        nbsp: " "
    };

    /** Matches one decodable entity; capture group 1 is the key into `HTML_ENTITIES`. */
    private static readonly HTML_ENTITY_PATTERN = /&(lt|gt|amp|quot|apos|nbsp|#39);/g;

    /** Separators used by `tokenizeText`; underscore is deliberately absent so snake_case survives the split. */
    private static readonly TOKEN_SEPARATORS = /[\s\n\r\t,;.!?()[\]{}"'`~@#$%^&*+=|\\/<>:-]+/;

    /** True once the functions were registered on at least one connection since the last `unregister()`. */
    private registered = false;

    /**
     * Connections that already carry the functions. Tracked per connection because
     * custom functions belong to a single connection: a second (or re-opened)
     * connection needs its own registration.
     */
    private registeredConnections = new WeakSet<Database>();

    private functions: SQLiteFunction[] = [];

    private constructor() {
        this.initializeFunctions();
    }

    /**
     * Get the singleton instance of the service, creating it on first use.
     */
    static getInstance(): SqliteFunctionsService {
        if (!SqliteFunctionsService.instance) {
            SqliteFunctionsService.instance = new SqliteFunctionsService();
        }
        return SqliteFunctionsService.instance;
    }

    /**
     * Build the list of custom function definitions.
     *
     * Every implementation is bound to this instance because the driver invokes
     * the callbacks without a receiver. Functions taking an optional third SQL
     * argument are registered with `varargs: true`.
     */
    private initializeFunctions(): void {
        this.functions = [
            {
                name: "normalize_text",
                implementation: this.normalizeText.bind(this),
                options: { deterministic: true, varargs: false }
            },
            {
                name: "edit_distance",
                implementation: this.editDistance.bind(this),
                options: { deterministic: true, varargs: true }
            },
            {
                name: "regex_match",
                implementation: this.regexMatch.bind(this),
                options: { deterministic: true, varargs: true }
            },
            {
                name: "tokenize_text",
                implementation: this.tokenizeText.bind(this),
                options: { deterministic: true, varargs: false }
            },
            {
                name: "strip_html",
                implementation: this.stripHtml.bind(this),
                options: { deterministic: true, varargs: false }
            },
            {
                name: "fuzzy_match",
                implementation: this.fuzzyMatch.bind(this),
                options: { deterministic: true, varargs: true }
            }
        ];
    }

    /**
     * Register all custom functions with the database connection.
     *
     * Calling this again for a connection that already has the functions is a
     * no-op returning `true`; a different connection is registered separately.
     * A failure of one function is logged and does not stop the others.
     *
     * @param db The better-sqlite3 database connection
     * @returns true if at least one function was registered (or the connection
     *          was already set up), false otherwise
     */
    registerFunctions(db: Database): boolean {
        if (this.registeredConnections.has(db)) {
            log.info("SQLite custom functions already registered");
            return true;
        }

        try {
            // Probe the connection first: this throws if the database is closed.
            db.pragma("user_version");

            log.info("Registering SQLite custom functions...");

            let successCount = 0;
            for (const func of this.functions) {
                try {
                    db.function(func.name, func.options ?? {}, func.implementation);
                    log.info(`Registered SQLite function: ${func.name}`);
                    successCount++;
                } catch (error) {
                    // Keep going so one bad definition does not block the rest.
                    log.error(`Failed to register SQLite function ${func.name}: ${error}`);
                }
            }

            if (successCount === 0) {
                log.error("No SQLite functions could be registered");
                return false;
            }

            this.registered = true;
            this.registeredConnections.add(db);
            log.info(`SQLite custom functions registration completed (${successCount}/${this.functions.length})`);
            return true;
        } catch (error) {
            log.error(`Failed to register SQLite custom functions: ${error}`);
            return false;
        }
    }

    /**
     * Reset the registration state (for cleanup/testing).
     *
     * Note: better-sqlite3 doesn't provide a way to unregister functions, so this
     * only forgets which connections were set up; the functions stay callable on them.
     */
    unregister(): void {
        this.registered = false;
        this.registeredConnections = new WeakSet<Database>();
    }

    /**
     * Check if functions are currently registered.
     *
     * @returns true if a registration succeeded since the last `unregister()`
     */
    isRegistered(): boolean {
        return this.registered;
    }

    // ===== Function Implementations =====

    /**
     * Normalize text using the shared `normalize()` helper from utils so that SQL
     * and application code agree on the normalized form.
     *
     * @param text Text to normalize
     * @returns Normalized text, or an empty string for empty / non-string input
     */
    private normalizeText(text: string | null | undefined): string {
        if (!text || typeof text !== "string") {
            return "";
        }
        return utilsNormalize(text);
    }

    /**
     * Turn a caller-supplied edit-distance threshold into a non-negative integer.
     *
     * @param value Raw third SQL argument (may be absent or of any type)
     * @returns The floored value clamped to 0 or more, or the configured default
     *          when the value is not a finite number
     */
    private static sanitizeMaxDistance(value: unknown): number {
        if (typeof value !== "number" || !Number.isFinite(value)) {
            return FUZZY_CONFIG.MAX_EDIT_DISTANCE;
        }
        return Math.max(0, Math.floor(value));
    }

    /**
     * Calculate the Levenshtein edit distance between two strings (compared by
     * UTF-16 code unit), using two rolling rows and early termination.
     *
     * SQLite will pass 2 or 3 arguments:
     * - 2 args: str1, str2 (uses default maxDistance)
     * - 3 args: str1, str2, maxDistance
     *
     * Non-string inputs are treated as empty strings. The common prefix and suffix
     * are stripped before the computation (this never changes the distance), so
     * long strings with a small localized difference are still measured exactly.
     * If the differing part of either string is still longer than
     * `FUZZY_CONFIG.MAX_STRING_LENGTH`, the distance is not computed and the
     * result is reported as exceeded.
     *
     * @returns Edit distance, or maxDistance + 1 if it exceeds maxDistance or
     *          could not be computed
     */
    private editDistance(...args: unknown[]): number {
        const str1 = typeof args[0] === "string" ? args[0] : "";
        const str2 = typeof args[1] === "string" ? args[1] : "";
        const maxDistance = SqliteFunctionsService.sanitizeMaxDistance(args[2]);
        const exceeded = maxDistance + 1;

        if (str1 === str2) {
            return 0;
        }

        // The length difference is a lower bound of the distance.
        if (Math.abs(str1.length - str2.length) > maxDistance) {
            return exceeded;
        }

        // Strip the shared prefix and suffix; only the middle parts can differ.
        let start = 0;
        let end1 = str1.length;
        let end2 = str2.length;
        while (start < end1 && start < end2 && str1.charCodeAt(start) === str2.charCodeAt(start)) {
            start++;
        }
        while (end1 > start && end2 > start && str1.charCodeAt(end1 - 1) === str2.charCodeAt(end2 - 1)) {
            end1--;
            end2--;
        }
        const left = str1.slice(start, end1);
        const right = str2.slice(start, end2);
        const leftLength = left.length;
        const rightLength = right.length;

        // One side fully consumed: the rest of the other side is pure insertions.
        if (leftLength === 0) {
            return rightLength <= maxDistance ? rightLength : exceeded;
        }
        if (rightLength === 0) {
            return leftLength <= maxDistance ? leftLength : exceeded;
        }

        // Performance guard: never run the quadratic loop on very long inputs.
        // Reporting "exceeded" is the safe answer; claiming a small distance here
        // would produce false matches.
        if (leftLength > FUZZY_CONFIG.MAX_STRING_LENGTH || rightLength > FUZZY_CONFIG.MAX_STRING_LENGTH) {
            return exceeded;
        }

        let previousRow: number[] = Array.from({ length: rightLength + 1 }, (_, index) => index);
        let currentRow: number[] = new Array<number>(rightLength + 1).fill(0);

        for (let i = 1; i <= leftLength; i++) {
            currentRow[0] = i;
            let minInRow = i;

            for (let j = 1; j <= rightLength; j++) {
                const cost = left.charCodeAt(i - 1) === right.charCodeAt(j - 1) ? 0 : 1;
                const cell = Math.min(
                    previousRow[j] + 1, // deletion
                    currentRow[j - 1] + 1, // insertion
                    previousRow[j - 1] + cost // substitution
                );
                currentRow[j] = cell;
                if (cell < minInRow) {
                    minInRow = cell;
                }
            }

            // Row minima never decrease, so the final distance cannot recover.
            if (minInRow > maxDistance) {
                return exceeded;
            }

            [previousRow, currentRow] = [currentRow, previousRow];
        }

        const result = previousRow[rightLength];
        return result <= maxDistance ? result : exceeded;
    }

    /**
     * Test if a string matches a JavaScript regular expression.
     *
     * SQLite will pass 2 or 3 arguments:
     * - 2 args: text, pattern (uses default flags 'i')
     * - 3 args: text, pattern, flags
     *
     * Flags that are not a string made only of the letters g, i, m, s, u, y fall
     * back to 'i'. An empty flags string is honoured (case-sensitive match).
     *
     * @returns 1 if match, 0 if no match or if text/pattern is empty or missing,
     *          null if text/pattern is not a string or the pattern is invalid
     */
    private regexMatch(...args: unknown[]): number | null {
        const text = args[0];
        const pattern = args[1];

        if (!text || !pattern) {
            return 0;
        }
        if (typeof text !== "string" || typeof pattern !== "string") {
            return null;
        }

        const requestedFlags = args.length > 2 ? args[2] : "i";
        const flags = typeof requestedFlags === "string" && /^[gimsuy]*$/.test(requestedFlags)
            ? requestedFlags
            : "i"; // Fall back to case-insensitive

        try {
            // A fresh RegExp per call, so 'g'/'y' cannot carry lastIndex between rows.
            return new RegExp(pattern, flags).test(text) ? 1 : 0;
        } catch (error) {
            log.error(`Invalid regex pattern in SQL: ${pattern} - ${error}`);
            return null;
        }
    }

    /**
     * Tokenize text into searchable, lower-cased, de-duplicated words.
     *
     * Text is split on whitespace and punctuation (apostrophes and quotes
     * included). For every resulting token the output contains, in this order:
     * the whole token, then its snake_case parts, then the camelCase parts of
     * each of those.
     *
     * @param text Text to tokenize
     * @returns JSON array string of tokens ('[]' for empty / non-string input)
     */
    private tokenizeText(text: string | null | undefined): string {
        if (!text || typeof text !== "string") {
            return "[]";
        }

        try {
            // A Set keeps first-seen order while dropping duplicates.
            const collected = new Set<string>();
            const rawTokens = text
                .split(SqliteFunctionsService.TOKEN_SEPARATORS)
                .filter((token) => token.length > 0);

            for (const token of rawTokens) {
                collected.add(token.toLowerCase());

                const snakeParts = token.split("_").filter((part) => part.length > 0);
                if (snakeParts.length > 1) {
                    for (const snakePart of snakeParts) {
                        collected.add(snakePart.toLowerCase());
                        this.addCamelCaseParts(collected, snakePart);
                    }
                } else {
                    // No snake_case: look for camelCase in the token as written.
                    this.addCamelCaseParts(collected, token);
                }
            }

            return JSON.stringify(Array.from(collected));
        } catch (error) {
            log.error(`Error tokenizing text in SQL: ${error}`);
            return "[]";
        }
    }

    /**
     * Add the lower-cased camelCase parts of `source` to `target`.
     */
    private addCamelCaseParts(target: Set<string>, source: string): void {
        for (const part of this.splitCamelCase(source)) {
            if (part.length > 0) {
                target.add(part.toLowerCase());
            }
        }
    }

    /**
     * Helper method to split camelCase strings.
     *
     * Splits on a lowercase-to-uppercase transition and before the last capital
     * of an uppercase run that is followed by a lowercase letter
     * (e.g. "XMLParser" -> ["XML", "Parser"]). Only ASCII letters are considered.
     *
     * @param str String to split
     * @returns Array of parts
     */
    private splitCamelCase(str: string): string[] {
        return str.split(/(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])/);
    }

    /**
     * Strip HTML tags from content.
     *
     * Removes script and style elements including their content, strips the
     * remaining tags, decodes a small set of entities (lt, gt, amp, quot, apos,
     * nbsp and the numeric apostrophe #39) and collapses whitespace.
     *
     * @param html HTML content
     * @returns Plain text without HTML tags; the original input if processing throws
     */
    private stripHtml(html: string | null | undefined): string {
        if (!html || typeof html !== "string") {
            return "";
        }

        try {
            // Script/style content must go before the tags are stripped, otherwise
            // their source text would be left behind as plain text.
            let text = html
                .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "")
                .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, "");

            text = stripTags(text);

            // Decode in a single pass so an escaped ampersand followed by an entity
            // name is decoded once only, not twice.
            text = text.replace(
                SqliteFunctionsService.HTML_ENTITY_PATTERN,
                (entity: string, name: string) => SqliteFunctionsService.HTML_ENTITIES[name] ?? entity
            );

            // Collapse whitespace runs to one space. Leading/trailing space is kept
            // on purpose, it may come from a decoded non-breaking space.
            return text.replace(/\s+/g, " ");
        } catch (error) {
            log.error(`Error stripping HTML in SQL: ${error}`);
            return html; // Return original on error
        }
    }

    /**
     * Fuzzy match with configurable edit distance.
     *
     * Both inputs are lower-cased. A substring hit counts as a match; otherwise
     * each whitespace-separated word of the haystack is compared to the needle
     * by edit distance.
     *
     * SQLite will pass 2 or 3 arguments:
     * - 2 args: needle, haystack (uses default maxDistance)
     * - 3 args: needle, haystack, maxDistance
     *
     * @returns 1 if match found, 0 otherwise (including empty or non-string input)
     */
    private fuzzyMatch(...args: unknown[]): number {
        const needle = args[0];
        const haystack = args[1];

        if (typeof needle !== "string" || typeof haystack !== "string" || needle === "" || haystack === "") {
            return 0;
        }

        const maxDistance = SqliteFunctionsService.sanitizeMaxDistance(args[2]);
        const normalizedNeedle = needle.toLowerCase();
        const normalizedHaystack = haystack.toLowerCase();

        // Cheap exact check first.
        if (normalizedHaystack.includes(normalizedNeedle)) {
            return 1;
        }

        const words = normalizedHaystack.split(/\s+/).filter((word) => word.length > 0);
        for (const word of words) {
            // Skip words whose length alone rules out a match.
            if (Math.abs(word.length - normalizedNeedle.length) > maxDistance) {
                continue;
            }
            if (this.editDistance(normalizedNeedle, word, maxDistance) <= maxDistance) {
                return 1;
            }
        }

        return 0;
    }
}
|
||
|
|
|
||
|
|
/**
 * Singleton instance getter.
 *
 * @returns The instance returned by `SqliteFunctionsService.getInstance()`
 */
export function getSqliteFunctionsService(): SqliteFunctionsService {
    return SqliteFunctionsService.getInstance();
}
|
||
|
|
|
||
|
|
/**
 * Initialize SQLite custom functions with the given database connection.
 * This should be called once during application startup after the database is opened.
 *
 * Delegates to `registerFunctions()` of the service returned by
 * `getSqliteFunctionsService()`.
 *
 * @param db The better-sqlite3 database connection
 * @returns true if successful, false otherwise
 */
export function initializeSqliteFunctions(db: Database): boolean {
    return getSqliteFunctionsService().registerFunctions(db);
}
|