feat(search): also limit note content that can be searched, but keep searchability of titles

feat(quick_search): remove some old variables that are no longer used now
feat(quick_search): just fuzzy match note titles for larger notes, while still matching on exact strings
2025-10-31 18:36:30 +01:00 · 2025-08-28 18:56:06 +00:00 · 2025-08-27 22:33:38 +00:00 · 2025-08-27 21:11:44 +00:00
2 changed files with 77 additions and 27 deletions
--- a/apps/server/src/services/search/expressions/note_content_fulltext.ts
+++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts
@@ -120,6 +120,74 @@ class NoteContentFulltextExp extends Expression {
        }
        content = processedContent;
        // Check note size and determine search strategy
        const contentSize = content.length;
        const isExtremeNote = contentSize > FUZZY_SEARCH_CONFIG.EXTREME_NOTE_SIZE_THRESHOLD;
        const isLargeNote = contentSize > FUZZY_SEARCH_CONFIG.LARGE_NOTE_SIZE_THRESHOLD;
        const isFuzzyOperator = this.operator === "~=" || this.operator === "~*";
        // For extremely large notes (>5MB), only search title regardless of operator
        if (isExtremeNote) {
            const note = becca.notes[noteId];
            const title = note.title || "";
            log.info(`Note ${noteId} is ${(contentSize / (1024 * 1024)).toFixed(1)}MB - searching title only due to extreme size`);
            // For fuzzy operators, use fuzzy matching on title
            // For other operators, use exact/wildcard matching on title
            const normalizedTitle = normalizeSearchText(title);
            let titleMatches = false;
            if (isFuzzyOperator) {
                titleMatches = this.tokens.some(token => 
                    this.fuzzyMatchToken(normalizeSearchText(token), normalizedTitle)
                );
            } else {
                // Apply the operator to title matching
                titleMatches = this.tokens.every(token => {
                    const normalizedToken = normalizeSearchText(token);
                    if (this.operator === "*=*") return normalizedTitle.includes(normalizedToken);
                    if (this.operator === "=") return normalizedTitle === normalizedToken;
                    if (this.operator === "!=") return normalizedTitle !== normalizedToken;
                    if (this.operator === "*=") return normalizedTitle.endsWith(normalizedToken);
                    if (this.operator === "=*") return normalizedTitle.startsWith(normalizedToken);
                    return false;
                });
            }
            if (titleMatches) {
                resultNoteSet.add(becca.notes[noteId]);
            }
            return content;
        }
        // For large notes (250KB-5MB) with fuzzy operators, use optimized strategy
        if (isLargeNote && isFuzzyOperator) {
            const note = becca.notes[noteId];
            const title = note.title || "";
            log.info(`Note ${noteId} is ${(contentSize / 1024).toFixed(1)}KB - using optimized search (fuzzy on title, exact on content)`);
            // Perform fuzzy search on title
            const titleMatches = this.fuzzyMatchToken(normalizeSearchText(this.tokens[0]), normalizeSearchText(title));
            // Perform exact match on content for all tokens
            const contentMatches = this.tokens.every(token => {
                const normalizedToken = normalizeSearchText(token);
                const normalizedContent = normalizeSearchText(content);
                return normalizedContent.includes(normalizedToken);
            });
            // Add to results if either title matches with fuzzy or content matches exactly
            if (titleMatches || contentMatches) {
                resultNoteSet.add(becca.notes[noteId]);
            }
            return content;
        }
        // Standard search logic for non-large notes or non-fuzzy operators
        if (this.tokens.length === 1) {
            const [token] = this.tokens;
@@ -250,11 +318,6 @@ class NoteContentFulltextExp extends Expression {
            return false;
        }
        // Warn about large word counts but still attempt matching
        if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
            console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
        }
        // Find positions of each token
        const tokenPositions: number[][] = this.tokens.map(token => {
            const normalizedToken = normalizeSearchText(token);
--- a/apps/server/src/services/search/utils/text_utils.ts
+++ b/apps/server/src/services/search/utils/text_utils.ts
@@ -14,15 +14,13 @@ export const FUZZY_SEARCH_CONFIG = {
    MAX_EDIT_DISTANCE: 2,
    // Maximum proximity distance for phrase matching (in words)
    MAX_PHRASE_PROXIMITY: 10,
    // Large note threshold - above this, use optimized search strategy (fuzzy matching on title only, exact matching on content)
    LARGE_NOTE_SIZE_THRESHOLD: 250000, // 250KB - switch to title-only fuzzy for performance
    // Extreme note threshold - above this, skip content search entirely
    EXTREME_NOTE_SIZE_THRESHOLD: 5 * 1024 * 1024, // 5MB - title search only
    // Absolute hard limits for extreme cases - only to prevent system crashes
    ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
    ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
    // Performance warning thresholds - inform user but still attempt search
    PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
    PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
    // Progressive processing thresholds for very large content
    PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
    PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
    // Performance thresholds
    EARLY_TERMINATION_THRESHOLD: 3,
 } as const;
@@ -204,7 +202,8 @@ export function validateFuzzySearchTokens(tokens: string[], operator: string): {
 /**
 * Validates and preprocesses content for search operations.
- * Philosophy: Try to search everything! Only block truly extreme cases that could crash the system.
+ * Only blocks truly extreme cases that could crash the system.
 * Large notes (above LARGE_NOTE_SIZE_THRESHOLD, 250KB) are handled with an optimized search strategy instead.
 * 
 * @param content The content to validate and preprocess
 * @param noteId The note ID (for logging purposes)
@@ -222,12 +221,7 @@ export function validateAndPreprocessContent(content: string, noteId?: string):
        return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
    }
-    // Warn about very large content but still process it
+    // For word count, only block truly extreme cases
    if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
        console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
    }
    // For word count, be even more permissive - only block truly extreme cases
    const wordCount = content.split(/\s+/).length;
    if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
        console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
@@ -235,15 +229,8 @@ export function validateAndPreprocessContent(content: string, noteId?: string):
        return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
    }
-    // Warn about high word counts but still process them
+    // Notes above LARGE_NOTE_SIZE_THRESHOLD (250KB) will use optimized search strategy
-    if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
+    // (handled in note_content_fulltext.ts)
        console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
    }
    // Progressive processing warning for very large content
    if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
        console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
    }
    return content;
 }
Author	SHA1	Message	Date
perf3ct	912bc61730	feat(search): also limit note content that can be searched, but keep searchability of titles	2025-08-28 18:56:06 +00:00
perf3ct	93e8459d4b	feat(quick_search): remove some old variables that are no longer used now	2025-08-27 22:33:38 +00:00
perf3ct	6c26fa709e	feat(quick_search): just fuzzy match note titles for larger notes, while still matching on exact strings	2025-08-27 21:11:44 +00:00