feat(search): implement additional weights for search_results, normalize text as well

2025-11-01 19:05:59 +01:00 · 2025-08-02 23:56:23 +00:00
parent 7a1ec266ad
commit 4db04519bd
2 changed files with 354 additions and 19 deletions
--- a/apps/server/src/services/search/search_result.ts
+++ b/apps/server/src/services/search/search_result.ts
@@ -2,6 +2,27 @@

 import beccaService from "../../becca/becca_service.js";
 import becca from "../../becca/becca.js";
+import { 
+    normalizeSearchText, 
+    calculateOptimizedEditDistance, 
+    FUZZY_SEARCH_CONFIG 
+} from "./utils/text_utils.js";
+
+// Score contributions used by SearchResult when ranking a note against a query
+const SCORE_WEIGHTS = {
+    NOTE_ID_EXACT_MATCH: 1000, // lowercased note ID equals the full-text query
+    TITLE_EXACT_MATCH: 2000, // normalized title equals the normalized query
+    TITLE_PREFIX_MATCH: 500, // normalized title starts with the normalized query
+    TITLE_WORD_MATCH: 300, // query is a space-delimited word of the title; also the base of the fuzzy title score
+    TOKEN_EXACT_MATCH: 4, // chunk equals the token; all four TOKEN_* multipliers are scaled by token.length and a factor
+    TOKEN_PREFIX_MATCH: 2, // chunk starts with the token
+    TOKEN_CONTAINS_MATCH: 1, // chunk contains the token
+    TOKEN_FUZZY_MATCH: 0.5, // chunk is within the edit-distance limit of the token; reduced as the distance grows
+    TITLE_FACTOR: 2.0, // factor for tokens matched against the note title
+    PATH_FACTOR: 0.3, // factor for tokens matched against the note path title
+    HIDDEN_NOTE_PENALTY: 3 // divisor applied to the final score of notes in the hidden subtree
+} as const;
+

 class SearchResult {
    notePathArray: string[];
@@ -27,49 +48,92 @@ class SearchResult {
        this.score = 0;

        const note = becca.notes[this.noteId];
-        const normalizedQuery = fulltextQuery.toLowerCase();
-        const normalizedTitle = note.title.toLowerCase();
+        const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase());
+        const normalizedTitle = normalizeSearchText(note.title.toLowerCase());

        // Note ID exact match, much higher score
        if (note.noteId.toLowerCase() === fulltextQuery) {
-            this.score += 1000;
+            this.score += SCORE_WEIGHTS.NOTE_ID_EXACT_MATCH;
        }

-        // Title matching scores, make sure to always win
+        // Title matching scores with fuzzy matching support
        if (normalizedTitle === normalizedQuery) {
-            this.score += 2000; // Increased from 1000 to ensure exact matches always win
+            this.score += SCORE_WEIGHTS.TITLE_EXACT_MATCH;
        } else if (normalizedTitle.startsWith(normalizedQuery)) {
-            this.score += 500; // Increased to give more weight to prefix matches
-        } else if (normalizedTitle.includes(` ${normalizedQuery} `) || normalizedTitle.startsWith(`${normalizedQuery} `) || normalizedTitle.endsWith(` ${normalizedQuery}`)) {
-            this.score += 300; // Increased to better distinguish word matches
+            this.score += SCORE_WEIGHTS.TITLE_PREFIX_MATCH;
+        } else if (this.isWordMatch(normalizedTitle, normalizedQuery)) {
+            this.score += SCORE_WEIGHTS.TITLE_WORD_MATCH;
+        } else {
+            // Try fuzzy matching for typos
+            const fuzzyScore = this.calculateFuzzyTitleScore(normalizedTitle, normalizedQuery);
+            this.score += fuzzyScore;
        }

-        // Add scores for partial matches with adjusted weights
-        this.addScoreForStrings(tokens, note.title, 2.0); // Increased to give more weight to title matches
-        this.addScoreForStrings(tokens, this.notePathTitle, 0.3); // Reduced to further de-emphasize path matches
+        // Add scores for token matches
+        this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR);
+        this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR);

        if (note.isInHiddenSubtree()) {
-            this.score = this.score / 3; // Increased penalty for hidden notes
+            this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
        }
    }

    addScoreForStrings(tokens: string[], str: string, factor: number) {
-        const chunks = str.toLowerCase().split(" ");
+        const normalizedStr = normalizeSearchText(str.toLowerCase());
+        const chunks = normalizedStr.split(" ");

        let tokenScore = 0;
        for (const chunk of chunks) {
            for (const token of tokens) {
-                if (chunk === token) {
-                    tokenScore += 4 * token.length * factor;
-                } else if (chunk.startsWith(token)) {
-                    tokenScore += 2 * token.length * factor;
-                } else if (chunk.includes(token)) {
-                    tokenScore += token.length * factor;
+                const normalizedToken = normalizeSearchText(token.toLowerCase());
+                
+                if (chunk === normalizedToken) {
+                    tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
+                } else if (chunk.startsWith(normalizedToken)) {
+                    tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor;
+                } else if (chunk.includes(normalizedToken)) {
+                    tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor;
+                } else {
+                    // Try fuzzy matching for individual tokens
+                    const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+                    if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE && normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
+                        const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+                        tokenScore += fuzzyWeight * token.length * factor;
+                    }
                }
            }
        }
        this.score += tokenScore;
    }
+
+
+    /**
+     * True when query is a space-delimited word inside, at the start of, or at the end of a longer text
+     */
+    private isWordMatch(text: string, query: string): boolean {
+        const atStart = text.startsWith(`${query} `);
+        const atEnd = text.endsWith(` ${query}`);
+        return atStart || atEnd || text.includes(` ${query} `);
+    }
+
+    /**
+     * Scores a near-miss between the whole title and the query (0 when they are not close enough)
+     */
+    private calculateFuzzyTitleScore(title: string, query: string): number {
+        const distance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+        const relativeDistance = distance / Math.max(title.length, query.length);
+
+        // Reject short queries, too many edits, or edits touching more than 30% of the longer string
+        const queryTooShort = query.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH;
+        const tooManyEdits = distance > FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE;
+        if (queryTooShort || tooManyEdits || !(relativeDistance <= 0.3)) {
+            return 0;
+        }
+
+        // A fuzzy hit earns 70% of a word match, scaled by similarity
+        return SCORE_WEIGHTS.TITLE_WORD_MATCH * (1 - relativeDistance) * 0.7;
+    }
+
 }

 export default SearchResult;
--- a/apps/server/src/services/search/utils/text_utils.ts
+++ b/apps/server/src/services/search/utils/text_utils.ts
@@ -0,0 +1,271 @@
+"use strict";
+
+import { normalize } from "../../utils.js";
+
+/**
+ * Shared text processing utilities for search functionality
+ */
+
+// Shared limits for fuzzy matching and for bounding the content that search processes
+export const FUZZY_SEARCH_CONFIG = {
+    // Tokens shorter than this are rejected for the fuzzy operators (~=, ~*) to prevent false positives
+    MIN_FUZZY_TOKEN_LENGTH: 3,
+    // Default edit-distance cap used by calculateOptimizedEditDistance and fuzzyMatchWord
+    MAX_EDIT_DISTANCE: 2,
+    // Maximum proximity distance for phrase matching (in words); not referenced in this file
+    MAX_PHRASE_PROXIMITY: 10,
+    // Content limits for memory protection, enforced by validateAndPreprocessContent
+    MAX_CONTENT_SIZE: 50 * 1024, // 51200; compared against string length, i.e. UTF-16 code units rather than bytes
+    MAX_WORD_COUNT: 10000, // whitespace-separated words
+    // Performance threshold; not referenced in this file - presumably consumed elsewhere, verify before changing
+    EARLY_TERMINATION_THRESHOLD: 3,
+} as const;
+
+/**
+ * Central text normalization entry point for the search components.
+ * Falsy or non-string input yields an empty string; everything else is passed
+ * unchanged to the shared `normalize` helper from utils, so the exact folding
+ * rules live there rather than in this file.
+ *
+ * Expected results, assuming `normalize` strips diacritics and lowercases
+ * (as the original description states; confirm against utils):
+ * - "café" -> "cafe"
+ * - "naïve" -> "naive"
+ * - "HELLO WORLD" -> "hello world"
+ *
+ * @param text The text to normalize
+ * @returns The normalized text, or '' when text is empty or not a string
+ */
+export function normalizeSearchText(text: string): string {
+    // A non-empty string is the only input worth delegating
+    const isUsable = typeof text === 'string' && text.length > 0;
+    // Delegate to the shared helper so all search code normalizes the same way
+    return isUsable ? normalize(text) : '';
+}
+
+/**
+ * Levenshtein edit distance between two strings, capped at maxDistance.
+ * Keeps two DP rows in memory and stops once the cap is provably exceeded. A shared prefix
+ * and suffix never change the distance, so they are stripped before any size guard applies.
+ * @param str1 First string
+ * @param str2 Second string
+ * @param maxDistance Maximum allowed distance (for early termination)
+ * @returns The edit distance between the strings, or maxDistance + 1 if exceeded
+ * @throws Error if an argument is not a string or maxDistance is not a non-negative integer
+ */
+export function calculateOptimizedEditDistance(str1: string, str2: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): number {
+    if (typeof str1 !== 'string' || typeof str2 !== 'string') {
+        throw new Error('Both arguments must be strings');
+    }
+    if (maxDistance < 0 || !Number.isInteger(maxDistance)) {
+        throw new Error('maxDistance must be a non-negative integer');
+    }
+    // The length gap is a lower bound on the distance and is unaffected by the trimming below
+    if (Math.abs(str1.length - str2.length) > maxDistance) {
+        return maxDistance + 1;
+    }
+
+    // Strip the common prefix and suffix: only the differing middle needs the DP
+    const shortest = Math.min(str1.length, str2.length);
+    let prefix = 0;
+    while (prefix < shortest && str1[prefix] === str2[prefix]) {
+        prefix++;
+    }
+    let suffix = 0;
+    while (suffix < shortest - prefix && str1[str1.length - 1 - suffix] === str2[str2.length - 1 - suffix]) {
+        suffix++;
+    }
+    const len1 = str1.length - prefix - suffix;
+    const len2 = str2.length - prefix - suffix;
+
+    // One side is used up: the distance is exactly what remains of the other side
+    if (len1 === 0 || len2 === 0) {
+        const remaining = Math.max(len1, len2);
+        return remaining <= maxDistance ? remaining : maxDistance + 1;
+    }
+
+    // Performance guard for very long differing regions. Their true distance is unknown, so report
+    // "exceeded" instead of a length-based guess that would call two different strings identical.
+    const maxStringLength = 1000;
+    if (len1 > maxStringLength || len2 > maxStringLength) {
+        return maxDistance + 1;
+    }
+
+    let previousRow = Array.from({ length: len2 + 1 }, (_, i) => i);
+    let currentRow = new Array(len2 + 1);
+
+    for (let i = 1; i <= len1; i++) {
+        currentRow[0] = i;
+        let minInRow = i;
+        for (let j = 1; j <= len2; j++) {
+            const cost = str1[prefix + i - 1] === str2[prefix + j - 1] ? 0 : 1;
+            currentRow[j] = Math.min(
+                previousRow[j] + 1,        // deletion
+                currentRow[j - 1] + 1,     // insertion
+                previousRow[j - 1] + cost  // substitution
+            );
+            minInRow = Math.min(minInRow, currentRow[j]);
+        }
+        // Row minima never decrease, so once one exceeds the cap the result must too
+        if (minInRow > maxDistance) {
+            return maxDistance + 1;
+        }
+        [previousRow, currentRow] = [currentRow, previousRow];
+    }
+
+    const result = previousRow[len2];
+    return result <= maxDistance ? result : maxDistance + 1;
+}
+
+/**
+ * Checks that a token list is usable with the given search operator.
+ *
+ * Checks run in this order and the first failure is reported:
+ * 1. operator is a non-empty string
+ * 2. tokens is an array
+ * 3. tokens has at least one entry
+ * 4. every token is a non-null string
+ * 5. no token is empty or whitespace-only
+ * 6. fuzzy operators (`~=`, `~*`) only: every token has at least MIN_FUZZY_TOKEN_LENGTH characters
+ * 7. fuzzy operators only: no token is longer than 100 characters
+ *
+ * @param tokens Array of search tokens
+ * @param operator The search operator being used
+ * @returns Validation result with success status and error message
+ */
+export function validateFuzzySearchTokens(tokens: string[], operator: string): { isValid: boolean; error?: string } {
+    // Structural rules, evaluated lazily and in order so later rules can rely on earlier ones
+    const structuralRules: Array<{ broken: () => boolean; error: string }> = [
+        {
+            broken: () => !operator || typeof operator !== 'string',
+            error: 'Invalid operator: operator must be a non-empty string'
+        },
+        {
+            broken: () => !Array.isArray(tokens),
+            error: 'Invalid tokens: tokens must be an array'
+        },
+        {
+            broken: () => tokens.length === 0,
+            error: 'Invalid tokens: at least one token is required'
+        },
+        // The type rule precedes the trim() rule so non-string entries never reach string methods
+        {
+            broken: () => tokens.some(token => token == null || typeof token !== 'string'),
+            error: 'Invalid tokens: all tokens must be non-null strings'
+        },
+        {
+            broken: () => tokens.some(token => token.trim().length === 0),
+            error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
+        }
+    ];
+
+    // Report the first broken rule
+    for (const rule of structuralRules) {
+        if (rule.broken()) {
+            return { isValid: false, error: rule.error };
+        }
+    }
+
+    // Length bounds are enforced for the fuzzy operators only
+    const isFuzzyOperator = operator === '~=' || operator === '~*';
+    if (!isFuzzyOperator) {
+        return { isValid: true };
+    }
+
+    // Short tokens are listed in full in the error message
+    const minTokenLength = FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH;
+    const tooShort = tokens.filter(token => token.length < minTokenLength);
+    if (tooShort.length > 0) {
+        return {
+            isValid: false,
+            error: `Fuzzy search operators (~=, ~*) require tokens of at least ${minTokenLength} characters. Invalid tokens: ${tooShort.join(', ')}`
+        };
+    }
+
+    // Guard against tokens long enough to make matching expensive; show a 20-character preview of each
+    const maxTokenLength = 100;
+    const tooLong = tokens.filter(token => token.length > maxTokenLength);
+    if (tooLong.length > 0) {
+        const previews = tooLong.map(t => t.substring(0, 20) + '...').join(', ');
+        return {
+            isValid: false,
+            error: `Tokens are too long (max ${maxTokenLength} characters). Long tokens: ${previews}`
+        };
+    }
+
+    return { isValid: true };
+}
+
+/**
+ * Bounds content before search: caps it at MAX_CONTENT_SIZE characters, then at
+ * MAX_WORD_COUNT whitespace-separated words. Over-limit content is truncated and a
+ * warning is logged; it is never rejected.
+ * @param content The content to validate and preprocess
+ * @param noteId The note ID (for logging purposes)
+ * @returns The possibly truncated content, or null when content is empty or not a string
+ */
+export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
+    if (!content || typeof content !== 'string') {
+        return null;
+    }
+
+    let bounded = content;
+    if (bounded.length > FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE) {
+        console.warn(`Content size exceeds limit for note ${noteId || 'unknown'}: ${content.length} bytes`);
+        bounded = bounded.substring(0, FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE);
+    }
+
+    // Check the size-bounded text as well, so a truncated result still honors the word limit
+    const words = bounded.split(/\s+/);
+    if (words.length > FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT) {
+        console.warn(`Word count exceeds limit for note ${noteId || 'unknown'}: ${words.length} words`);
+        return words.slice(0, FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT).join(' ');
+    }
+
+    return bounded;
+}
+
+/**
+ * Decides whether a word is an acceptable fuzzy match for a search token.
+ * A word containing the token as a substring always matches. Otherwise the token must be
+ * at least 4 characters long, the two lengths may differ by no more than maxDistance and
+ * no more than 2, and the edit distance must not exceed maxDistance.
+ *
+ * @param token The search token (should be normalized)
+ * @param word The word to match against (should be normalized)
+ * @param maxDistance Maximum allowed edit distance
+ * @returns True if the word matches the token within the distance threshold; false for
+ *          non-string or empty inputs, and when the distance computation throws
+ */
+export function fuzzyMatchWord(token: string, word: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): boolean {
+    // Only a pair of non-empty strings can match
+    if (typeof token !== 'string' || typeof word !== 'string') {
+        return false;
+    }
+
+    if (token.length === 0 || word.length === 0) {
+        return false;
+    }
+
+    // Substring containment counts as a match without any distance computation
+    if (word.includes(token)) {
+        return true;
+    }
+
+    // Cheap rejections: short tokens and large length gaps are never treated as fuzzy matches
+    const lengthGap = Math.abs(word.length - token.length);
+    const gapTooLarge = lengthGap > maxDistance || lengthGap > 2;
+    const tokenTooShort = token.length < 4;
+    if (gapTooLarge || tokenTooShort) {
+        return false;
+    }
+
+    try {
+        const distance = calculateOptimizedEditDistance(token, word, maxDistance);
+        return distance <= maxDistance;
+    } catch (error) {
+        // The distance helper throws on an invalid maxDistance; log it and report no match
+        console.warn('Error in fuzzy word matching:', error);
+        return false;
+    }
+}