feat(search): improve search weights and operators (#6536)

Elian Doran
2025-08-13 13:10:30 +03:00
committed by GitHub
18 changed files with 1668 additions and 133 deletions

View File

@@ -52,10 +52,15 @@ function quickSearch(req: Request) {
fuzzyAttributeSearch: false
});
const resultNoteIds = searchService.findResultsWithQuery(searchString, searchContext).map((sr) => sr.noteId);
// Use the same highlighting logic as autocomplete for consistency
const searchResults = searchService.searchNotesForAutocomplete(searchString, false);
// Extract note IDs for backward compatibility
const resultNoteIds = searchResults.map((result) => result.notePath.split("/").pop()).filter(Boolean) as string[];
return {
searchResultNoteIds: resultNoteIds,
searchResults: searchResults,
error: searchContext.getError()
};
}
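A hedged sketch of the response shape this endpoint now returns (the interface name is hypothetical; the element type of searchResults is whatever searchNotesForAutocomplete produces, described further down):

interface QuickSearchResponse {
    searchResultNoteIds: string[];   // note IDs extracted from notePath, kept for backward compatibility
    searchResults: unknown[];        // full autocomplete-style results with titles, snippets and highlights
    error: string | null;            // assumption about the return type of searchContext.getError()
}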

View File

@@ -1,5 +1,6 @@
import { describe, it, expect } from "vitest";
import { processMindmapContent } from "./note_content_fulltext.js";
import NoteContentFulltextExp from "./note_content_fulltext.js";
describe("processMindmapContent", () => {
it("supports empty JSON", () => {
@@ -11,3 +12,19 @@ describe("processMindmapContent", () => {
expect(processMindmapContent(`{ "node": " }`)).toEqual("");
});
});
describe("Fuzzy Search Operators", () => {
it("~= operator works with typos", () => {
// Test that the ~= operator can handle common typos
const expression = new NoteContentFulltextExp("~=", { tokens: ["hello"] });
expect(expression.tokens).toEqual(["hello"]);
expect(() => new NoteContentFulltextExp("~=", { tokens: ["he"] })).toThrow(); // Too short
});
it("~* operator works with fuzzy contains", () => {
// Test that the ~* operator handles fuzzy substring matching
const expression = new NoteContentFulltextExp("~*", { tokens: ["world"] });
expect(expression.tokens).toEqual(["world"]);
expect(() => new NoteContentFulltextExp("~*", { tokens: ["wo"] })).toThrow(); // Too short
});
});
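A minimal usage sketch of the two new operators through the expression API exercised above (the example tokens are illustrative; behaviour follows the MIN_FUZZY_TOKEN_LENGTH and MAX_EDIT_DISTANCE limits defined in text_utils):

// "~=" tolerates small typos via edit distance, "~*" does fuzzy substring/word matching.
const fuzzyEquals = new NoteContentFulltextExp("~=", { tokens: ["recieve"] });   // can match notes containing "receive"
const fuzzyContains = new NoteContentFulltextExp("~*", { tokens: ["serach"] });  // can match notes containing "search"
// Tokens shorter than 3 characters are rejected for both fuzzy operators:
expect(() => new NoteContentFulltextExp("~=", { tokens: ["ab"] })).toThrow();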

View File

@@ -11,8 +11,19 @@ import protectedSessionService from "../../protected_session.js";
import striptags from "striptags";
import { normalize } from "../../utils.js";
import sql from "../../sql.js";
import {
normalizeSearchText,
calculateOptimizedEditDistance,
validateFuzzySearchTokens,
validateAndPreprocessContent,
fuzzyMatchWord,
FUZZY_SEARCH_CONFIG
} from "../utils/text_utils.js";
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%="]);
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
// Maximum content size for search processing (2MB)
const MAX_SEARCH_CONTENT_SIZE = 2 * 1024 * 1024;
const cachedRegexes: Record<string, RegExp> = {};
@@ -41,6 +52,16 @@ class NoteContentFulltextExp extends Expression {
constructor(operator: string, { tokens, raw, flatText }: ConstructorOpts) {
super();
if (!operator || !tokens || !Array.isArray(tokens)) {
throw new Error('Invalid parameters: operator and tokens are required');
}
// Validate fuzzy search tokens
const validation = validateFuzzySearchTokens(tokens, operator);
if (!validation.isValid) {
throw new Error(validation.error!);
}
this.operator = operator;
this.tokens = tokens;
this.raw = !!raw;
@@ -59,7 +80,9 @@ class NoteContentFulltextExp extends Expression {
for (const row of sql.iterateRows<SearchRow>(`
SELECT noteId, type, mime, content, isProtected
FROM notes JOIN blobs USING (blobId)
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') AND isDeleted = 0`)) {
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND isDeleted = 0
AND LENGTH(content) < ${MAX_SEARCH_CONTENT_SIZE}`)) {
this.findInText(row, inputNoteSet, resultNoteSet);
}
@@ -89,6 +112,13 @@ class NoteContentFulltextExp extends Expression {
}
content = this.preprocessContent(content, type, mime);
// Apply content size validation and preprocessing
const processedContent = validateAndPreprocessContent(content, noteId);
if (!processedContent) {
return; // Content too large or invalid
}
content = processedContent;
if (this.tokens.length === 1) {
const [token] = this.tokens;
@@ -99,21 +129,27 @@ class NoteContentFulltextExp extends Expression {
(this.operator === "*=" && content.endsWith(token)) ||
(this.operator === "=*" && content.startsWith(token)) ||
(this.operator === "*=*" && content.includes(token)) ||
(this.operator === "%=" && getRegex(token).test(content))
(this.operator === "%=" && getRegex(token).test(content)) ||
(this.operator === "~=" && this.matchesWithFuzzy(content, noteId)) ||
(this.operator === "~*" && this.fuzzyMatchToken(normalizeSearchText(token), normalizeSearchText(content)))
) {
resultNoteSet.add(becca.notes[noteId]);
}
} else {
const nonMatchingToken = this.tokens.find(
(token) =>
!content?.includes(token) &&
// in case of default fulltext search, we should consider both title, attrs and content
// so e.g. "hello world" should match when "hello" is in title and "world" in content
(!this.flatText || !becca.notes[noteId].getFlatText().includes(token))
);
// Multi-token matching with fuzzy support and phrase proximity
if (this.operator === "~=" || this.operator === "~*") {
if (this.matchesWithFuzzy(content, noteId)) {
resultNoteSet.add(becca.notes[noteId]);
}
} else {
const nonMatchingToken = this.tokens.find(
(token) =>
!this.tokenMatchesContent(token, content, noteId)
);
if (!nonMatchingToken) {
resultNoteSet.add(becca.notes[noteId]);
if (!nonMatchingToken) {
resultNoteSet.add(becca.notes[noteId]);
}
}
}
@@ -124,8 +160,8 @@ class NoteContentFulltextExp extends Expression {
content = normalize(content.toString());
if (type === "text" && mime === "text/html") {
if (!this.raw && content.length < 20000) {
// striptags is slow for very large notes
if (!this.raw) {
// Content size already filtered at DB level, safe to process
content = this.stripTags(content);
}
@@ -152,6 +188,147 @@ class NoteContentFulltextExp extends Expression {
return content.trim();
}
/**
* Checks if a token matches content with optional fuzzy matching
*/
private tokenMatchesContent(token: string, content: string, noteId: string): boolean {
const normalizedToken = normalizeSearchText(token);
const normalizedContent = normalizeSearchText(content);
if (normalizedContent.includes(normalizedToken)) {
return true;
}
// Check flat text for default fulltext search
if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) {
return false;
}
return true;
}
/**
* Performs fuzzy matching with edit distance and phrase proximity
*/
private matchesWithFuzzy(content: string, noteId: string): boolean {
try {
const normalizedContent = normalizeSearchText(content);
const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : "";
// For phrase matching, check if tokens appear within reasonable proximity
if (this.tokens.length > 1) {
return this.matchesPhrase(normalizedContent, flatText);
}
// Single token fuzzy matching
const token = normalizeSearchText(this.tokens[0]);
return this.fuzzyMatchToken(token, normalizedContent) ||
(this.flatText && this.fuzzyMatchToken(token, flatText));
} catch (error) {
log.error(`Error in fuzzy matching for note ${noteId}: ${error}`);
return false;
}
}
/**
* Checks if multiple tokens match as a phrase with proximity consideration
*/
private matchesPhrase(content: string, flatText: string): boolean {
const searchText = this.flatText ? `${content} ${flatText}` : content;
// Apply content size limits for phrase matching
const limitedText = validateAndPreprocessContent(searchText);
if (!limitedText) {
return false;
}
const words = limitedText.toLowerCase().split(/\s+/);
// Only skip phrase matching for truly extreme word counts that could crash the system
if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
return false;
}
// Warn about large word counts but still attempt matching
if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
}
// Find positions of each token
const tokenPositions: number[][] = this.tokens.map(token => {
const normalizedToken = normalizeSearchText(token);
const positions: number[] = [];
words.forEach((word, index) => {
if (this.fuzzyMatchSingle(normalizedToken, word)) {
positions.push(index);
}
});
return positions;
});
// Check if we found all tokens
if (tokenPositions.some(positions => positions.length === 0)) {
return false;
}
// Check for phrase proximity using configurable distance
return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY);
}
/**
* Checks if token positions indicate a phrase match within max distance
*/
private hasProximityMatch(tokenPositions: number[][], maxDistance: number): boolean {
// For 2 tokens, simple proximity check
if (tokenPositions.length === 2) {
const [pos1, pos2] = tokenPositions;
return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance));
}
// For more tokens, check if we can find a sequence where all tokens are within range
const findSequence = (remaining: number[][], currentPos: number): boolean => {
if (remaining.length === 0) return true;
const [nextPositions, ...rest] = remaining;
return nextPositions.some(pos =>
Math.abs(pos - currentPos) <= maxDistance &&
findSequence(rest, pos)
);
};
const [firstPositions, ...rest] = tokenPositions;
return firstPositions.some(startPos => findSequence(rest, startPos));
}
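// Illustrative walk-through (not part of the diff): for tokens found at word positions
// [[2, 40]] and [[8]], the pair (2, 8) is within MAX_PHRASE_PROXIMITY (10 words), so this
// returns true; positions [[2]] and [[30]] are 28 words apart and would return false.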
/**
* Performs fuzzy matching for a single token against content
*/
private fuzzyMatchToken(token: string, content: string): boolean {
if (token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
// For short tokens, require exact match to avoid too many false positives
return content.includes(token);
}
const words = content.split(/\s+/);
// Only limit word processing for truly extreme cases to prevent system instability
const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
}
/**
* Fuzzy matches a single token against a single word
*/
private fuzzyMatchSingle(token: string, word: string): boolean {
// Use shared optimized fuzzy matching logic
return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
}
stripTags(content: string) {
// we want to allow the <a> (link) tag so that URLs are preserved: https://github.com/zadam/trilium/issues/2412
// we want to insert space in place of block tags (because they imply text separation)

View File

@@ -7,6 +7,7 @@ import Expression from "./expression.js";
import NoteSet from "../note_set.js";
import becca from "../../../becca/becca.js";
import { normalize } from "../../utils.js";
import { normalizeSearchText, fuzzyMatchWord, fuzzyMatchWordWithResult } from "../utils/text_utils.js";
import beccaService from "../../../becca/becca_service.js";
class NoteFlatTextExp extends Expression {
@@ -15,7 +16,8 @@ class NoteFlatTextExp extends Expression {
constructor(tokens: string[]) {
super();
this.tokens = tokens;
// Normalize tokens using centralized normalization function
this.tokens = tokens.map(token => normalizeSearchText(token));
}
execute(inputNoteSet: NoteSet, executionContext: any, searchContext: SearchContext) {
@@ -55,14 +57,18 @@ class NoteFlatTextExp extends Expression {
const foundAttrTokens: string[] = [];
for (const token of remainingTokens) {
if (note.type.includes(token) || note.mime.includes(token)) {
// Add defensive checks for undefined properties
const typeMatches = note.type && note.type.includes(token);
const mimeMatches = note.mime && note.mime.includes(token);
if (typeMatches || mimeMatches) {
foundAttrTokens.push(token);
}
}
for (const attribute of note.getOwnedAttributes()) {
const normalizedName = normalize(attribute.name);
const normalizedValue = normalize(attribute.value);
const normalizedName = normalizeSearchText(attribute.name);
const normalizedValue = normalizeSearchText(attribute.value);
for (const token of remainingTokens) {
if (normalizedName.includes(token) || normalizedValue.includes(token)) {
@@ -72,11 +78,11 @@ class NoteFlatTextExp extends Expression {
}
for (const parentNote of note.parents) {
const title = normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
const foundTokens: string[] = foundAttrTokens.slice();
for (const token of remainingTokens) {
if (title.includes(token)) {
if (this.smartMatch(title, token, searchContext)) {
foundTokens.push(token);
}
}
@@ -91,7 +97,7 @@ class NoteFlatTextExp extends Expression {
}
};
const candidateNotes = this.getCandidateNotes(inputNoteSet);
const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext);
for (const note of candidateNotes) {
// autocomplete should be able to find notes by their noteIds as well (leaf notes only)
@@ -103,23 +109,27 @@ class NoteFlatTextExp extends Expression {
const foundAttrTokens: string[] = [];
for (const token of this.tokens) {
if (note.type.includes(token) || note.mime.includes(token)) {
// Add defensive checks for undefined properties
const typeMatches = note.type && note.type.includes(token);
const mimeMatches = note.mime && note.mime.includes(token);
if (typeMatches || mimeMatches) {
foundAttrTokens.push(token);
}
for (const attribute of note.ownedAttributes) {
if (normalize(attribute.name).includes(token) || normalize(attribute.value).includes(token)) {
if (normalizeSearchText(attribute.name).includes(token) || normalizeSearchText(attribute.value).includes(token)) {
foundAttrTokens.push(token);
}
}
}
for (const parentNote of note.parents) {
const title = normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
const foundTokens = foundAttrTokens.slice();
for (const token of this.tokens) {
if (title.includes(token)) {
if (this.smartMatch(title, token, searchContext)) {
foundTokens.push(token);
}
}
@@ -152,12 +162,13 @@ class NoteFlatTextExp extends Expression {
/**
* Returns notes that have at least one matching token
*/
getCandidateNotes(noteSet: NoteSet): BNote[] {
getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] {
const candidateNotes: BNote[] = [];
for (const note of noteSet.notes) {
const normalizedFlatText = normalizeSearchText(note.getFlatText());
for (const token of this.tokens) {
if (note.getFlatText().includes(token)) {
if (this.smartMatch(normalizedFlatText, token, searchContext)) {
candidateNotes.push(note);
break;
}
@@ -166,6 +177,34 @@ class NoteFlatTextExp extends Expression {
return candidateNotes;
}
/**
* Smart matching that tries exact match first, then fuzzy fallback
* @param text The text to search in
* @param token The token to search for
* @param searchContext The search context to track matched words for highlighting
* @returns True if match found (exact or fuzzy)
*/
private smartMatch(text: string, token: string, searchContext?: SearchContext): boolean {
// Exact match has priority
if (text.includes(token)) {
return true;
}
// Fuzzy fallback only if enabled and for tokens >= 4 characters
if (searchContext?.enableFuzzyMatching && token.length >= 4) {
const matchedWord = fuzzyMatchWordWithResult(token, text);
if (matchedWord) {
// Track the fuzzy matched word for highlighting
if (!searchContext.highlightedTokens.includes(matchedWord)) {
searchContext.highlightedTokens.push(matchedWord);
}
return true;
}
}
return false;
}
}
export default NoteFlatTextExp;
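A minimal sketch of the exact-first, fuzzy-fallback behaviour that smartMatch implements, expressed against the shared text utilities it delegates to (the title and tokens are illustrative):

import { normalizeSearchText, fuzzyMatchWordWithResult } from "../utils/text_utils.js";

const title = normalizeSearchText("Quarterly Anaylsis Report");   // -> "quarterly anaylsis report"

// smartMatch tries an exact substring match first:
title.includes("report");                        // true -> the fuzzy path is skipped

// Only when the exact check fails and the token has 4+ characters does it fall back to
// fuzzyMatchWordWithResult, which also returns the matched word for highlighting:
fuzzyMatchWordWithResult("analysis", title);     // -> "anaylsis" (pushed into searchContext.highlightedTokens)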

View File

@@ -18,6 +18,7 @@ class SearchContext {
debug?: boolean;
debugInfo: {} | null;
fuzzyAttributeSearch: boolean;
enableFuzzyMatching: boolean; // Controls whether fuzzy matching is enabled for this search phase
highlightedTokens: string[];
originalQuery: string;
fulltextQuery: string;
@@ -45,6 +46,7 @@ class SearchContext {
this.debug = params.debug;
this.debugInfo = null;
this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch;
this.enableFuzzyMatching = true; // Default to true for backward compatibility
this.highlightedTokens = [];
this.originalQuery = "";
this.fulltextQuery = ""; // complete fulltext part
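A minimal sketch of how a caller opts out of the fuzzy phase for a single search, mirroring what the new tests do (the constructor options and query string are illustrative):

const exactOnlyContext = new SearchContext({ fastSearch: false });
exactOnlyContext.enableFuzzyMatching = false;   // skip the fuzzy fallback phase entirely
const results = searchService.findResultsWithQuery("test", exactOnlyContext);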

View File

@@ -2,17 +2,46 @@
import beccaService from "../../becca/becca_service.js";
import becca from "../../becca/becca.js";
import {
normalizeSearchText,
calculateOptimizedEditDistance,
FUZZY_SEARCH_CONFIG
} from "./utils/text_utils.js";
// Scoring constants for better maintainability
const SCORE_WEIGHTS = {
NOTE_ID_EXACT_MATCH: 1000,
TITLE_EXACT_MATCH: 2000,
TITLE_PREFIX_MATCH: 500,
TITLE_WORD_MATCH: 300,
TOKEN_EXACT_MATCH: 4,
TOKEN_PREFIX_MATCH: 2,
TOKEN_CONTAINS_MATCH: 1,
TOKEN_FUZZY_MATCH: 0.5,
TITLE_FACTOR: 2.0,
PATH_FACTOR: 0.3,
HIDDEN_NOTE_PENALTY: 3,
// Score caps to prevent fuzzy matches from outranking exact matches
MAX_FUZZY_SCORE_PER_TOKEN: 3, // Cap fuzzy token contributions to stay below exact matches
MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER: 3, // Limit token length impact for fuzzy matches
MAX_TOTAL_FUZZY_SCORE: 200 // Total cap on fuzzy scoring per search
} as const;
class SearchResult {
notePathArray: string[];
score: number;
notePathTitle: string;
highlightedNotePathTitle?: string;
contentSnippet?: string;
highlightedContentSnippet?: string;
private fuzzyScore: number; // Track fuzzy score separately
constructor(notePathArray: string[]) {
this.notePathArray = notePathArray;
this.notePathTitle = beccaService.getNoteTitleForPath(notePathArray);
this.score = 0;
this.fuzzyScore = 0;
}
get notePath() {
@@ -23,53 +52,117 @@ class SearchResult {
return this.notePathArray[this.notePathArray.length - 1];
}
computeScore(fulltextQuery: string, tokens: string[]) {
computeScore(fulltextQuery: string, tokens: string[], enableFuzzyMatching: boolean = true) {
this.score = 0;
this.fuzzyScore = 0; // Reset fuzzy score tracking
const note = becca.notes[this.noteId];
const normalizedQuery = fulltextQuery.toLowerCase();
const normalizedTitle = note.title.toLowerCase();
const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase());
const normalizedTitle = normalizeSearchText(note.title.toLowerCase());
// Exact note ID match gets a much higher score
if (note.noteId.toLowerCase() === fulltextQuery) {
this.score += 1000;
this.score += SCORE_WEIGHTS.NOTE_ID_EXACT_MATCH;
}
// Title matching scores, make sure to always win
// Title matching scores with fuzzy matching support
if (normalizedTitle === normalizedQuery) {
this.score += 2000; // Increased from 1000 to ensure exact matches always win
this.score += SCORE_WEIGHTS.TITLE_EXACT_MATCH;
} else if (normalizedTitle.startsWith(normalizedQuery)) {
this.score += 500; // Increased to give more weight to prefix matches
} else if (normalizedTitle.includes(` ${normalizedQuery} `) || normalizedTitle.startsWith(`${normalizedQuery} `) || normalizedTitle.endsWith(` ${normalizedQuery}`)) {
this.score += 300; // Increased to better distinguish word matches
this.score += SCORE_WEIGHTS.TITLE_PREFIX_MATCH;
} else if (this.isWordMatch(normalizedTitle, normalizedQuery)) {
this.score += SCORE_WEIGHTS.TITLE_WORD_MATCH;
} else if (enableFuzzyMatching) {
// Try fuzzy matching for typos only if enabled
const fuzzyScore = this.calculateFuzzyTitleScore(normalizedTitle, normalizedQuery);
this.score += fuzzyScore;
this.fuzzyScore += fuzzyScore; // Track fuzzy score contributions
}
// Add scores for partial matches with adjusted weights
this.addScoreForStrings(tokens, note.title, 2.0); // Increased to give more weight to title matches
this.addScoreForStrings(tokens, this.notePathTitle, 0.3); // Reduced to further de-emphasize path matches
// Add scores for token matches
this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR, enableFuzzyMatching);
this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR, enableFuzzyMatching);
if (note.isInHiddenSubtree()) {
this.score = this.score / 3; // Increased penalty for hidden notes
this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
}
}
addScoreForStrings(tokens: string[], str: string, factor: number) {
const chunks = str.toLowerCase().split(" ");
addScoreForStrings(tokens: string[], str: string, factor: number, enableFuzzyMatching: boolean = true) {
const normalizedStr = normalizeSearchText(str.toLowerCase());
const chunks = normalizedStr.split(" ");
let tokenScore = 0;
for (const chunk of chunks) {
for (const token of tokens) {
if (chunk === token) {
tokenScore += 4 * token.length * factor;
} else if (chunk.startsWith(token)) {
tokenScore += 2 * token.length * factor;
} else if (chunk.includes(token)) {
tokenScore += token.length * factor;
const normalizedToken = normalizeSearchText(token.toLowerCase());
if (chunk === normalizedToken) {
tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
} else if (chunk.startsWith(normalizedToken)) {
tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor;
} else if (chunk.includes(normalizedToken)) {
tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor;
} else {
// Try fuzzy matching for individual tokens with caps applied
const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
// Apply caps: limit token length multiplier and per-token contribution
const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);
const fuzzyTokenScore = Math.min(
fuzzyWeight * cappedTokenLength * factor,
SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN
);
tokenScore += fuzzyTokenScore;
this.fuzzyScore += fuzzyTokenScore;
}
}
}
}
this.score += tokenScore;
}
/**
* Checks if the query matches as a complete word in the text
*/
private isWordMatch(text: string, query: string): boolean {
return text.includes(` ${query} `) ||
text.startsWith(`${query} `) ||
text.endsWith(` ${query}`);
}
/**
* Calculates fuzzy matching score for title matches with caps applied
*/
private calculateFuzzyTitleScore(title: string, query: string): number {
// Check if we've already hit the fuzzy scoring cap
if (this.fuzzyScore >= SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
return 0;
}
const editDistance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
const maxLen = Math.max(title.length, query.length);
// Only apply fuzzy matching if the query is reasonably long and edit distance is small
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
editDistance / maxLen <= 0.3) {
const similarity = 1 - (editDistance / maxLen);
const baseFuzzyScore = SCORE_WEIGHTS.TITLE_WORD_MATCH * similarity * 0.7; // Reduced weight for fuzzy matches
// Apply cap to ensure fuzzy title matches don't exceed reasonable bounds
return Math.min(baseFuzzyScore, SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE * 0.3);
}
return 0;
}
}
export default SearchResult;
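An illustrative comparison of how the caps keep fuzzy matches below exact ones, using the weights above with the token "analysis" (length 8) and TITLE_FACTOR = 2.0:

// Exact chunk match:   TOKEN_EXACT_MATCH  * token.length * factor = 4 * 8 * 2.0 = 64
// Prefix chunk match:  TOKEN_PREFIX_MATCH * token.length * factor = 2 * 8 * 2.0 = 32
// Fuzzy chunk match:   capped at MAX_FUZZY_SCORE_PER_TOKEN = 3 per token, with the total
//                      fuzzy contribution per result capped at MAX_TOTAL_FUZZY_SCORE = 200,
//                      keeping fuzzy hits well below exact title and token scores.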

View File

@@ -1,3 +1,5 @@
import { normalizeSearchText, fuzzyMatchWord, FUZZY_SEARCH_CONFIG } from "../utils/text_utils.js";
const cachedRegexes: Record<string, RegExp> = {};
function getRegex(str: string) {
@@ -20,7 +22,41 @@ const stringComparators: Record<string, Comparator<string>> = {
"*=": (comparedValue) => (val) => !!val && val.endsWith(comparedValue),
"=*": (comparedValue) => (val) => !!val && val.startsWith(comparedValue),
"*=*": (comparedValue) => (val) => !!val && val.includes(comparedValue),
"%=": (comparedValue) => (val) => !!val && !!getRegex(comparedValue).test(val)
"%=": (comparedValue) => (val) => !!val && !!getRegex(comparedValue).test(val),
"~=": (comparedValue) => (val) => {
if (!val || !comparedValue) return false;
// Validate minimum length for fuzzy search to prevent false positives
if (comparedValue.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
return val.includes(comparedValue);
}
const normalizedVal = normalizeSearchText(val);
const normalizedCompared = normalizeSearchText(comparedValue);
// First try exact substring match
if (normalizedVal.includes(normalizedCompared)) {
return true;
}
// Then try fuzzy word matching
const words = normalizedVal.split(/\s+/);
return words.some(word => fuzzyMatchWord(normalizedCompared, word));
},
"~*": (comparedValue) => (val) => {
if (!val || !comparedValue) return false;
// Validate minimum length for fuzzy search
if (comparedValue.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
return val.includes(comparedValue);
}
const normalizedVal = normalizeSearchText(val);
const normalizedCompared = normalizeSearchText(comparedValue);
// For ~* operator, use fuzzy matching across the entire content
return fuzzyMatchWord(normalizedCompared, normalizedVal);
}
};
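// Illustrative behaviour of the new comparators (not part of the diff; values are examples):
//   stringComparators["~="]("helo")("hello world")   -> true  (fuzzy word match, edit distance 1)
//   stringComparators["~*"]("wrold")("hello world")   -> true  (fuzzy match across the whole value)
//   stringComparators["~="]("ab")("about")            -> true  (short tokens fall back to exact substring)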
const numericComparators: Record<string, Comparator<number>> = {

View File

@@ -40,7 +40,7 @@ function getFulltext(_tokens: TokenData[], searchContext: SearchContext) {
}
}
const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%="]);
const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%=", "~=", "~*"]);
function isOperator(token: TokenData) {
if (Array.isArray(token)) {

View File

@@ -0,0 +1,241 @@
import { describe, it, expect, beforeEach } from "vitest";
import searchService from "./search.js";
import BNote from "../../../becca/entities/bnote.js";
import BBranch from "../../../becca/entities/bbranch.js";
import SearchContext from "../search_context.js";
import becca from "../../../becca/becca.js";
import { findNoteByTitle, note, NoteBuilder } from "../../../test/becca_mocking.js";
describe("Progressive Search Strategy", () => {
let rootNote: any;
beforeEach(() => {
becca.reset();
rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" }));
new BBranch({
branchId: "none_root",
noteId: "root",
parentNoteId: "none",
notePosition: 10
});
});
describe("Phase 1: Exact Matches Only", () => {
it("should complete search with exact matches when sufficient results found", () => {
// Create notes with exact matches
rootNote
.child(note("Document Analysis One"))
.child(note("Document Report Two"))
.child(note("Document Review Three"))
.child(note("Document Summary Four"))
.child(note("Document Overview Five"))
.child(note("Documnt Analysis Six")); // This has a typo that should require fuzzy matching
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("document", searchContext);
// Should find 5 exact matches and not need fuzzy matching
expect(searchResults.length).toEqual(5);
// Verify all results have high scores (exact matches)
const highQualityResults = searchResults.filter(result => result.score >= 10);
expect(highQualityResults.length).toEqual(5);
// The typo document should not be in results since we have enough exact matches
expect(findNoteByTitle(searchResults, "Documnt Analysis Six")).toBeFalsy();
});
it("should use exact match scoring only in Phase 1", () => {
rootNote
.child(note("Testing Exact Match"))
.child(note("Test Document"))
.child(note("Another Test"));
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("test", searchContext);
// All results should have scores from exact matching only
for (const result of searchResults) {
expect(result.score).toBeGreaterThan(0);
// Scores should be from exact/prefix/contains matches, not fuzzy
expect(result.score % 0.5).not.toBe(0); // Fuzzy scores are multiples of 0.5
}
});
});
describe("Phase 2: Fuzzy Fallback", () => {
it("should trigger fuzzy matching when insufficient exact matches", () => {
// Create only a few notes, some with typos
rootNote
.child(note("Document One"))
.child(note("Report Two"))
.child(note("Anaylsis Three")) // Typo: "Analysis"
.child(note("Sumary Four")); // Typo: "Summary"
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
// Should find the typo through fuzzy matching
expect(searchResults.length).toBeGreaterThan(0);
expect(findNoteByTitle(searchResults, "Anaylsis Three")).toBeTruthy();
});
it("should merge exact and fuzzy results with exact matches always ranked higher", () => {
rootNote
.child(note("Analysis Report")) // Exact match
.child(note("Data Analysis")) // Exact match
.child(note("Anaylsis Doc")) // Fuzzy match
.child(note("Statistical Anlaysis")); // Fuzzy match
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
expect(searchResults.length).toBe(4);
// Get the note titles in result order
const resultTitles = searchResults.map(r => becca.notes[r.noteId].title);
// Find positions of exact and fuzzy matches
const exactPositions = resultTitles.map((title, index) =>
title.toLowerCase().includes("analysis") ? index : -1
).filter(pos => pos !== -1);
const fuzzyPositions = resultTitles.map((title, index) =>
(title.includes("Anaylsis") || title.includes("Anlaysis")) ? index : -1
).filter(pos => pos !== -1);
expect(exactPositions.length).toBe(2);
expect(fuzzyPositions.length).toBe(2);
// CRITICAL: All exact matches must come before all fuzzy matches
const lastExactPosition = Math.max(...exactPositions);
const firstFuzzyPosition = Math.min(...fuzzyPositions);
expect(lastExactPosition).toBeLessThan(firstFuzzyPosition);
});
it("should not duplicate results between phases", () => {
rootNote
.child(note("Test Document")) // Would match in both phases
.child(note("Tset Report")); // Only fuzzy match
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("test", searchContext);
// Should only have unique results
const noteIds = searchResults.map(r => r.noteId);
const uniqueNoteIds = [...new Set(noteIds)];
expect(noteIds.length).toBe(uniqueNoteIds.length);
expect(findNoteByTitle(searchResults, "Test Document")).toBeTruthy();
expect(findNoteByTitle(searchResults, "Tset Report")).toBeTruthy();
});
});
describe("Result Sufficiency Thresholds", () => {
it("should respect minimum result count threshold", () => {
// Create exactly 4 high-quality results (below threshold of 5)
rootNote
.child(note("Test One"))
.child(note("Test Two"))
.child(note("Test Three"))
.child(note("Test Four"))
.child(note("Tset Five")); // Typo that should be found via fuzzy
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("test", searchContext);
// Should proceed to Phase 2 and include fuzzy match
expect(searchResults.length).toBe(5);
expect(findNoteByTitle(searchResults, "Tset Five")).toBeTruthy();
});
it("should respect minimum quality score threshold", () => {
// Create notes that might have low exact match scores
rootNote
.child(note("Testing Document")) // Should have decent score
.child(note("Document with test inside")) // Lower score due to position
.child(note("Another test case"))
.child(note("Test case example"))
.child(note("Tset with typo")); // Fuzzy match
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("test", searchContext);
// Should include fuzzy results if exact results don't meet quality threshold
expect(searchResults.length).toBeGreaterThan(4);
});
});
describe("Fuzzy Score Management", () => {
it("should cap fuzzy token scores to prevent outranking exact matches", () => {
// Create note with exact match
rootNote.child(note("Test Document"));
// Create note that could accumulate high fuzzy scores
rootNote.child(note("Tset Documnt with many fuzzy tockens for testng")); // Multiple typos
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("test document", searchContext);
expect(searchResults.length).toBe(2);
// Find the exact and fuzzy match results
const exactResult = searchResults.find(r => becca.notes[r.noteId].title === "Test Document");
const fuzzyResult = searchResults.find(r => becca.notes[r.noteId].title.includes("Tset"));
expect(exactResult).toBeTruthy();
expect(fuzzyResult).toBeTruthy();
// Exact match should always score higher than fuzzy, even with multiple fuzzy matches
expect(exactResult!.score).toBeGreaterThan(fuzzyResult!.score);
});
it("should enforce maximum total fuzzy score per search", () => {
// Create note with many potential fuzzy matches
rootNote.child(note("Tset Documnt Anaylsis Sumary Reportng")); // Many typos
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("test document analysis summary reporting", searchContext);
expect(searchResults.length).toBe(1);
// Total score should be bounded despite many fuzzy matches
expect(searchResults[0].score).toBeLessThan(500); // Should not exceed reasonable bounds due to caps
});
});
describe("SearchContext Integration", () => {
it("should respect enableFuzzyMatching flag", () => {
rootNote
.child(note("Test Document"))
.child(note("Tset Report")); // Typo
// Test with fuzzy matching disabled
const exactOnlyContext = new SearchContext();
exactOnlyContext.enableFuzzyMatching = false;
const exactResults = searchService.findResultsWithQuery("test", exactOnlyContext);
expect(exactResults.length).toBe(1);
expect(findNoteByTitle(exactResults, "Test Document")).toBeTruthy();
expect(findNoteByTitle(exactResults, "Tset Report")).toBeFalsy();
// Test with fuzzy matching enabled (default)
const fuzzyContext = new SearchContext();
const fuzzyResults = searchService.findResultsWithQuery("test", fuzzyContext);
expect(fuzzyResults.length).toBe(2);
expect(findNoteByTitle(fuzzyResults, "Tset Report")).toBeTruthy();
});
});
describe("Edge Cases", () => {
it("should handle empty search results gracefully", () => {
rootNote.child(note("Unrelated Content"));
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("nonexistent", searchContext);
expect(searchResults.length).toBe(0);
});
});
});

View File

@@ -553,6 +553,70 @@ describe("Search", () => {
expect(becca.notes[searchResults[0].noteId].title).toEqual("Reddit is bad");
});
it("search completes in reasonable time", () => {
// Create a moderate-sized dataset to test performance
const countries = ["Austria", "Belgium", "Croatia", "Denmark", "Estonia", "Finland", "Germany", "Hungary", "Ireland", "Japan"];
const europeanCountries = note("Europe");
countries.forEach(country => {
europeanCountries.child(note(country).label("type", "country").label("continent", "Europe"));
});
rootNote.child(europeanCountries);
const searchContext = new SearchContext();
const startTime = Date.now();
// Perform a search that exercises multiple features
const searchResults = searchService.findResultsWithQuery("#type=country AND continent", searchContext);
const endTime = Date.now();
const duration = endTime - startTime;
// Search should complete in under 1 second for reasonable dataset
expect(duration).toBeLessThan(1000);
expect(searchResults.length).toEqual(10);
});
it("progressive search always puts exact matches before fuzzy matches", () => {
rootNote
.child(note("Analysis Report")) // Exact match
.child(note("Data Analysis")) // Exact match
.child(note("Test Analysis")) // Exact match
.child(note("Advanced Anaylsis")) // Fuzzy match (typo)
.child(note("Quick Anlaysis")); // Fuzzy match (typo)
const searchContext = new SearchContext();
const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
// With only 3 exact matches (below threshold), fuzzy should be triggered
// Should find all 5 matches but exact ones should come first
expect(searchResults.length).toEqual(5);
// Get note titles in result order
const resultTitles = searchResults.map(r => becca.notes[r.noteId].title);
// Find all exact matches (contain "analysis")
const exactMatchIndices = resultTitles.map((title, index) =>
title.toLowerCase().includes("analysis") ? index : -1
).filter(index => index !== -1);
// Find all fuzzy matches (contain typos)
const fuzzyMatchIndices = resultTitles.map((title, index) =>
(title.includes("Anaylsis") || title.includes("Anlaysis")) ? index : -1
).filter(index => index !== -1);
expect(exactMatchIndices.length).toEqual(3);
expect(fuzzyMatchIndices.length).toEqual(2);
// CRITICAL: All exact matches must appear before all fuzzy matches
const lastExactIndex = Math.max(...exactMatchIndices);
const firstFuzzyIndex = Math.min(...fuzzyMatchIndices);
expect(lastExactIndex).toBeLessThan(firstFuzzyIndex);
});
// FIXME: test what happens when we order without any filter criteria
// it("comparison between labels", () => {

View File

@@ -17,6 +17,8 @@ import type { SearchParams, TokenStructure } from "./types.js";
import type Expression from "../expressions/expression.js";
import sql from "../../sql.js";
import scriptService from "../../script.js";
import striptags from "striptags";
import protectedSessionService from "../../protected_session.js";
export interface SearchNoteResult {
searchResultNoteIds: string[];
@@ -235,6 +237,41 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
loadNeededInfoFromDatabase();
}
// If there's an explicit orderBy clause, skip progressive search
// as it would interfere with the ordering
if (searchContext.orderBy) {
// For ordered queries, don't use progressive search but respect
// the original fuzzy matching setting
return performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
}
// If fuzzy matching is explicitly disabled, skip progressive search
if (!searchContext.enableFuzzyMatching) {
return performSearch(expression, searchContext, false);
}
// Phase 1: Try exact matches first (without fuzzy matching)
const exactResults = performSearch(expression, searchContext, false);
// Check if we have sufficient high-quality results
const minResultThreshold = 5;
const minScoreForQuality = 10; // Minimum score to consider a result "high quality"
const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality);
// If we have enough high-quality exact matches, return them
if (highQualityResults.length >= minResultThreshold) {
return exactResults;
}
// Phase 2: Add fuzzy matching as fallback when exact matches are insufficient
const fuzzyResults = performSearch(expression, searchContext, true);
// Merge results, ensuring exact matches always rank higher than fuzzy matches
return mergeExactAndFuzzyResults(exactResults, fuzzyResults);
}
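// Illustrative decision flow (not part of the diff): a query yielding 6 exact results with
// score >= 10 stops after Phase 1; one yielding only 3 such results runs Phase 2 and appends
// deduplicated fuzzy-only hits after every exact hit, so exact matches always rank first.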
function performSearch(expression: Expression, searchContext: SearchContext, enableFuzzyMatching: boolean): SearchResult[] {
const allNoteSet = becca.getAllNoteSet();
const noteIdToNotePath: Record<string, string[]> = {};
@@ -242,6 +279,10 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
noteIdToNotePath
};
// Store original fuzzy setting and temporarily override it
const originalFuzzyMatching = searchContext.enableFuzzyMatching;
searchContext.enableFuzzyMatching = enableFuzzyMatching;
const noteSet = expression.execute(allNoteSet, executionContext, searchContext);
const searchResults = noteSet.notes.map((note) => {
@@ -255,9 +296,12 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
});
for (const res of searchResults) {
res.computeScore(searchContext.fulltextQuery, searchContext.highlightedTokens);
res.computeScore(searchContext.fulltextQuery, searchContext.highlightedTokens, enableFuzzyMatching);
}
// Restore original fuzzy setting
searchContext.enableFuzzyMatching = originalFuzzyMatching;
if (!noteSet.sorted) {
searchResults.sort((a, b) => {
if (a.score > b.score) {
@@ -279,6 +323,49 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
return searchResults;
}
function mergeExactAndFuzzyResults(exactResults: SearchResult[], fuzzyResults: SearchResult[]): SearchResult[] {
// Create a map of exact result note IDs for deduplication
const exactNoteIds = new Set(exactResults.map(result => result.noteId));
// Add fuzzy results that aren't already in exact results
const additionalFuzzyResults = fuzzyResults.filter(result => !exactNoteIds.has(result.noteId));
// Sort exact results by score (best exact matches first)
exactResults.sort((a, b) => {
if (a.score > b.score) {
return -1;
} else if (a.score < b.score) {
return 1;
}
// if score does not decide then sort results by depth of the note.
if (a.notePathArray.length === b.notePathArray.length) {
return a.notePathTitle < b.notePathTitle ? -1 : 1;
}
return a.notePathArray.length < b.notePathArray.length ? -1 : 1;
});
// Sort fuzzy results by score (best fuzzy matches first)
additionalFuzzyResults.sort((a, b) => {
if (a.score > b.score) {
return -1;
} else if (a.score < b.score) {
return 1;
}
// if score does not decide then sort results by depth of the note.
if (a.notePathArray.length === b.notePathArray.length) {
return a.notePathTitle < b.notePathTitle ? -1 : 1;
}
return a.notePathArray.length < b.notePathArray.length ? -1 : 1;
});
// CRITICAL: Always put exact matches before fuzzy matches, regardless of scores
return [...exactResults, ...additionalFuzzyResults];
}
function parseQueryToExpression(query: string, searchContext: SearchContext) {
const { fulltextQuery, fulltextTokens, expressionTokens } = lex(query);
searchContext.fulltextQuery = fulltextQuery;
@@ -328,6 +415,16 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): Sear
return [];
}
// If the query starts with '#', it's a pure expression query.
// Don't use progressive search for these as they may have complex
// ordering or other logic that shouldn't be interfered with.
const isPureExpressionQuery = query.trim().startsWith('#');
if (isPureExpressionQuery) {
// For pure expression queries, use standard search without progressive phases
return performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
}
return findResultsWithExpression(expression, searchContext);
}
@@ -337,6 +434,91 @@ function findFirstNoteWithQuery(query: string, searchContext: SearchContext): BN
return searchResults.length > 0 ? becca.notes[searchResults[0].noteId] : null;
}
function extractContentSnippet(noteId: string, searchTokens: string[], maxLength: number = 200): string {
const note = becca.notes[noteId];
if (!note) {
return "";
}
// Only extract content for text-based notes
if (!["text", "code", "mermaid", "canvas", "mindMap"].includes(note.type)) {
return "";
}
try {
let content = note.getContent();
if (!content || typeof content !== "string") {
return "";
}
// Handle protected notes
if (note.isProtected && protectedSessionService.isProtectedSessionAvailable()) {
try {
content = protectedSessionService.decryptString(content) || "";
} catch (e) {
return ""; // Can't decrypt, don't show content
}
} else if (note.isProtected) {
return ""; // Protected but no session available
}
// Strip HTML tags for text notes
if (note.type === "text") {
content = striptags(content);
}
// Normalize whitespace
content = content.replace(/\s+/g, " ").trim();
if (!content) {
return "";
}
// Try to find a snippet around the first matching token
const normalizedContent = normalizeString(content.toLowerCase());
let snippetStart = 0;
let matchFound = false;
for (const token of searchTokens) {
const normalizedToken = normalizeString(token.toLowerCase());
const matchIndex = normalizedContent.indexOf(normalizedToken);
if (matchIndex !== -1) {
// Center the snippet around the match
snippetStart = Math.max(0, matchIndex - maxLength / 2);
matchFound = true;
break;
}
}
// Extract snippet
let snippet = content.substring(snippetStart, snippetStart + maxLength);
// Try to start/end at word boundaries
if (snippetStart > 0) {
const firstSpace = snippet.indexOf(" ");
if (firstSpace > 0 && firstSpace < 20) {
snippet = snippet.substring(firstSpace + 1);
}
snippet = "..." + snippet;
}
if (snippetStart + maxLength < content.length) {
const lastSpace = snippet.lastIndexOf(" ");
if (lastSpace > snippet.length - 20) {
snippet = snippet.substring(0, lastSpace);
}
snippet = snippet + "...";
}
return snippet;
} catch (e) {
log.error(`Error extracting content snippet for note ${noteId}: ${e}`);
return "";
}
}
function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
const searchContext = new SearchContext({
fastSearch: fastSearch,
@@ -351,6 +533,11 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
const trimmed = allSearchResults.slice(0, 200);
// Extract content snippets
for (const result of trimmed) {
result.contentSnippet = extractContentSnippet(result.noteId, searchContext.highlightedTokens);
}
highlightSearchResults(trimmed, searchContext.highlightedTokens, searchContext.ignoreInternalAttributes);
return trimmed.map((result) => {
@@ -360,6 +547,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
noteTitle: title,
notePathTitle: result.notePathTitle,
highlightedNotePathTitle: result.highlightedNotePathTitle,
contentSnippet: result.contentSnippet,
highlightedContentSnippet: result.highlightedContentSnippet,
icon: icon ?? "bx bx-note"
};
});
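A hedged sketch of the autocomplete item shape produced by the mapping above (the interface name is an assumption; the fields are the ones visible in the diff):

interface AutocompleteSuggestion {
    noteTitle: string;
    notePathTitle: string;
    highlightedNotePathTitle?: string;
    contentSnippet?: string;              // plain-text excerpt centered on the first matching token
    highlightedContentSnippet?: string;   // same excerpt with {...} markers converted to <b>...</b>
    icon: string;                         // falls back to "bx bx-note"
}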
@@ -381,26 +570,11 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens
highlightedTokens.sort((a, b) => (a.length > b.length ? -1 : 1));
for (const result of searchResults) {
const note = becca.notes[result.noteId];
result.highlightedNotePathTitle = result.notePathTitle.replace(/[<{}]/g, "");
if (highlightedTokens.find((token) => note.type.includes(token))) {
result.highlightedNotePathTitle += ` "type: ${note.type}'`;
}
if (highlightedTokens.find((token) => note.mime.includes(token))) {
result.highlightedNotePathTitle += ` "mime: ${note.mime}'`;
}
for (const attr of note.getAttributes()) {
if (attr.type === "relation" && attr.name === "internalLink" && ignoreInternalAttributes) {
continue;
}
if (highlightedTokens.find((token) => normalize(attr.name).includes(token) || normalize(attr.value).includes(token))) {
result.highlightedNotePathTitle += ` "${formatAttribute(attr)}'`;
}
// Initialize highlighted content snippet
if (result.contentSnippet) {
result.highlightedContentSnippet = escapeHtml(result.contentSnippet).replace(/[<{}]/g, "");
}
}
@@ -419,40 +593,36 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens
const tokenRegex = new RegExp(escapeRegExp(token), "gi");
let match;
// Find all matches
if (!result.highlightedNotePathTitle) {
continue;
// Highlight in note path title
if (result.highlightedNotePathTitle) {
const titleRegex = new RegExp(escapeRegExp(token), "gi");
while ((match = titleRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) {
result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");
// 2 characters are added, so we need to adjust the index
titleRegex.lastIndex += 2;
}
}
while ((match = tokenRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) {
result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");
// 2 characters are added, so we need to adjust the index
tokenRegex.lastIndex += 2;
// Highlight in content snippet
if (result.highlightedContentSnippet) {
const contentRegex = new RegExp(escapeRegExp(token), "gi");
while ((match = contentRegex.exec(normalizeString(result.highlightedContentSnippet))) !== null) {
result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}");
// 2 characters are added, so we need to adjust the index
contentRegex.lastIndex += 2;
}
}
}
}
for (const result of searchResults) {
if (!result.highlightedNotePathTitle) {
continue;
if (result.highlightedNotePathTitle) {
result.highlightedNotePathTitle = result.highlightedNotePathTitle.replace(/{/g, "<b>").replace(/}/g, "</b>");
}
result.highlightedNotePathTitle = result.highlightedNotePathTitle.replace(/"/g, "<small>").replace(/'/g, "</small>").replace(/{/g, "<b>").replace(/}/g, "</b>");
}
}
function formatAttribute(attr: BAttribute) {
if (attr.type === "relation") {
return `~${escapeHtml(attr.name)}=…`;
} else if (attr.type === "label") {
let label = `#${escapeHtml(attr.name)}`;
if (attr.value) {
const val = /[^\w-]/.test(attr.value) ? `"${attr.value}"` : attr.value;
label += `=${escapeHtml(val)}`;
if (result.highlightedContentSnippet) {
result.highlightedContentSnippet = result.highlightedContentSnippet.replace(/{/g, "<b>").replace(/}/g, "</b>");
}
return label;
}
}

View File

@@ -0,0 +1,65 @@
import { describe, it, expect } from "vitest";
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';
describe('Fuzzy Search Core', () => {
describe('calculateOptimizedEditDistance', () => {
it('calculates edit distance for common typos', () => {
expect(calculateOptimizedEditDistance('hello', 'helo')).toBe(1);
expect(calculateOptimizedEditDistance('world', 'wrold')).toBe(2);
expect(calculateOptimizedEditDistance('cafe', 'café')).toBe(1);
expect(calculateOptimizedEditDistance('identical', 'identical')).toBe(0);
});
it('handles performance safety with oversized input', () => {
const longString = 'a'.repeat(2000);
const result = calculateOptimizedEditDistance(longString, 'short');
expect(result).toBeGreaterThan(2); // Should use fallback heuristic
});
});
describe('validateFuzzySearchTokens', () => {
it('validates minimum length requirements for fuzzy operators', () => {
const result1 = validateFuzzySearchTokens(['ab'], '~=');
expect(result1.isValid).toBe(false);
expect(result1.error).toContain('at least 3 characters');
const result2 = validateFuzzySearchTokens(['hello'], '~=');
expect(result2.isValid).toBe(true);
const result3 = validateFuzzySearchTokens(['ok'], '=');
expect(result3.isValid).toBe(true); // Non-fuzzy operators allow short tokens
});
it('validates token types and empty arrays', () => {
expect(validateFuzzySearchTokens([], '=')).toEqual({
isValid: false,
error: 'Invalid tokens: at least one token is required'
});
expect(validateFuzzySearchTokens([''], '=')).toEqual({
isValid: false,
error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
});
});
});
describe('fuzzyMatchWord', () => {
it('matches words with diacritics normalization', () => {
expect(fuzzyMatchWord('cafe', 'café')).toBe(true);
expect(fuzzyMatchWord('naive', 'naïve')).toBe(true);
});
it('matches with typos within distance threshold', () => {
expect(fuzzyMatchWord('hello', 'helo')).toBe(true);
expect(fuzzyMatchWord('world', 'wrold')).toBe(true);
expect(fuzzyMatchWord('test', 'tset')).toBe(true);
expect(fuzzyMatchWord('test', 'xyz')).toBe(false);
});
it('handles edge cases safely', () => {
expect(fuzzyMatchWord('', 'test')).toBe(false);
expect(fuzzyMatchWord('test', '')).toBe(false);
expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
});
});
});

View File

@@ -0,0 +1,334 @@
"use strict";
import { normalize } from "../../utils.js";
/**
* Shared text processing utilities for search functionality
*/
// Configuration constants for fuzzy matching
export const FUZZY_SEARCH_CONFIG = {
// Minimum token length for fuzzy operators to prevent false positives
MIN_FUZZY_TOKEN_LENGTH: 3,
// Maximum edit distance for fuzzy matching
MAX_EDIT_DISTANCE: 2,
// Maximum proximity distance for phrase matching (in words)
MAX_PHRASE_PROXIMITY: 10,
// Absolute hard limits for extreme cases - only to prevent system crashes
ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
// Performance warning thresholds - inform user but still attempt search
PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
// Progressive processing thresholds for very large content
PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
// Performance thresholds
EARLY_TERMINATION_THRESHOLD: 3,
} as const;
/**
* Normalizes text by removing diacritics and converting to lowercase.
* This is the centralized text normalization function used across all search components.
* Uses the shared normalize function from utils for consistency.
*
* Examples:
* - "café" -> "cafe"
* - "naïve" -> "naive"
* - "HELLO WORLD" -> "hello world"
*
* @param text The text to normalize
* @returns The normalized text
*/
export function normalizeSearchText(text: string): string {
if (!text || typeof text !== 'string') {
return '';
}
// Use shared normalize function for consistency across the codebase
return normalize(text);
}
/**
* Optimized edit distance calculation using single array and early termination.
* This is significantly more memory efficient than the 2D matrix approach and includes
* early termination optimizations for better performance.
*
* @param str1 First string
* @param str2 Second string
* @param maxDistance Maximum allowed distance (for early termination)
* @returns The edit distance between the strings, or maxDistance + 1 if exceeded
*/
export function calculateOptimizedEditDistance(str1: string, str2: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): number {
// Input validation
if (typeof str1 !== 'string' || typeof str2 !== 'string') {
throw new Error('Both arguments must be strings');
}
if (maxDistance < 0 || !Number.isInteger(maxDistance)) {
throw new Error('maxDistance must be a non-negative integer');
}
const len1 = str1.length;
const len2 = str2.length;
// Performance guard: if strings are too long, limit processing
const maxStringLength = 1000;
if (len1 > maxStringLength || len2 > maxStringLength) {
// For very long strings, fall back to simple length-based heuristic
return Math.abs(len1 - len2) <= maxDistance ? Math.abs(len1 - len2) : maxDistance + 1;
}
// Early termination: if length difference exceeds max distance
if (Math.abs(len1 - len2) > maxDistance) {
return maxDistance + 1;
}
// Handle edge cases
if (len1 === 0) return len2 <= maxDistance ? len2 : maxDistance + 1;
if (len2 === 0) return len1 <= maxDistance ? len1 : maxDistance + 1;
// Use single array optimization for better memory usage
let previousRow = Array.from({ length: len2 + 1 }, (_, i) => i);
let currentRow = new Array(len2 + 1);
for (let i = 1; i <= len1; i++) {
currentRow[0] = i;
let minInRow = i;
for (let j = 1; j <= len2; j++) {
const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
currentRow[j] = Math.min(
previousRow[j] + 1, // deletion
currentRow[j - 1] + 1, // insertion
previousRow[j - 1] + cost // substitution
);
// Track minimum value in current row for early termination
if (currentRow[j] < minInRow) {
minInRow = currentRow[j];
}
}
// Early termination: if minimum distance in row exceeds threshold
if (minInRow > maxDistance) {
return maxDistance + 1;
}
// Swap arrays for next iteration
[previousRow, currentRow] = [currentRow, previousRow];
}
const result = previousRow[len2];
return result <= maxDistance ? result : maxDistance + 1;
}
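// Illustrative results (not part of the diff):
//   calculateOptimizedEditDistance("hello", "helo")      -> 1
//   calculateOptimizedEditDistance("hello", "goodbye")   -> 3 (maxDistance 2 exceeded, reported as maxDistance + 1)
//   calculateOptimizedEditDistance("hello", "hello", 0)  -> 0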
/**
* Validates that tokens meet minimum requirements for fuzzy operators.
*
* @param tokens Array of search tokens
* @param operator The search operator being used
* @returns Validation result with success status and error message
*/
export function validateFuzzySearchTokens(tokens: string[], operator: string): { isValid: boolean; error?: string } {
if (!operator || typeof operator !== 'string') {
return {
isValid: false,
error: 'Invalid operator: operator must be a non-empty string'
};
}
if (!Array.isArray(tokens)) {
return {
isValid: false,
error: 'Invalid tokens: tokens must be an array'
};
}
if (tokens.length === 0) {
return {
isValid: false,
error: 'Invalid tokens: at least one token is required'
};
}
// Check for null, undefined, or non-string tokens
const invalidTypeTokens = tokens.filter(token =>
token == null || typeof token !== 'string'
);
if (invalidTypeTokens.length > 0) {
return {
isValid: false,
error: 'Invalid tokens: all tokens must be non-null strings'
};
}
// Check for empty string tokens
const emptyTokens = tokens.filter(token => token.trim().length === 0);
if (emptyTokens.length > 0) {
return {
isValid: false,
error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
};
}
if (operator !== '~=' && operator !== '~*') {
return { isValid: true };
}
// Check minimum token length for fuzzy operators
const shortTokens = tokens.filter(token => token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH);
if (shortTokens.length > 0) {
return {
isValid: false,
error: `Fuzzy search operators (~=, ~*) require tokens of at least ${FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH} characters. Invalid tokens: ${shortTokens.join(', ')}`
};
}
// Check for excessively long tokens that could cause performance issues
const maxTokenLength = 100; // Reasonable limit for search tokens
const longTokens = tokens.filter(token => token.length > maxTokenLength);
if (longTokens.length > 0) {
return {
isValid: false,
error: `Tokens are too long (max ${maxTokenLength} characters). Long tokens: ${longTokens.map(t => t.substring(0, 20) + '...').join(', ')}`
};
}
return { isValid: true };
}
/**
* Validates and preprocesses content for search operations.
* Philosophy: Try to search everything! Only block truly extreme cases that could crash the system.
*
* @param content The content to validate and preprocess
* @param noteId The note ID (for logging purposes)
* @returns Processed content, only null for truly extreme cases that could cause system instability
*/
export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
if (!content || typeof content !== 'string') {
return null;
}
// Only block content that could actually crash the system (100MB+)
if (content.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) {
console.error(`Content size exceeds absolute system limit for note ${noteId || 'unknown'}: ${content.length} bytes - this could cause system instability`);
// Only in truly extreme cases, truncate to prevent system crash
return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
}
// Warn about very large content but still process it
if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
}
// For word count, be even more permissive - only block truly extreme cases
const wordCount = content.split(/\s+/).length;
if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
// Only in truly extreme cases, truncate to prevent system crash
return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
}
// Warn about high word counts but still process them
if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
}
// Progressive processing warning for very large content
if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
}
return content;
}
/**
* Escapes special regex characters in a string for use in RegExp constructor
*/
function escapeRegExp(string: string): string {
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
/**
* Checks if a word matches a token with fuzzy matching and returns the matched word.
* Optimized for common case where distances are small.
*
* @param token The search token (should be normalized)
* @param text The text to match against (should be normalized)
* @param maxDistance Maximum allowed edit distance
* @returns The matched word if found, null otherwise
*/
export function fuzzyMatchWordWithResult(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): string | null {
// Input validation
if (typeof token !== 'string' || typeof text !== 'string') {
return null;
}
if (token.length === 0 || text.length === 0) {
return null;
}
try {
// Normalize both strings for comparison
const normalizedToken = token.toLowerCase();
const normalizedText = text.toLowerCase();
// Exact match check first (most common case)
if (normalizedText.includes(normalizedToken)) {
// Find the exact match in the original text to preserve case
const exactMatch = text.match(new RegExp(escapeRegExp(token), 'i'));
return exactMatch ? exactMatch[0] : token;
}
// For fuzzy matching, we need to check individual words in the text
// Split the text into words and check each word against the token
const words = normalizedText.split(/\s+/).filter(word => word.length > 0);
const originalWords = text.split(/\s+/).filter(word => word.length > 0);
for (let i = 0; i < words.length; i++) {
const word = words[i];
const originalWord = originalWords[i];
// Skip if word is too different in length for fuzzy matching
if (Math.abs(word.length - normalizedToken.length) > maxDistance) {
continue;
}
// For very short tokens or very different lengths, be more strict
if (normalizedToken.length < 4 || Math.abs(word.length - normalizedToken.length) > 2) {
continue;
}
// Use optimized edit distance calculation
const distance = calculateOptimizedEditDistance(normalizedToken, word, maxDistance);
if (distance <= maxDistance) {
return originalWord; // Return the original word with case preserved
}
}
return null;
} catch (error) {
// Log error and return null for safety
console.warn('Error in fuzzy word matching:', error);
return null;
}
}
/**
* Checks if any word in the given text matches a token with fuzzy matching.
* Optimized for the common case where distances are small.
*
* @param token The search token (should be normalized)
* @param text The text to match against (should be normalized)
* @param maxDistance Maximum allowed edit distance
* @returns True if any word in the text matches the token within the distance threshold
*/
export function fuzzyMatchWord(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): boolean {
return fuzzyMatchWordWithResult(token, text, maxDistance) !== null;
}
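A short combined usage sketch of these utilities (the inputs are illustrative):

const content = normalizeSearchText("Café Recieps and Notes");   // -> "cafe recieps and notes"
fuzzyMatchWord("recipes", content);                              // true: "recieps" is within edit distance 2
fuzzyMatchWordWithResult("recipes", content);                    // -> "recieps", the word to highlight
validateFuzzySearchTokens(["recipes"], "~=");                    // -> { isValid: true }
validateFuzzySearchTokens(["ab"], "~=");                         // -> { isValid: false, error about minimum length }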