feat(search): improve search weights and operators (#6536)
@@ -52,10 +52,15 @@ function quickSearch(req: Request) {
         fuzzyAttributeSearch: false
     });

-    const resultNoteIds = searchService.findResultsWithQuery(searchString, searchContext).map((sr) => sr.noteId);
+    // Use the same highlighting logic as autocomplete for consistency
+    const searchResults = searchService.searchNotesForAutocomplete(searchString, false);
+
+    // Extract note IDs for backward compatibility
+    const resultNoteIds = searchResults.map((result) => result.notePath.split("/").pop()).filter(Boolean) as string[];

     return {
         searchResultNoteIds: resultNoteIds,
+        searchResults: searchResults,
         error: searchContext.getError()
     };
 }
@@ -1,5 +1,6 @@
 import { describe, it, expect } from "vitest";
 import { processMindmapContent } from "./note_content_fulltext.js";
+import NoteContentFulltextExp from "./note_content_fulltext.js";

 describe("processMindmapContent", () => {
     it("supports empty JSON", () => {
@@ -11,3 +12,19 @@ describe("processMindmapContent", () => {
         expect(processMindmapContent(`{ "node": " }`)).toEqual("");
     });
 });
+
+describe("Fuzzy Search Operators", () => {
+    it("~= operator works with typos", () => {
+        // Test that the ~= operator can handle common typos
+        const expression = new NoteContentFulltextExp("~=", { tokens: ["hello"] });
+        expect(expression.tokens).toEqual(["hello"]);
+        expect(() => new NoteContentFulltextExp("~=", { tokens: ["he"] })).toThrow(); // Too short
+    });
+
+    it("~* operator works with fuzzy contains", () => {
+        // Test that the ~* operator handles fuzzy substring matching
+        const expression = new NoteContentFulltextExp("~*", { tokens: ["world"] });
+        expect(expression.tokens).toEqual(["world"]);
+        expect(() => new NoteContentFulltextExp("~*", { tokens: ["wo"] })).toThrow(); // Too short
+    });
+});
@@ -11,8 +11,19 @@ import protectedSessionService from "../../protected_session.js";
 import striptags from "striptags";
 import { normalize } from "../../utils.js";
 import sql from "../../sql.js";
+import {
+    normalizeSearchText,
+    calculateOptimizedEditDistance,
+    validateFuzzySearchTokens,
+    validateAndPreprocessContent,
+    fuzzyMatchWord,
+    FUZZY_SEARCH_CONFIG
+} from "../utils/text_utils.js";

-const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%="]);
+const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
+
+// Maximum content size for search processing (2MB)
+const MAX_SEARCH_CONTENT_SIZE = 2 * 1024 * 1024;

 const cachedRegexes: Record<string, RegExp> = {};
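For illustration (not part of the patch): the two new operators extend Trilium's search DSL alongside the existing `=`, `*=*`, and `%=`. A rough sketch of the kind of queries they enable, assuming the existing `note.title` / `note.content` property syntax; the example values are hypothetical:

    // ~= : fuzzy match tolerating small typos (edit-distance based)
    note.title ~= 'kubernetes'      // would also match a title containing "kubernets"

    // ~* : fuzzy "contains" across the value
    note.content ~* 'recieve'       // would also match content containing "receive"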
@@ -41,6 +52,16 @@ class NoteContentFulltextExp extends Expression {
     constructor(operator: string, { tokens, raw, flatText }: ConstructorOpts) {
         super();

+        if (!operator || !tokens || !Array.isArray(tokens)) {
+            throw new Error('Invalid parameters: operator and tokens are required');
+        }
+
+        // Validate fuzzy search tokens
+        const validation = validateFuzzySearchTokens(tokens, operator);
+        if (!validation.isValid) {
+            throw new Error(validation.error!);
+        }
+
         this.operator = operator;
         this.tokens = tokens;
         this.raw = !!raw;
@@ -59,7 +80,9 @@ class NoteContentFulltextExp extends Expression {
         for (const row of sql.iterateRows<SearchRow>(`
                 SELECT noteId, type, mime, content, isProtected
                 FROM notes JOIN blobs USING (blobId)
-                WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') AND isDeleted = 0`)) {
+                WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
+                  AND isDeleted = 0
+                  AND LENGTH(content) < ${MAX_SEARCH_CONTENT_SIZE}`)) {
             this.findInText(row, inputNoteSet, resultNoteSet);
         }
@@ -89,6 +112,13 @@ class NoteContentFulltextExp extends Expression {
         }

         content = this.preprocessContent(content, type, mime);

+        // Apply content size validation and preprocessing
+        const processedContent = validateAndPreprocessContent(content, noteId);
+        if (!processedContent) {
+            return; // Content too large or invalid
+        }
+        content = processedContent;
+
         if (this.tokens.length === 1) {
             const [token] = this.tokens;
@@ -99,21 +129,27 @@ class NoteContentFulltextExp extends Expression {
                 (this.operator === "*=" && content.endsWith(token)) ||
                 (this.operator === "=*" && content.startsWith(token)) ||
                 (this.operator === "*=*" && content.includes(token)) ||
-                (this.operator === "%=" && getRegex(token).test(content))
+                (this.operator === "%=" && getRegex(token).test(content)) ||
+                (this.operator === "~=" && this.matchesWithFuzzy(content, noteId)) ||
+                (this.operator === "~*" && this.fuzzyMatchToken(normalizeSearchText(token), normalizeSearchText(content)))
             ) {
                 resultNoteSet.add(becca.notes[noteId]);
             }
         } else {
-            const nonMatchingToken = this.tokens.find(
-                (token) =>
-                    !content?.includes(token) &&
-                    // in case of default fulltext search, we should consider both title, attrs and content
-                    // so e.g. "hello world" should match when "hello" is in title and "world" in content
-                    (!this.flatText || !becca.notes[noteId].getFlatText().includes(token))
-            );
-
-            if (!nonMatchingToken) {
-                resultNoteSet.add(becca.notes[noteId]);
+            // Multi-token matching with fuzzy support and phrase proximity
+            if (this.operator === "~=" || this.operator === "~*") {
+                if (this.matchesWithFuzzy(content, noteId)) {
+                    resultNoteSet.add(becca.notes[noteId]);
+                }
+            } else {
+                const nonMatchingToken = this.tokens.find(
+                    (token) =>
+                        !this.tokenMatchesContent(token, content, noteId)
+                );
+
+                if (!nonMatchingToken) {
+                    resultNoteSet.add(becca.notes[noteId]);
+                }
             }
         }
     }
@@ -124,8 +160,8 @@ class NoteContentFulltextExp extends Expression {
         content = normalize(content.toString());

         if (type === "text" && mime === "text/html") {
-            if (!this.raw && content.length < 20000) {
-                // striptags is slow for very large notes
+            if (!this.raw) {
+                // Content size already filtered at DB level, safe to process
                 content = this.stripTags(content);
             }
@@ -152,6 +188,147 @@ class NoteContentFulltextExp extends Expression {
         return content.trim();
     }

+    /**
+     * Checks if a token matches content with optional fuzzy matching
+     */
+    private tokenMatchesContent(token: string, content: string, noteId: string): boolean {
+        const normalizedToken = normalizeSearchText(token);
+        const normalizedContent = normalizeSearchText(content);
+
+        if (normalizedContent.includes(normalizedToken)) {
+            return true;
+        }
+
+        // Check flat text for default fulltext search
+        if (!this.flatText || !becca.notes[noteId].getFlatText().includes(token)) {
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * Performs fuzzy matching with edit distance and phrase proximity
+     */
+    private matchesWithFuzzy(content: string, noteId: string): boolean {
+        try {
+            const normalizedContent = normalizeSearchText(content);
+            const flatText = this.flatText ? normalizeSearchText(becca.notes[noteId].getFlatText()) : "";
+
+            // For phrase matching, check if tokens appear within reasonable proximity
+            if (this.tokens.length > 1) {
+                return this.matchesPhrase(normalizedContent, flatText);
+            }
+
+            // Single token fuzzy matching
+            const token = normalizeSearchText(this.tokens[0]);
+            return this.fuzzyMatchToken(token, normalizedContent) ||
+                   (this.flatText && this.fuzzyMatchToken(token, flatText));
+        } catch (error) {
+            log.error(`Error in fuzzy matching for note ${noteId}: ${error}`);
+            return false;
+        }
+    }
+
+    /**
+     * Checks if multiple tokens match as a phrase with proximity consideration
+     */
+    private matchesPhrase(content: string, flatText: string): boolean {
+        const searchText = this.flatText ? `${content} ${flatText}` : content;
+
+        // Apply content size limits for phrase matching
+        const limitedText = validateAndPreprocessContent(searchText);
+        if (!limitedText) {
+            return false;
+        }
+
+        const words = limitedText.toLowerCase().split(/\s+/);
+
+        // Only skip phrase matching for truly extreme word counts that could crash the system
+        if (words.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
+            console.error(`Phrase matching skipped due to extreme word count that could cause system instability: ${words.length} words`);
+            return false;
+        }
+
+        // Warn about large word counts but still attempt matching
+        if (words.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
+            console.info(`Large word count for phrase matching: ${words.length} words - may take longer but will attempt full matching`);
+        }
+
+        // Find positions of each token
+        const tokenPositions: number[][] = this.tokens.map(token => {
+            const normalizedToken = normalizeSearchText(token);
+            const positions: number[] = [];
+
+            words.forEach((word, index) => {
+                if (this.fuzzyMatchSingle(normalizedToken, word)) {
+                    positions.push(index);
+                }
+            });
+
+            return positions;
+        });
+
+        // Check if we found all tokens
+        if (tokenPositions.some(positions => positions.length === 0)) {
+            return false;
+        }
+
+        // Check for phrase proximity using configurable distance
+        return this.hasProximityMatch(tokenPositions, FUZZY_SEARCH_CONFIG.MAX_PHRASE_PROXIMITY);
+    }
+
+    /**
+     * Checks if token positions indicate a phrase match within max distance
+     */
+    private hasProximityMatch(tokenPositions: number[][], maxDistance: number): boolean {
+        // For 2 tokens, simple proximity check
+        if (tokenPositions.length === 2) {
+            const [pos1, pos2] = tokenPositions;
+            return pos1.some(p1 => pos2.some(p2 => Math.abs(p1 - p2) <= maxDistance));
+        }
+
+        // For more tokens, check if we can find a sequence where all tokens are within range
+        const findSequence = (remaining: number[][], currentPos: number): boolean => {
+            if (remaining.length === 0) return true;
+
+            const [nextPositions, ...rest] = remaining;
+            return nextPositions.some(pos =>
+                Math.abs(pos - currentPos) <= maxDistance &&
+                findSequence(rest, pos)
+            );
+        };
+
+        const [firstPositions, ...rest] = tokenPositions;
+        return firstPositions.some(startPos => findSequence(rest, startPos));
+    }
+
+    /**
+     * Performs fuzzy matching for a single token against content
+     */
+    private fuzzyMatchToken(token: string, content: string): boolean {
+        if (token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
+            // For short tokens, require exact match to avoid too many false positives
+            return content.includes(token);
+        }
+
+        const words = content.split(/\s+/);
+
+        // Only limit word processing for truly extreme cases to prevent system instability
+        const limitedWords = words.slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT);
+
+        return limitedWords.some(word => this.fuzzyMatchSingle(token, word));
+    }
+
+    /**
+     * Fuzzy matches a single token against a single word
+     */
+    private fuzzyMatchSingle(token: string, word: string): boolean {
+        // Use shared optimized fuzzy matching logic
+        return fuzzyMatchWord(token, word, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+    }
+
     stripTags(content: string) {
         // we want to allow link to preserve URLs: https://github.com/zadam/trilium/issues/2412
         // we want to insert space in place of block tags (because they imply text separation)
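For illustration (not part of the patch): the proximity recursion above walks token position lists in order, accepting a phrase when each consecutive token can be found within `maxDistance` words of the previous one. A minimal standalone sketch mirroring that logic, with made-up position data:

    // Standalone version of hasProximityMatch for experimentation.
    function hasProximityMatch(tokenPositions: number[][], maxDistance: number): boolean {
        const findSequence = (remaining: number[][], currentPos: number): boolean => {
            if (remaining.length === 0) return true;
            const [nextPositions, ...rest] = remaining;
            return nextPositions.some(pos =>
                Math.abs(pos - currentPos) <= maxDistance && findSequence(rest, pos));
        };
        const [firstPositions, ...rest] = tokenPositions;
        return firstPositions.some(startPos => findSequence(rest, startPos));
    }

    // "hello" found at word indexes 0 and 7, "world" at 2 and 20:
    hasProximityMatch([[0, 7], [2, 20]], 5); // true  — |0 - 2| <= 5
    hasProximityMatch([[0], [20]], 5);       // false — tokens are 20 words apart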
@@ -7,6 +7,7 @@ import Expression from "./expression.js";
 import NoteSet from "../note_set.js";
 import becca from "../../../becca/becca.js";
 import { normalize } from "../../utils.js";
+import { normalizeSearchText, fuzzyMatchWord, fuzzyMatchWordWithResult } from "../utils/text_utils.js";
 import beccaService from "../../../becca/becca_service.js";

 class NoteFlatTextExp extends Expression {
@@ -15,7 +16,8 @@ class NoteFlatTextExp extends Expression {
     constructor(tokens: string[]) {
         super();

-        this.tokens = tokens;
+        // Normalize tokens using centralized normalization function
+        this.tokens = tokens.map(token => normalizeSearchText(token));
     }

     execute(inputNoteSet: NoteSet, executionContext: any, searchContext: SearchContext) {
@@ -55,14 +57,18 @@ class NoteFlatTextExp extends Expression {
                 const foundAttrTokens: string[] = [];

                 for (const token of remainingTokens) {
-                    if (note.type.includes(token) || note.mime.includes(token)) {
+                    // Add defensive checks for undefined properties
+                    const typeMatches = note.type && note.type.includes(token);
+                    const mimeMatches = note.mime && note.mime.includes(token);
+
+                    if (typeMatches || mimeMatches) {
                         foundAttrTokens.push(token);
                     }
                 }

                 for (const attribute of note.getOwnedAttributes()) {
-                    const normalizedName = normalize(attribute.name);
-                    const normalizedValue = normalize(attribute.value);
+                    const normalizedName = normalizeSearchText(attribute.name);
+                    const normalizedValue = normalizeSearchText(attribute.value);

                     for (const token of remainingTokens) {
                         if (normalizedName.includes(token) || normalizedValue.includes(token)) {
@@ -72,11 +78,11 @@ class NoteFlatTextExp extends Expression {
                 }

                 for (const parentNote of note.parents) {
-                    const title = normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
+                    const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
                     const foundTokens: string[] = foundAttrTokens.slice();

                     for (const token of remainingTokens) {
-                        if (title.includes(token)) {
+                        if (this.smartMatch(title, token, searchContext)) {
                             foundTokens.push(token);
                         }
                     }
@@ -91,7 +97,7 @@ class NoteFlatTextExp extends Expression {
             }
         };

-        const candidateNotes = this.getCandidateNotes(inputNoteSet);
+        const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext);

         for (const note of candidateNotes) {
             // autocomplete should be able to find notes by their noteIds as well (only leafs)
@@ -103,23 +109,27 @@ class NoteFlatTextExp extends Expression {
             const foundAttrTokens: string[] = [];

             for (const token of this.tokens) {
-                if (note.type.includes(token) || note.mime.includes(token)) {
+                // Add defensive checks for undefined properties
+                const typeMatches = note.type && note.type.includes(token);
+                const mimeMatches = note.mime && note.mime.includes(token);
+
+                if (typeMatches || mimeMatches) {
                     foundAttrTokens.push(token);
                 }

                 for (const attribute of note.ownedAttributes) {
-                    if (normalize(attribute.name).includes(token) || normalize(attribute.value).includes(token)) {
+                    if (normalizeSearchText(attribute.name).includes(token) || normalizeSearchText(attribute.value).includes(token)) {
                         foundAttrTokens.push(token);
                     }
                 }
             }

             for (const parentNote of note.parents) {
-                const title = normalize(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
+                const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId));
                 const foundTokens = foundAttrTokens.slice();

                 for (const token of this.tokens) {
-                    if (title.includes(token)) {
+                    if (this.smartMatch(title, token, searchContext)) {
                         foundTokens.push(token);
                     }
                 }
@@ -152,12 +162,13 @@ class NoteFlatTextExp extends Expression {
     /**
      * Returns noteIds which have at least one matching tokens
      */
-    getCandidateNotes(noteSet: NoteSet): BNote[] {
+    getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] {
         const candidateNotes: BNote[] = [];

         for (const note of noteSet.notes) {
+            const normalizedFlatText = normalizeSearchText(note.getFlatText());
             for (const token of this.tokens) {
-                if (note.getFlatText().includes(token)) {
+                if (this.smartMatch(normalizedFlatText, token, searchContext)) {
                     candidateNotes.push(note);
                     break;
                 }
@@ -166,6 +177,34 @@ class NoteFlatTextExp extends Expression {

         return candidateNotes;
     }
+
+    /**
+     * Smart matching that tries exact match first, then fuzzy fallback
+     * @param text The text to search in
+     * @param token The token to search for
+     * @param searchContext The search context to track matched words for highlighting
+     * @returns True if match found (exact or fuzzy)
+     */
+    private smartMatch(text: string, token: string, searchContext?: SearchContext): boolean {
+        // Exact match has priority
+        if (text.includes(token)) {
+            return true;
+        }
+
+        // Fuzzy fallback only if enabled and for tokens >= 4 characters
+        if (searchContext?.enableFuzzyMatching && token.length >= 4) {
+            const matchedWord = fuzzyMatchWordWithResult(token, text);
+            if (matchedWord) {
+                // Track the fuzzy matched word for highlighting
+                if (!searchContext.highlightedTokens.includes(matchedWord)) {
+                    searchContext.highlightedTokens.push(matchedWord);
+                }
+                return true;
+            }
+        }
+
+        return false;
+    }
 }

 export default NoteFlatTextExp;
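For illustration (not part of the patch): `smartMatch` is exact-first, so fuzzy matching never changes the result for text that already contains the token. A few hedged expectations, assuming the edit-distance limit of 2 that the test suite below relies on ("Tset" matching "test"); the concrete strings here are hypothetical:

    smartMatch("test document", "test", ctx); // true  — exact substring, fuzzy never consulted
    smartMatch("tset document", "test", ctx); // true  — fuzzy fallback, "tset" within edit distance of "test"
    smartMatch("abc", "abd", ctx);            // false — tokens under 4 chars never use the fuzzy fallback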
@@ -18,6 +18,7 @@ class SearchContext {
     debug?: boolean;
     debugInfo: {} | null;
     fuzzyAttributeSearch: boolean;
+    enableFuzzyMatching: boolean; // Controls whether fuzzy matching is enabled for this search phase
     highlightedTokens: string[];
     originalQuery: string;
     fulltextQuery: string;
@@ -45,6 +46,7 @@ class SearchContext {
         this.debug = params.debug;
         this.debugInfo = null;
         this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch;
+        this.enableFuzzyMatching = true; // Default to true for backward compatibility
         this.highlightedTokens = [];
         this.originalQuery = "";
         this.fulltextQuery = ""; // complete fulltext part
@@ -2,17 +2,46 @@

 import beccaService from "../../becca/becca_service.js";
 import becca from "../../becca/becca.js";
+import {
+    normalizeSearchText,
+    calculateOptimizedEditDistance,
+    FUZZY_SEARCH_CONFIG
+} from "./utils/text_utils.js";
+
+// Scoring constants for better maintainability
+const SCORE_WEIGHTS = {
+    NOTE_ID_EXACT_MATCH: 1000,
+    TITLE_EXACT_MATCH: 2000,
+    TITLE_PREFIX_MATCH: 500,
+    TITLE_WORD_MATCH: 300,
+    TOKEN_EXACT_MATCH: 4,
+    TOKEN_PREFIX_MATCH: 2,
+    TOKEN_CONTAINS_MATCH: 1,
+    TOKEN_FUZZY_MATCH: 0.5,
+    TITLE_FACTOR: 2.0,
+    PATH_FACTOR: 0.3,
+    HIDDEN_NOTE_PENALTY: 3,
+    // Score caps to prevent fuzzy matches from outranking exact matches
+    MAX_FUZZY_SCORE_PER_TOKEN: 3, // Cap fuzzy token contributions to stay below exact matches
+    MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER: 3, // Limit token length impact for fuzzy matches
+    MAX_TOTAL_FUZZY_SCORE: 200 // Total cap on fuzzy scoring per search
+} as const;

 class SearchResult {
     notePathArray: string[];
     score: number;
     notePathTitle: string;
     highlightedNotePathTitle?: string;
+    contentSnippet?: string;
+    highlightedContentSnippet?: string;
+    private fuzzyScore: number; // Track fuzzy score separately

     constructor(notePathArray: string[]) {
         this.notePathArray = notePathArray;
         this.notePathTitle = beccaService.getNoteTitleForPath(notePathArray);
         this.score = 0;
+        this.fuzzyScore = 0;
     }

     get notePath() {
@@ -23,53 +52,117 @@ class SearchResult {
         return this.notePathArray[this.notePathArray.length - 1];
     }

-    computeScore(fulltextQuery: string, tokens: string[]) {
+    computeScore(fulltextQuery: string, tokens: string[], enableFuzzyMatching: boolean = true) {
         this.score = 0;
+        this.fuzzyScore = 0; // Reset fuzzy score tracking

         const note = becca.notes[this.noteId];
-        const normalizedQuery = fulltextQuery.toLowerCase();
-        const normalizedTitle = note.title.toLowerCase();
+        const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase());
+        const normalizedTitle = normalizeSearchText(note.title.toLowerCase());

         // Note ID exact match, much higher score
         if (note.noteId.toLowerCase() === fulltextQuery) {
-            this.score += 1000;
+            this.score += SCORE_WEIGHTS.NOTE_ID_EXACT_MATCH;
         }

-        // Title matching scores, make sure to always win
+        // Title matching scores with fuzzy matching support
         if (normalizedTitle === normalizedQuery) {
-            this.score += 2000; // Increased from 1000 to ensure exact matches always win
+            this.score += SCORE_WEIGHTS.TITLE_EXACT_MATCH;
         } else if (normalizedTitle.startsWith(normalizedQuery)) {
-            this.score += 500; // Increased to give more weight to prefix matches
-        } else if (normalizedTitle.includes(` ${normalizedQuery} `) || normalizedTitle.startsWith(`${normalizedQuery} `) || normalizedTitle.endsWith(` ${normalizedQuery}`)) {
-            this.score += 300; // Increased to better distinguish word matches
+            this.score += SCORE_WEIGHTS.TITLE_PREFIX_MATCH;
+        } else if (this.isWordMatch(normalizedTitle, normalizedQuery)) {
+            this.score += SCORE_WEIGHTS.TITLE_WORD_MATCH;
+        } else if (enableFuzzyMatching) {
+            // Try fuzzy matching for typos only if enabled
+            const fuzzyScore = this.calculateFuzzyTitleScore(normalizedTitle, normalizedQuery);
+            this.score += fuzzyScore;
+            this.fuzzyScore += fuzzyScore; // Track fuzzy score contributions
         }

-        // Add scores for partial matches with adjusted weights
-        this.addScoreForStrings(tokens, note.title, 2.0); // Increased to give more weight to title matches
-        this.addScoreForStrings(tokens, this.notePathTitle, 0.3); // Reduced to further de-emphasize path matches
+        // Add scores for token matches
+        this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR, enableFuzzyMatching);
+        this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR, enableFuzzyMatching);

         if (note.isInHiddenSubtree()) {
-            this.score = this.score / 3; // Increased penalty for hidden notes
+            this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
         }
     }

-    addScoreForStrings(tokens: string[], str: string, factor: number) {
-        const chunks = str.toLowerCase().split(" ");
+    addScoreForStrings(tokens: string[], str: string, factor: number, enableFuzzyMatching: boolean = true) {
+        const normalizedStr = normalizeSearchText(str.toLowerCase());
+        const chunks = normalizedStr.split(" ");

         let tokenScore = 0;
         for (const chunk of chunks) {
             for (const token of tokens) {
-                if (chunk === token) {
-                    tokenScore += 4 * token.length * factor;
-                } else if (chunk.startsWith(token)) {
-                    tokenScore += 2 * token.length * factor;
-                } else if (chunk.includes(token)) {
-                    tokenScore += token.length * factor;
+                const normalizedToken = normalizeSearchText(token.toLowerCase());
+
+                if (chunk === normalizedToken) {
+                    tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
+                } else if (chunk.startsWith(normalizedToken)) {
+                    tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor;
+                } else if (chunk.includes(normalizedToken)) {
+                    tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor;
+                } else {
+                    // Try fuzzy matching for individual tokens with caps applied
+                    const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+                    if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
+                        normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
+                        this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
+
+                        const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+                        // Apply caps: limit token length multiplier and per-token contribution
+                        const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);
+                        const fuzzyTokenScore = Math.min(
+                            fuzzyWeight * cappedTokenLength * factor,
+                            SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN
+                        );
+
+                        tokenScore += fuzzyTokenScore;
+                        this.fuzzyScore += fuzzyTokenScore;
+                    }
+                }
             }
         }
         this.score += tokenScore;
     }

+    /**
+     * Checks if the query matches as a complete word in the text
+     */
+    private isWordMatch(text: string, query: string): boolean {
+        return text.includes(` ${query} `) ||
+               text.startsWith(`${query} `) ||
+               text.endsWith(` ${query}`);
+    }
+
+    /**
+     * Calculates fuzzy matching score for title matches with caps applied
+     */
+    private calculateFuzzyTitleScore(title: string, query: string): number {
+        // Check if we've already hit the fuzzy scoring cap
+        if (this.fuzzyScore >= SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
+            return 0;
+        }
+
+        const editDistance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
+        const maxLen = Math.max(title.length, query.length);
+
+        // Only apply fuzzy matching if the query is reasonably long and edit distance is small
+        if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
+            editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
+            editDistance / maxLen <= 0.3) {
+            const similarity = 1 - (editDistance / maxLen);
+            const baseFuzzyScore = SCORE_WEIGHTS.TITLE_WORD_MATCH * similarity * 0.7; // Reduced weight for fuzzy matches
+
+            // Apply cap to ensure fuzzy title matches don't exceed reasonable bounds
+            return Math.min(baseFuzzyScore, SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE * 0.3);
+        }
+
+        return 0;
+    }
 }

 export default SearchResult;
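For illustration (not part of the patch), a worked example using the weights straight from SCORE_WEIGHTS above; the note titles are hypothetical:

    // Query "test" against a note titled "Test Document":
    //   title prefix match:                 +500  (TITLE_PREFIX_MATCH)
    //   chunk "test" === token "test":      +4 * 4 (token length) * 2.0 (TITLE_FACTOR) = +32
    //   => roughly 532 from the title alone, plus a small PATH_FACTOR contribution.
    //
    // The same query against "Tset Document" (typo) can only earn fuzzy credit,
    // capped at MAX_FUZZY_SCORE_PER_TOKEN (3) per token and MAX_TOTAL_FUZZY_SCORE
    // (200) per search, so an exact match always outranks a fuzzy one.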
@@ -1,3 +1,5 @@
+import { normalizeSearchText, fuzzyMatchWord, FUZZY_SEARCH_CONFIG } from "../utils/text_utils.js";
+
 const cachedRegexes: Record<string, RegExp> = {};

 function getRegex(str: string) {
@@ -20,7 +22,41 @@ const stringComparators: Record<string, Comparator<string>> = {
     "*=": (comparedValue) => (val) => !!val && val.endsWith(comparedValue),
     "=*": (comparedValue) => (val) => !!val && val.startsWith(comparedValue),
     "*=*": (comparedValue) => (val) => !!val && val.includes(comparedValue),
-    "%=": (comparedValue) => (val) => !!val && !!getRegex(comparedValue).test(val)
+    "%=": (comparedValue) => (val) => !!val && !!getRegex(comparedValue).test(val),
+    "~=": (comparedValue) => (val) => {
+        if (!val || !comparedValue) return false;
+
+        // Validate minimum length for fuzzy search to prevent false positives
+        if (comparedValue.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
+            return val.includes(comparedValue);
+        }
+
+        const normalizedVal = normalizeSearchText(val);
+        const normalizedCompared = normalizeSearchText(comparedValue);
+
+        // First try exact substring match
+        if (normalizedVal.includes(normalizedCompared)) {
+            return true;
+        }
+
+        // Then try fuzzy word matching
+        const words = normalizedVal.split(/\s+/);
+        return words.some(word => fuzzyMatchWord(normalizedCompared, word));
+    },
+    "~*": (comparedValue) => (val) => {
+        if (!val || !comparedValue) return false;
+
+        // Validate minimum length for fuzzy search
+        if (comparedValue.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
+            return val.includes(comparedValue);
+        }
+
+        const normalizedVal = normalizeSearchText(val);
+        const normalizedCompared = normalizeSearchText(comparedValue);
+
+        // For ~* operator, use fuzzy matching across the entire content
+        return fuzzyMatchWord(normalizedCompared, normalizedVal);
+    }
 };

 const numericComparators: Record<string, Comparator<number>> = {
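For illustration (not part of the patch): the comparators are curried, first receiving the search value and then each candidate attribute value. A sketch of the new `~=` comparator's behavior, per the code above (the table itself is module-internal; example inputs are hypothetical):

    const fuzzyEquals = stringComparators["~="]("kubernetes");
    fuzzyEquals("kubernetes rocks");  // true — exact substring short-circuits
    fuzzyEquals("kubernets cluster"); // true — "kubernets" is a close fuzzy word match

    const short = stringComparators["~="]("ab");
    short("abc");                     // true — below MIN_FUZZY_TOKEN_LENGTH it falls back to plain includes()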
@@ -40,7 +40,7 @@ function getFulltext(_tokens: TokenData[], searchContext: SearchContext) {
     }
 }

-const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%="]);
+const OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", ">", ">=", "<", "<=", "%=", "~=", "~*"]);

 function isOperator(token: TokenData) {
     if (Array.isArray(token)) {
@@ -0,0 +1,241 @@
+import { describe, it, expect, beforeEach } from "vitest";
+import searchService from "./search.js";
+import BNote from "../../../becca/entities/bnote.js";
+import BBranch from "../../../becca/entities/bbranch.js";
+import SearchContext from "../search_context.js";
+import becca from "../../../becca/becca.js";
+import { findNoteByTitle, note, NoteBuilder } from "../../../test/becca_mocking.js";
+
+describe("Progressive Search Strategy", () => {
+    let rootNote: any;
+
+    beforeEach(() => {
+        becca.reset();
+
+        rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" }));
+        new BBranch({
+            branchId: "none_root",
+            noteId: "root",
+            parentNoteId: "none",
+            notePosition: 10
+        });
+    });
+
+    describe("Phase 1: Exact Matches Only", () => {
+        it("should complete search with exact matches when sufficient results found", () => {
+            // Create notes with exact matches
+            rootNote
+                .child(note("Document Analysis One"))
+                .child(note("Document Report Two"))
+                .child(note("Document Review Three"))
+                .child(note("Document Summary Four"))
+                .child(note("Document Overview Five"))
+                .child(note("Documnt Analysis Six")); // This has a typo that should require fuzzy matching
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("document", searchContext);
+
+            // Should find 5 exact matches and not need fuzzy matching
+            expect(searchResults.length).toEqual(5);
+
+            // Verify all results have high scores (exact matches)
+            const highQualityResults = searchResults.filter(result => result.score >= 10);
+            expect(highQualityResults.length).toEqual(5);
+
+            // The typo document should not be in results since we have enough exact matches
+            expect(findNoteByTitle(searchResults, "Documnt Analysis Six")).toBeFalsy();
+        });
+
+        it("should use exact match scoring only in Phase 1", () => {
+            rootNote
+                .child(note("Testing Exact Match"))
+                .child(note("Test Document"))
+                .child(note("Another Test"));
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("test", searchContext);
+
+            // All results should have scores from exact matching only
+            for (const result of searchResults) {
+                expect(result.score).toBeGreaterThan(0);
+                // Scores should be from exact/prefix/contains matches, not fuzzy
+                expect(result.score % 0.5).not.toBe(0); // Fuzzy scores are multiples of 0.5
+            }
+        });
+    });
+
+    describe("Phase 2: Fuzzy Fallback", () => {
+        it("should trigger fuzzy matching when insufficient exact matches", () => {
+            // Create only a few notes, some with typos
+            rootNote
+                .child(note("Document One"))
+                .child(note("Report Two"))
+                .child(note("Anaylsis Three")) // Typo: "Analysis"
+                .child(note("Sumary Four")); // Typo: "Summary"
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
+
+            // Should find the typo through fuzzy matching
+            expect(searchResults.length).toBeGreaterThan(0);
+            expect(findNoteByTitle(searchResults, "Anaylsis Three")).toBeTruthy();
+        });
+
+        it("should merge exact and fuzzy results with exact matches always ranked higher", () => {
+            rootNote
+                .child(note("Analysis Report")) // Exact match
+                .child(note("Data Analysis")) // Exact match
+                .child(note("Anaylsis Doc")) // Fuzzy match
+                .child(note("Statistical Anlaysis")); // Fuzzy match
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
+
+            expect(searchResults.length).toBe(4);
+
+            // Get the note titles in result order
+            const resultTitles = searchResults.map(r => becca.notes[r.noteId].title);
+
+            // Find positions of exact and fuzzy matches
+            const exactPositions = resultTitles.map((title, index) =>
+                title.toLowerCase().includes("analysis") ? index : -1
+            ).filter(pos => pos !== -1);
+
+            const fuzzyPositions = resultTitles.map((title, index) =>
+                (title.includes("Anaylsis") || title.includes("Anlaysis")) ? index : -1
+            ).filter(pos => pos !== -1);
+
+            expect(exactPositions.length).toBe(2);
+            expect(fuzzyPositions.length).toBe(2);
+
+            // CRITICAL: All exact matches must come before all fuzzy matches
+            const lastExactPosition = Math.max(...exactPositions);
+            const firstFuzzyPosition = Math.min(...fuzzyPositions);
+
+            expect(lastExactPosition).toBeLessThan(firstFuzzyPosition);
+        });
+
+        it("should not duplicate results between phases", () => {
+            rootNote
+                .child(note("Test Document")) // Would match in both phases
+                .child(note("Tset Report")); // Only fuzzy match
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("test", searchContext);
+
+            // Should only have unique results
+            const noteIds = searchResults.map(r => r.noteId);
+            const uniqueNoteIds = [...new Set(noteIds)];
+
+            expect(noteIds.length).toBe(uniqueNoteIds.length);
+            expect(findNoteByTitle(searchResults, "Test Document")).toBeTruthy();
+            expect(findNoteByTitle(searchResults, "Tset Report")).toBeTruthy();
+        });
+    });
+
+    describe("Result Sufficiency Thresholds", () => {
+        it("should respect minimum result count threshold", () => {
+            // Create exactly 4 high-quality results (below threshold of 5)
+            rootNote
+                .child(note("Test One"))
+                .child(note("Test Two"))
+                .child(note("Test Three"))
+                .child(note("Test Four"))
+                .child(note("Tset Five")); // Typo that should be found via fuzzy
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("test", searchContext);
+
+            // Should proceed to Phase 2 and include fuzzy match
+            expect(searchResults.length).toBe(5);
+            expect(findNoteByTitle(searchResults, "Tset Five")).toBeTruthy();
+        });
+
+        it("should respect minimum quality score threshold", () => {
+            // Create notes that might have low exact match scores
+            rootNote
+                .child(note("Testing Document")) // Should have decent score
+                .child(note("Document with test inside")) // Lower score due to position
+                .child(note("Another test case"))
+                .child(note("Test case example"))
+                .child(note("Tset with typo")); // Fuzzy match
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("test", searchContext);
+
+            // Should include fuzzy results if exact results don't meet quality threshold
+            expect(searchResults.length).toBeGreaterThan(4);
+        });
+    });
+
+    describe("Fuzzy Score Management", () => {
+        it("should cap fuzzy token scores to prevent outranking exact matches", () => {
+            // Create note with exact match
+            rootNote.child(note("Test Document"));
+            // Create note that could accumulate high fuzzy scores
+            rootNote.child(note("Tset Documnt with many fuzzy tockens for testng")); // Multiple typos
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("test document", searchContext);
+
+            expect(searchResults.length).toBe(2);
+
+            // Find the exact and fuzzy match results
+            const exactResult = searchResults.find(r => becca.notes[r.noteId].title === "Test Document");
+            const fuzzyResult = searchResults.find(r => becca.notes[r.noteId].title.includes("Tset"));
+
+            expect(exactResult).toBeTruthy();
+            expect(fuzzyResult).toBeTruthy();
+
+            // Exact match should always score higher than fuzzy, even with multiple fuzzy matches
+            expect(exactResult!.score).toBeGreaterThan(fuzzyResult!.score);
+        });
+
+        it("should enforce maximum total fuzzy score per search", () => {
+            // Create note with many potential fuzzy matches
+            rootNote.child(note("Tset Documnt Anaylsis Sumary Reportng")); // Many typos
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("test document analysis summary reporting", searchContext);
+
+            expect(searchResults.length).toBe(1);
+
+            // Total score should be bounded despite many fuzzy matches
+            expect(searchResults[0].score).toBeLessThan(500); // Should not exceed reasonable bounds due to caps
+        });
+    });
+
+    describe("SearchContext Integration", () => {
+        it("should respect enableFuzzyMatching flag", () => {
+            rootNote
+                .child(note("Test Document"))
+                .child(note("Tset Report")); // Typo
+
+            // Test with fuzzy matching disabled
+            const exactOnlyContext = new SearchContext();
+            exactOnlyContext.enableFuzzyMatching = false;
+
+            const exactResults = searchService.findResultsWithQuery("test", exactOnlyContext);
+            expect(exactResults.length).toBe(1);
+            expect(findNoteByTitle(exactResults, "Test Document")).toBeTruthy();
+            expect(findNoteByTitle(exactResults, "Tset Report")).toBeFalsy();
+
+            // Test with fuzzy matching enabled (default)
+            const fuzzyContext = new SearchContext();
+            const fuzzyResults = searchService.findResultsWithQuery("test", fuzzyContext);
+            expect(fuzzyResults.length).toBe(2);
+            expect(findNoteByTitle(fuzzyResults, "Tset Report")).toBeTruthy();
+        });
+    });
+
+    describe("Edge Cases", () => {
+        it("should handle empty search results gracefully", () => {
+            rootNote.child(note("Unrelated Content"));
+
+            const searchContext = new SearchContext();
+            const searchResults = searchService.findResultsWithQuery("nonexistent", searchContext);
+
+            expect(searchResults.length).toBe(0);
+        });
+    });
+});
@@ -553,6 +553,70 @@ describe("Search", () => {
         expect(becca.notes[searchResults[0].noteId].title).toEqual("Reddit is bad");
     });

+    it("search completes in reasonable time", () => {
+        // Create a moderate-sized dataset to test performance
+        const countries = ["Austria", "Belgium", "Croatia", "Denmark", "Estonia", "Finland", "Germany", "Hungary", "Ireland", "Japan"];
+        const europeanCountries = note("Europe");
+
+        countries.forEach(country => {
+            europeanCountries.child(note(country).label("type", "country").label("continent", "Europe"));
+        });
+
+        rootNote.child(europeanCountries);
+
+        const searchContext = new SearchContext();
+        const startTime = Date.now();
+
+        // Perform a search that exercises multiple features
+        const searchResults = searchService.findResultsWithQuery("#type=country AND continent", searchContext);
+
+        const endTime = Date.now();
+        const duration = endTime - startTime;
+
+        // Search should complete in under 1 second for reasonable dataset
+        expect(duration).toBeLessThan(1000);
+        expect(searchResults.length).toEqual(10);
+    });
+
+    it("progressive search always puts exact matches before fuzzy matches", () => {
+        rootNote
+            .child(note("Analysis Report")) // Exact match
+            .child(note("Data Analysis")) // Exact match
+            .child(note("Test Analysis")) // Exact match
+            .child(note("Advanced Anaylsis")) // Fuzzy match (typo)
+            .child(note("Quick Anlaysis")); // Fuzzy match (typo)
+
+        const searchContext = new SearchContext();
+        const searchResults = searchService.findResultsWithQuery("analysis", searchContext);
+
+        // With only 3 exact matches (below threshold), fuzzy should be triggered
+        // Should find all 5 matches but exact ones should come first
+        expect(searchResults.length).toEqual(5);
+
+        // Get note titles in result order
+        const resultTitles = searchResults.map(r => becca.notes[r.noteId].title);
+
+        // Find all exact matches (contain "analysis")
+        const exactMatchIndices = resultTitles.map((title, index) =>
+            title.toLowerCase().includes("analysis") ? index : -1
+        ).filter(index => index !== -1);
+
+        // Find all fuzzy matches (contain typos)
+        const fuzzyMatchIndices = resultTitles.map((title, index) =>
+            (title.includes("Anaylsis") || title.includes("Anlaysis")) ? index : -1
+        ).filter(index => index !== -1);
+
+        expect(exactMatchIndices.length).toEqual(3);
+        expect(fuzzyMatchIndices.length).toEqual(2);
+
+        // CRITICAL: All exact matches must appear before all fuzzy matches
+        const lastExactIndex = Math.max(...exactMatchIndices);
+        const firstFuzzyIndex = Math.min(...fuzzyMatchIndices);
+
+        expect(lastExactIndex).toBeLessThan(firstFuzzyIndex);
+    });
+
     // FIXME: test what happens when we order without any filter criteria

     // it("comparison between labels", () => {
@@ -17,6 +17,8 @@ import type { SearchParams, TokenStructure } from "./types.js";
 import type Expression from "../expressions/expression.js";
 import sql from "../../sql.js";
 import scriptService from "../../script.js";
+import striptags from "striptags";
+import protectedSessionService from "../../protected_session.js";

 export interface SearchNoteResult {
     searchResultNoteIds: string[];
@@ -235,6 +237,41 @@ function findResultsWithExpression(expression: Expression, searchContext: SearchContext): SearchResult[] {
         loadNeededInfoFromDatabase();
     }

+    // If there's an explicit orderBy clause, skip progressive search
+    // as it would interfere with the ordering
+    if (searchContext.orderBy) {
+        // For ordered queries, don't use progressive search but respect
+        // the original fuzzy matching setting
+        return performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
+    }
+
+    // If fuzzy matching is explicitly disabled, skip progressive search
+    if (!searchContext.enableFuzzyMatching) {
+        return performSearch(expression, searchContext, false);
+    }
+
+    // Phase 1: Try exact matches first (without fuzzy matching)
+    const exactResults = performSearch(expression, searchContext, false);
+
+    // Check if we have sufficient high-quality results
+    const minResultThreshold = 5;
+    const minScoreForQuality = 10; // Minimum score to consider a result "high quality"
+
+    const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality);
+
+    // If we have enough high-quality exact matches, return them
+    if (highQualityResults.length >= minResultThreshold) {
+        return exactResults;
+    }
+
+    // Phase 2: Add fuzzy matching as fallback when exact matches are insufficient
+    const fuzzyResults = performSearch(expression, searchContext, true);
+
+    // Merge results, ensuring exact matches always rank higher than fuzzy matches
+    return mergeExactAndFuzzyResults(exactResults, fuzzyResults);
+}
+
+function performSearch(expression: Expression, searchContext: SearchContext, enableFuzzyMatching: boolean): SearchResult[] {
     const allNoteSet = becca.getAllNoteSet();

     const noteIdToNotePath: Record<string, string[]> = {};
@@ -242,6 +279,10 @@ function findResultsWithExpression(expression: Expression, searchContext: SearchContext): SearchResult[] {
         noteIdToNotePath
     };

+    // Store original fuzzy setting and temporarily override it
+    const originalFuzzyMatching = searchContext.enableFuzzyMatching;
+    searchContext.enableFuzzyMatching = enableFuzzyMatching;
+
     const noteSet = expression.execute(allNoteSet, executionContext, searchContext);

     const searchResults = noteSet.notes.map((note) => {
@@ -255,9 +296,12 @@ function findResultsWithExpression(expression: Expression, searchContext: SearchContext): SearchResult[] {
     });

     for (const res of searchResults) {
-        res.computeScore(searchContext.fulltextQuery, searchContext.highlightedTokens);
+        res.computeScore(searchContext.fulltextQuery, searchContext.highlightedTokens, enableFuzzyMatching);
     }

+    // Restore original fuzzy setting
+    searchContext.enableFuzzyMatching = originalFuzzyMatching;
+
     if (!noteSet.sorted) {
         searchResults.sort((a, b) => {
             if (a.score > b.score) {
@@ -279,6 +323,49 @@ function findResultsWithExpression(expression: Expression, searchContext: SearchContext): SearchResult[] {
     return searchResults;
 }

+function mergeExactAndFuzzyResults(exactResults: SearchResult[], fuzzyResults: SearchResult[]): SearchResult[] {
+    // Create a map of exact result note IDs for deduplication
+    const exactNoteIds = new Set(exactResults.map(result => result.noteId));
+
+    // Add fuzzy results that aren't already in exact results
+    const additionalFuzzyResults = fuzzyResults.filter(result => !exactNoteIds.has(result.noteId));
+
+    // Sort exact results by score (best exact matches first)
+    exactResults.sort((a, b) => {
+        if (a.score > b.score) {
+            return -1;
+        } else if (a.score < b.score) {
+            return 1;
+        }
+
+        // if score does not decide then sort results by depth of the note.
+        if (a.notePathArray.length === b.notePathArray.length) {
+            return a.notePathTitle < b.notePathTitle ? -1 : 1;
+        }
+
+        return a.notePathArray.length < b.notePathArray.length ? -1 : 1;
+    });
+
+    // Sort fuzzy results by score (best fuzzy matches first)
+    additionalFuzzyResults.sort((a, b) => {
+        if (a.score > b.score) {
+            return -1;
+        } else if (a.score < b.score) {
+            return 1;
+        }
+
+        // if score does not decide then sort results by depth of the note.
+        if (a.notePathArray.length === b.notePathArray.length) {
+            return a.notePathTitle < b.notePathTitle ? -1 : 1;
+        }
+
+        return a.notePathArray.length < b.notePathArray.length ? -1 : 1;
+    });
+
+    // CRITICAL: Always put exact matches before fuzzy matches, regardless of scores
+    return [...exactResults, ...additionalFuzzyResults];
+}
+
 function parseQueryToExpression(query: string, searchContext: SearchContext) {
     const { fulltextQuery, fulltextTokens, expressionTokens } = lex(query);
     searchContext.fulltextQuery = fulltextQuery;
@@ -328,6 +415,16 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): SearchResult[] {
         return [];
     }

+    // If the query starts with '#', it's a pure expression query.
+    // Don't use progressive search for these as they may have complex
+    // ordering or other logic that shouldn't be interfered with.
+    const isPureExpressionQuery = query.trim().startsWith('#');
+
+    if (isPureExpressionQuery) {
+        // For pure expression queries, use standard search without progressive phases
+        return performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
+    }
+
     return findResultsWithExpression(expression, searchContext);
 }
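For illustration (not part of the patch), the progressive strategy from a caller's perspective, grounded in the code above and the new test suite:

    // Exact-only search: disable the fuzzy phase entirely.
    const exactOnlyContext = new SearchContext();
    exactOnlyContext.enableFuzzyMatching = false;
    const exactResults = searchService.findResultsWithQuery("kubernetes", exactOnlyContext);

    // Default: Phase 1 (exact) runs first; Phase 2 (fuzzy) only fires when fewer
    // than 5 results score >= 10, and fuzzy hits are appended after all exact hits.
    const progressive = searchService.findResultsWithQuery("kubernetes", new SearchContext());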
@@ -337,6 +434,91 @@ function findFirstNoteWithQuery(query: string, searchContext: SearchContext): BNote | null {
     return searchResults.length > 0 ? becca.notes[searchResults[0].noteId] : null;
 }

+function extractContentSnippet(noteId: string, searchTokens: string[], maxLength: number = 200): string {
+    const note = becca.notes[noteId];
+    if (!note) {
+        return "";
+    }
+
+    // Only extract content for text-based notes
+    if (!["text", "code", "mermaid", "canvas", "mindMap"].includes(note.type)) {
+        return "";
+    }
+
+    try {
+        let content = note.getContent();
+
+        if (!content || typeof content !== "string") {
+            return "";
+        }
+
+        // Handle protected notes
+        if (note.isProtected && protectedSessionService.isProtectedSessionAvailable()) {
+            try {
+                content = protectedSessionService.decryptString(content) || "";
+            } catch (e) {
+                return ""; // Can't decrypt, don't show content
+            }
+        } else if (note.isProtected) {
+            return ""; // Protected but no session available
+        }
+
+        // Strip HTML tags for text notes
+        if (note.type === "text") {
+            content = striptags(content);
+        }
+
+        // Normalize whitespace
+        content = content.replace(/\s+/g, " ").trim();
+
+        if (!content) {
+            return "";
+        }
+
+        // Try to find a snippet around the first matching token
+        const normalizedContent = normalizeString(content.toLowerCase());
+        let snippetStart = 0;
+        let matchFound = false;
+
+        for (const token of searchTokens) {
+            const normalizedToken = normalizeString(token.toLowerCase());
+            const matchIndex = normalizedContent.indexOf(normalizedToken);
+
+            if (matchIndex !== -1) {
+                // Center the snippet around the match
+                snippetStart = Math.max(0, matchIndex - maxLength / 2);
+                matchFound = true;
+                break;
+            }
+        }
+
+        // Extract snippet
+        let snippet = content.substring(snippetStart, snippetStart + maxLength);
+
+        // Try to start/end at word boundaries
+        if (snippetStart > 0) {
+            const firstSpace = snippet.indexOf(" ");
+            if (firstSpace > 0 && firstSpace < 20) {
+                snippet = snippet.substring(firstSpace + 1);
+            }
+            snippet = "..." + snippet;
+        }
+
+        if (snippetStart + maxLength < content.length) {
+            const lastSpace = snippet.lastIndexOf(" ");
+            if (lastSpace > snippet.length - 20) {
+                snippet = snippet.substring(0, lastSpace);
+            }
+            snippet = snippet + "...";
+        }
+
+        return snippet;
+    } catch (e) {
+        log.error(`Error extracting content snippet for note ${noteId}: ${e}`);
+        return "";
+    }
+}
+
 function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
     const searchContext = new SearchContext({
         fastSearch: fastSearch,
@@ -351,6 +533,11 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {

     const trimmed = allSearchResults.slice(0, 200);

+    // Extract content snippets
+    for (const result of trimmed) {
+        result.contentSnippet = extractContentSnippet(result.noteId, searchContext.highlightedTokens);
+    }
+
     highlightSearchResults(trimmed, searchContext.highlightedTokens, searchContext.ignoreInternalAttributes);

     return trimmed.map((result) => {
@@ -360,6 +547,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
             noteTitle: title,
             notePathTitle: result.notePathTitle,
             highlightedNotePathTitle: result.highlightedNotePathTitle,
+            contentSnippet: result.contentSnippet,
+            highlightedContentSnippet: result.highlightedContentSnippet,
             icon: icon ?? "bx bx-note"
         };
     });
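For illustration (not part of the patch), the snippet window arithmetic with the default maxLength of 200; the token and index are hypothetical:

    // Match of "kubernetes" found at index 500 in the normalized content:
    //   snippetStart = max(0, 500 - 200 / 2) = 400
    //   snippet      = content.substring(400, 600), then trimmed to nearby word
    //   boundaries and wrapped as "...snippet..." when it starts or ends mid-content.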
@@ -381,26 +570,11 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens: string[], ignoreInternalAttributes = false) {
     highlightedTokens.sort((a, b) => (a.length > b.length ? -1 : 1));

     for (const result of searchResults) {
-        const note = becca.notes[result.noteId];
-
         result.highlightedNotePathTitle = result.notePathTitle.replace(/[<{}]/g, "");

-        if (highlightedTokens.find((token) => note.type.includes(token))) {
-            result.highlightedNotePathTitle += ` "type: ${note.type}'`;
-        }
-
-        if (highlightedTokens.find((token) => note.mime.includes(token))) {
-            result.highlightedNotePathTitle += ` "mime: ${note.mime}'`;
-        }
-
-        for (const attr of note.getAttributes()) {
-            if (attr.type === "relation" && attr.name === "internalLink" && ignoreInternalAttributes) {
-                continue;
-            }
-
-            if (highlightedTokens.find((token) => normalize(attr.name).includes(token) || normalize(attr.value).includes(token))) {
-                result.highlightedNotePathTitle += ` "${formatAttribute(attr)}'`;
-            }
-        }
+        // Initialize highlighted content snippet
+        if (result.contentSnippet) {
+            result.highlightedContentSnippet = escapeHtml(result.contentSnippet).replace(/[<{}]/g, "");
+        }
     }
@@ -419,40 +593,36 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens
const tokenRegex = new RegExp(escapeRegExp(token), "gi");
let match;

// Find all matches
if (!result.highlightedNotePathTitle) {
    continue;
}

while ((match = tokenRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) {
    result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");

    // 2 characters are added, so we need to adjust the index
    tokenRegex.lastIndex += 2;
}

// Highlight in note path title
if (result.highlightedNotePathTitle) {
    const titleRegex = new RegExp(escapeRegExp(token), "gi");
    while ((match = titleRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) {
        result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");
        // 2 characters are added, so we need to adjust the index
        titleRegex.lastIndex += 2;
    }
}

// Highlight in content snippet
if (result.highlightedContentSnippet) {
    const contentRegex = new RegExp(escapeRegExp(token), "gi");
    while ((match = contentRegex.exec(normalizeString(result.highlightedContentSnippet))) !== null) {
        result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}");
        // 2 characters are added, so we need to adjust the index
        contentRegex.lastIndex += 2;
    }
}
for (const result of searchResults) {
    if (!result.highlightedNotePathTitle) {
        continue;
    }

    result.highlightedNotePathTitle = result.highlightedNotePathTitle.replace(/"/g, "<small>").replace(/'/g, "</small>").replace(/{/g, "<b>").replace(/}/g, "</b>");
}

for (const result of searchResults) {
    if (result.highlightedNotePathTitle) {
        result.highlightedNotePathTitle = result.highlightedNotePathTitle.replace(/{/g, "<b>").replace(/}/g, "</b>");
    }

    if (result.highlightedContentSnippet) {
        result.highlightedContentSnippet = result.highlightedContentSnippet.replace(/{/g, "<b>").replace(/}/g, "</b>");
    }
}

function formatAttribute(attr: BAttribute) {
    if (attr.type === "relation") {
        return `~${escapeHtml(attr.name)}=…`;
    } else if (attr.type === "label") {
        let label = `#${escapeHtml(attr.name)}`;

        if (attr.value) {
            const val = /[^\w-]/.test(attr.value) ? `"${attr.value}"` : attr.value;

            label += `=${escapeHtml(val)}`;
        }

        return label;
    }
}
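The loops above lean on two helpers that are not part of this diff, wrapText and normalizeString. As a sketch of the contract wrapText is assumed to fulfil (a hypothetical reimplementation, not the project's actual code):

// Hypothetical sketch: splice `before`/`after` markers around the match at `index`.
function wrapText(text: string, index: number, length: number, before: string, after: string): string {
    return text.substring(0, index) + before + text.substring(index, index + length) + after + text.substring(index + length);
}

// wrapText("hello world", 6, 5, "{", "}") === "hello {world}"
// The two inserted marker characters are why each highlighting loop advances
// the regex lastIndex by 2; the "{"/"}" markers are later rewritten to <b>/</b>.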
65 apps/server/src/services/search/utils/text_utils.spec.ts Normal file
@@ -0,0 +1,65 @@
import { describe, it, expect } from "vitest";
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';

describe('Fuzzy Search Core', () => {
    describe('calculateOptimizedEditDistance', () => {
        it('calculates edit distance for common typos', () => {
            expect(calculateOptimizedEditDistance('hello', 'helo')).toBe(1);
            expect(calculateOptimizedEditDistance('world', 'wrold')).toBe(2);
            expect(calculateOptimizedEditDistance('cafe', 'café')).toBe(1);
            expect(calculateOptimizedEditDistance('identical', 'identical')).toBe(0);
        });

        it('handles performance safety with oversized input', () => {
            const longString = 'a'.repeat(2000);
            const result = calculateOptimizedEditDistance(longString, 'short');
            expect(result).toBeGreaterThan(2); // Should use fallback heuristic
        });
    });

    describe('validateFuzzySearchTokens', () => {
        it('validates minimum length requirements for fuzzy operators', () => {
            const result1 = validateFuzzySearchTokens(['ab'], '~=');
            expect(result1.isValid).toBe(false);
            expect(result1.error).toContain('at least 3 characters');

            const result2 = validateFuzzySearchTokens(['hello'], '~=');
            expect(result2.isValid).toBe(true);

            const result3 = validateFuzzySearchTokens(['ok'], '=');
            expect(result3.isValid).toBe(true); // Non-fuzzy operators allow short tokens
        });

        it('validates token types and empty arrays', () => {
            expect(validateFuzzySearchTokens([], '=')).toEqual({
                isValid: false,
                error: 'Invalid tokens: at least one token is required'
            });

            expect(validateFuzzySearchTokens([''], '=')).toEqual({
                isValid: false,
                error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
            });
        });
    });

    describe('fuzzyMatchWord', () => {
        it('matches words with diacritics normalization', () => {
            expect(fuzzyMatchWord('cafe', 'café')).toBe(true);
            expect(fuzzyMatchWord('naive', 'naïve')).toBe(true);
        });

        it('matches with typos within distance threshold', () => {
            expect(fuzzyMatchWord('hello', 'helo')).toBe(true);
            expect(fuzzyMatchWord('world', 'wrold')).toBe(true);
            expect(fuzzyMatchWord('test', 'tset')).toBe(true);
            expect(fuzzyMatchWord('test', 'xyz')).toBe(false);
        });

        it('handles edge cases safely', () => {
            expect(fuzzyMatchWord('', 'test')).toBe(false);
            expect(fuzzyMatchWord('test', '')).toBe(false);
            expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
        });
    });
});
334 apps/server/src/services/search/utils/text_utils.ts Normal file
@@ -0,0 +1,334 @@
"use strict";

import { normalize } from "../../utils.js";

/**
 * Shared text processing utilities for search functionality
 */

// Configuration constants for fuzzy matching
export const FUZZY_SEARCH_CONFIG = {
    // Minimum token length for fuzzy operators to prevent false positives
    MIN_FUZZY_TOKEN_LENGTH: 3,
    // Maximum edit distance for fuzzy matching
    MAX_EDIT_DISTANCE: 2,
    // Maximum proximity distance for phrase matching (in words)
    MAX_PHRASE_PROXIMITY: 10,
    // Absolute hard limits for extreme cases - only to prevent system crashes
    ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
    ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
    // Performance warning thresholds - inform the user but still attempt the search
    PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
    PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
    // Progressive processing thresholds for very large content
    PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
    PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
    // Performance thresholds
    EARLY_TERMINATION_THRESHOLD: 3,
} as const;
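To make the tiered size limits concrete, here is an illustrative helper showing how the byte thresholds partition content; classifyContentSize is hypothetical and not part of the commit:

// Illustrative only: how the size thresholds partition content (hypothetical helper).
function classifyContentSize(bytes: number): string {
    if (bytes > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) return "truncate";      // > 100MB
    if (bytes > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE) return "progressive"; // > 10MB
    if (bytes > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) return "warn";           // > 5MB
    return "normal";
}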
/**
 * Normalizes text by removing diacritics and converting to lowercase.
 * This is the centralized text normalization function used across all search components.
 * Uses the shared normalize function from utils for consistency.
 *
 * Examples:
 * - "café" -> "cafe"
 * - "naïve" -> "naive"
 * - "HELLO WORLD" -> "hello world"
 *
 * @param text The text to normalize
 * @returns The normalized text
 */
export function normalizeSearchText(text: string): string {
    if (!text || typeof text !== 'string') {
        return '';
    }

    // Use shared normalize function for consistency across the codebase
    return normalize(text);
}
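A quick illustration of the normalization contract, restating the examples from the doc comment:

// Illustrative only - mirrors the documented examples.
normalizeSearchText("café");         // "cafe"
normalizeSearchText("naïve");        // "naive"
normalizeSearchText("HELLO WORLD");  // "hello world"
normalizeSearchText("");             // "" (missing or non-string input collapses to "")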
/**
 * Optimized edit distance calculation using a rolling pair of rows and early termination.
 * This is significantly more memory-efficient than the full 2D matrix approach and includes
 * early-termination optimizations for better performance.
 *
 * @param str1 First string
 * @param str2 Second string
 * @param maxDistance Maximum allowed distance (for early termination)
 * @returns The edit distance between the strings, or maxDistance + 1 if exceeded
 */
export function calculateOptimizedEditDistance(str1: string, str2: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): number {
    // Input validation
    if (typeof str1 !== 'string' || typeof str2 !== 'string') {
        throw new Error('Both arguments must be strings');
    }

    if (maxDistance < 0 || !Number.isInteger(maxDistance)) {
        throw new Error('maxDistance must be a non-negative integer');
    }

    const len1 = str1.length;
    const len2 = str2.length;

    // Performance guard: if strings are too long, limit processing
    const maxStringLength = 1000;
    if (len1 > maxStringLength || len2 > maxStringLength) {
        // For very long strings, fall back to a simple length-based heuristic
        return Math.abs(len1 - len2) <= maxDistance ? Math.abs(len1 - len2) : maxDistance + 1;
    }

    // Early termination: if the length difference already exceeds the max distance
    if (Math.abs(len1 - len2) > maxDistance) {
        return maxDistance + 1;
    }

    // Handle edge cases
    if (len1 === 0) return len2 <= maxDistance ? len2 : maxDistance + 1;
    if (len2 === 0) return len1 <= maxDistance ? len1 : maxDistance + 1;

    // Use the two-row optimization for better memory usage
    let previousRow = Array.from({ length: len2 + 1 }, (_, i) => i);
    let currentRow = new Array(len2 + 1);

    for (let i = 1; i <= len1; i++) {
        currentRow[0] = i;
        let minInRow = i;

        for (let j = 1; j <= len2; j++) {
            const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
            currentRow[j] = Math.min(
                previousRow[j] + 1, // deletion
                currentRow[j - 1] + 1, // insertion
                previousRow[j - 1] + cost // substitution
            );

            // Track the minimum value in the current row for early termination
            if (currentRow[j] < minInRow) {
                minInRow = currentRow[j];
            }
        }

        // Early termination: if the minimum distance in this row exceeds the threshold
        if (minInRow > maxDistance) {
            return maxDistance + 1;
        }

        // Swap rows for the next iteration
        [previousRow, currentRow] = [currentRow, previousRow];
    }

    const result = previousRow[len2];
    return result <= maxDistance ? result : maxDistance + 1;
}
/**
 * Validates that tokens meet the minimum requirements for fuzzy operators.
 *
 * @param tokens Array of search tokens
 * @param operator The search operator being used
 * @returns Validation result with success status and error message
 */
export function validateFuzzySearchTokens(tokens: string[], operator: string): { isValid: boolean; error?: string } {
    if (!operator || typeof operator !== 'string') {
        return {
            isValid: false,
            error: 'Invalid operator: operator must be a non-empty string'
        };
    }

    if (!Array.isArray(tokens)) {
        return {
            isValid: false,
            error: 'Invalid tokens: tokens must be an array'
        };
    }

    if (tokens.length === 0) {
        return {
            isValid: false,
            error: 'Invalid tokens: at least one token is required'
        };
    }

    // Check for null, undefined, or non-string tokens
    const invalidTypeTokens = tokens.filter(token =>
        token == null || typeof token !== 'string'
    );

    if (invalidTypeTokens.length > 0) {
        return {
            isValid: false,
            error: 'Invalid tokens: all tokens must be non-null strings'
        };
    }

    // Check for empty string tokens
    const emptyTokens = tokens.filter(token => token.trim().length === 0);

    if (emptyTokens.length > 0) {
        return {
            isValid: false,
            error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
        };
    }

    if (operator !== '~=' && operator !== '~*') {
        return { isValid: true };
    }

    // Check minimum token length for fuzzy operators
    const shortTokens = tokens.filter(token => token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH);

    if (shortTokens.length > 0) {
        return {
            isValid: false,
            error: `Fuzzy search operators (~=, ~*) require tokens of at least ${FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH} characters. Invalid tokens: ${shortTokens.join(', ')}`
        };
    }

    // Check for excessively long tokens that could cause performance issues
    const maxTokenLength = 100; // Reasonable limit for search tokens
    const longTokens = tokens.filter(token => token.length > maxTokenLength);

    if (longTokens.length > 0) {
        return {
            isValid: false,
            error: `Tokens are too long (max ${maxTokenLength} characters). Long tokens: ${longTokens.map(t => t.substring(0, 20) + '...').join(', ')}`
        };
    }

    return { isValid: true };
}
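Callers are expected to check isValid before building an expression, which is what the NoteContentFulltextExp constructor does; a minimal sketch of that pattern:

// Minimal sketch of the expected calling pattern.
const check = validateFuzzySearchTokens(['ab'], '~=');
if (!check.isValid) {
    // error: "Fuzzy search operators (~=, ~*) require tokens of at least 3 characters. Invalid tokens: ab"
    throw new Error(check.error);
}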
/**
 * Validates and preprocesses content for search operations.
 * Philosophy: try to search everything! Only block truly extreme cases that could crash the system.
 *
 * @param content The content to validate and preprocess
 * @param noteId The note ID (for logging purposes)
 * @returns Processed content (possibly truncated at the extreme limits); null only when the input is missing or not a string
 */
export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
    if (!content || typeof content !== 'string') {
        return null;
    }

    // Only block content that could actually crash the system (100MB+)
    if (content.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) {
        console.error(`Content size exceeds absolute system limit for note ${noteId || 'unknown'}: ${content.length} bytes - this could cause system instability`);
        // Only in truly extreme cases, truncate to prevent a system crash
        return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
    }

    // Warn about very large content but still process it
    if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
        console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
    }

    // For word count, be even more permissive - only block truly extreme cases
    const wordCount = content.split(/\s+/).length;
    if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
        console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
        // Only in truly extreme cases, truncate to prevent a system crash
        return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
    }

    // Warn about high word counts but still process them
    if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
        console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
    }

    // Progressive-processing notice for very large content
    if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
        console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
    }

    return content;
}
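Because null only signals unusable input (oversized content is truncated rather than rejected), a simple null check suffices on the caller side. A sketch with hypothetical variable names:

// Sketch: `rawContent` and `noteId` are placeholders for the caller's values.
const processed = validateAndPreprocessContent(rawContent, noteId);
if (processed === null) {
    return; // nothing searchable
}
// `processed` is now safe to tokenize; it may have been truncated at the extreme limits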
/**
 * Escapes special regex characters in a string for use in the RegExp constructor
 */
function escapeRegExp(string: string): string {
    return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
/**
 * Checks if a word in the text matches the token with fuzzy matching and returns the matched word.
 * Optimized for the common case where distances are small.
 *
 * @param token The search token (should be normalized)
 * @param text The text to match against (should be normalized)
 * @param maxDistance Maximum allowed edit distance
 * @returns The matched word if found, null otherwise
 */
export function fuzzyMatchWordWithResult(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): string | null {
    // Input validation
    if (typeof token !== 'string' || typeof text !== 'string') {
        return null;
    }

    if (token.length === 0 || text.length === 0) {
        return null;
    }

    try {
        // Normalize both strings for comparison
        const normalizedToken = token.toLowerCase();
        const normalizedText = text.toLowerCase();

        // Exact match check first (most common case)
        if (normalizedText.includes(normalizedToken)) {
            // Find the exact match in the original text to preserve case
            const exactMatch = text.match(new RegExp(escapeRegExp(token), 'i'));
            return exactMatch ? exactMatch[0] : token;
        }

        // For fuzzy matching, check the individual words in the text:
        // split the text into words and compare each word against the token
        const words = normalizedText.split(/\s+/).filter(word => word.length > 0);
        const originalWords = text.split(/\s+/).filter(word => word.length > 0);

        for (let i = 0; i < words.length; i++) {
            const word = words[i];
            const originalWord = originalWords[i];

            // Skip if the word's length is too different for fuzzy matching
            if (Math.abs(word.length - normalizedToken.length) > maxDistance) {
                continue;
            }

            // Be stricter for very short tokens or very different lengths.
            // Note: this means tokens shorter than 4 characters never reach the
            // edit-distance check and can only match via the exact-substring path above.
            if (normalizedToken.length < 4 || Math.abs(word.length - normalizedToken.length) > 2) {
                continue;
            }

            // Use the optimized edit distance calculation
            const distance = calculateOptimizedEditDistance(normalizedToken, word, maxDistance);
            if (distance <= maxDistance) {
                return originalWord; // Return the original word with case preserved
            }
        }

        return null;
    } catch (error) {
        // Log the error and return null for safety
        console.warn('Error in fuzzy word matching:', error);
        return null;
    }
}
/**
 * Checks if any word in the text matches the token with fuzzy matching.
 * Optimized for the common case where distances are small.
 *
 * @param token The search token (should be normalized)
 * @param text The text to match against (should be normalized)
 * @param maxDistance Maximum allowed edit distance
 * @returns True if a word in the text matches the token within the distance threshold
 */
export function fuzzyMatchWord(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): boolean {
    return fuzzyMatchWordWithResult(token, text, maxDistance) !== null;
}
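Putting the two entry points together: fuzzyMatchWord answers yes/no, while fuzzyMatchWordWithResult also returns the case-preserved word so a highlighter can reuse it. Illustrative calls:

fuzzyMatchWord("hello", "Helo world");            // true (edit distance 1)
fuzzyMatchWordWithResult("hello", "Helo world");  // "Helo" (original casing preserved)
fuzzyMatchWordWithResult("hello", "goodbye");     // null (distance exceeds threshold)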