Revert "feat(search): I honestly have no idea what I'm doing"

This reverts commit b09a2c386d.
This commit is contained in:
perf3ct
2025-09-02 19:24:44 +00:00
parent b09a2c386d
commit 8572f82e0a

View File

@@ -61,10 +61,9 @@ export class FTSQueryError extends FTSError {
* Configuration for FTS5 search
*/
const FTS_CONFIG = {
DEFAULT_LIMIT: 100000, // Increased for unlimited results
MAX_RESULTS: 10000000, // Support millions of notes
BATCH_SIZE: 1000,
FUZZY_THRESHOLD: 0.7 // Similarity threshold for fuzzy matching
DEFAULT_LIMIT: 100,
MAX_RESULTS: 10000,
BATCH_SIZE: 1000
};
/**
@@ -133,7 +132,7 @@ class FTSSearchService {
}
/**
* Perform synchronous FTS5 search with hybrid substring and fuzzy support
* Perform synchronous FTS5 search
*/
searchSync(
tokens: string[],
@@ -145,18 +144,11 @@ class FTSSearchService {
throw new FTSNotAvailableError();
}
const limit = options.limit || FTS_CONFIG.DEFAULT_LIMIT;
const limit = Math.min(options.limit || FTS_CONFIG.DEFAULT_LIMIT, FTS_CONFIG.MAX_RESULTS);
const offset = options.offset || 0;
try {
// Special handling for substring and fuzzy operators
if (operator === '*=*') {
return this.hybridSubstringSearch(tokens, noteIds, limit, offset);
} else if (operator === '~=' || operator === '~*') {
return this.fuzzySearch(tokens, operator, noteIds, limit, offset);
}
// Standard FTS5 search for other operators
// Build FTS5 query based on operator
let ftsQuery = this.buildFTSQuery(tokens, operator);
// Build SQL query
@@ -210,208 +202,6 @@ class FTSSearchService {
}
}
/**
* Hybrid substring search using FTS5 for initial filtering and LIKE for exact substring matching
* Optimized for millions of notes
*/
private hybridSubstringSearch(
tokens: string[],
noteIds?: Set<string>,
limit: number = FTS_CONFIG.DEFAULT_LIMIT,
offset: number = 0
): FTSSearchResult[] {
try {
// Step 1: Create FTS query to find notes containing any of the tokens as whole words
// This dramatically reduces the search space for LIKE operations
const ftsQuery = tokens.map(t => `"${t.replace(/"/g, '""')}"`).join(' OR ');
// Step 2: Build LIKE conditions for true substring matching
// Use ESCAPE clause for proper handling of special characters
const likeConditions = tokens.map(token => {
const escapedToken = token.replace(/[_%\\]/g, '\\$&').replace(/'/g, "''");
return `(f.title LIKE '%${escapedToken}%' ESCAPE '\\' OR
f.content LIKE '%${escapedToken}%' ESCAPE '\\')`;
}).join(' AND ');
let query: string;
let params: any[] = [];
if (noteIds && noteIds.size > 0) {
// Use WITH clause for better query optimization with large noteId sets
const noteIdList = Array.from(noteIds);
const placeholders = noteIdList.map(() => '?').join(',');
query = `
WITH filtered_notes AS (
SELECT noteId FROM (VALUES ${noteIdList.map(() => '(?)').join(',')}) AS t(noteId)
)
SELECT DISTINCT
f.noteId,
n.title,
CASE
WHEN ${tokens.map(t => `f.title LIKE '%${t.replace(/'/g, "''")}%' ESCAPE '\\'`).join(' AND ')}
THEN -1000 -- Prioritize title matches
ELSE -rank
END as score
FROM notes_fts f
JOIN notes n ON n.noteId = f.noteId
JOIN filtered_notes fn ON fn.noteId = f.noteId
WHERE notes_fts MATCH ?
AND (${likeConditions})
AND n.isDeleted = 0
AND n.isProtected = 0
ORDER BY score
LIMIT ? OFFSET ?
`;
params = [...noteIdList, ftsQuery, limit, offset];
} else {
// Full search without noteId filtering
query = `
SELECT DISTINCT
f.noteId,
n.title,
CASE
WHEN ${tokens.map(t => `f.title LIKE '%${t.replace(/'/g, "''")}%' ESCAPE '\\'`).join(' AND ')}
THEN -1000 -- Prioritize title matches
ELSE -rank
END as score
FROM notes_fts f
JOIN notes n ON n.noteId = f.noteId
WHERE notes_fts MATCH ?
AND (${likeConditions})
AND n.isDeleted = 0
AND n.isProtected = 0
ORDER BY score
LIMIT ? OFFSET ?
`;
params = [ftsQuery, limit, offset];
}
const results = sql.getRows<FTSSearchResult>(query, params);
return results || [];
} catch (error: any) {
log.error(`Hybrid substring search failed: ${error.message}`);
throw new FTSError(`Substring search failed: ${error.message}`, 'FTS_SUBSTRING_ERROR');
}
}
/**
* Fuzzy search using SQLite's built-in soundex and edit distance capabilities
* Implements Levenshtein distance for true fuzzy matching
*/
private fuzzySearch(
tokens: string[],
operator: string,
noteIds?: Set<string>,
limit: number = FTS_CONFIG.DEFAULT_LIMIT,
offset: number = 0
): FTSSearchResult[] {
try {
// For fuzzy search, we use a combination of:
// 1. FTS5 OR query to get initial candidates
// 2. SQLite's editdist3 function if available, or fallback to soundex
const ftsQuery = tokens.map(t => {
const escaped = t.replace(/"/g, '""');
// Include the exact term and common variations
return `("${escaped}" OR "${escaped}*" OR "*${escaped}")`;
}).join(' OR ');
// Check if editdist3 is available (requires spellfix1 extension)
const hasEditDist = this.checkEditDistAvailability();
let query: string;
let params: any[] = [];
if (hasEditDist) {
// Use edit distance for true fuzzy matching
const editDistConditions = tokens.map(token => {
const escaped = token.replace(/'/g, "''");
// Calculate edit distance threshold based on token length
const threshold = Math.max(1, Math.floor(token.length * 0.3));
return `(
editdist3(LOWER(f.title), LOWER('${escaped}')) <= ${threshold} OR
editdist3(LOWER(SUBSTR(f.content, 1, 1000)), LOWER('${escaped}')) <= ${threshold}
)`;
}).join(operator === '~=' ? ' AND ' : ' OR ');
query = `
SELECT DISTINCT
f.noteId,
n.title,
MIN(${tokens.map(t => `editdist3(LOWER(f.title), LOWER('${t.replace(/'/g, "''")}'))`).join(', ')}) as score
FROM notes_fts f
JOIN notes n ON n.noteId = f.noteId
WHERE notes_fts MATCH ?
AND (${editDistConditions})
AND n.isDeleted = 0
AND n.isProtected = 0
GROUP BY f.noteId, n.title
ORDER BY score
LIMIT ? OFFSET ?
`;
} else {
// Fallback to soundex for basic phonetic matching
log.info("Edit distance not available, using soundex for fuzzy search");
const soundexConditions = tokens.map(token => {
const escaped = token.replace(/'/g, "''");
return `(
soundex(f.title) = soundex('${escaped}') OR
f.title LIKE '%${escaped}%' ESCAPE '\\' OR
f.content LIKE '%${escaped}%' ESCAPE '\\'
)`;
}).join(operator === '~=' ? ' AND ' : ' OR ');
query = `
SELECT DISTINCT
f.noteId,
n.title,
-rank as score
FROM notes_fts f
JOIN notes n ON n.noteId = f.noteId
WHERE notes_fts MATCH ?
AND (${soundexConditions})
AND n.isDeleted = 0
AND n.isProtected = 0
ORDER BY score
LIMIT ? OFFSET ?
`;
}
params = [ftsQuery, limit, offset];
// Add noteId filtering if specified
if (noteIds && noteIds.size > 0) {
const noteIdList = Array.from(noteIds).join("','");
query = query.replace(
'AND n.isDeleted = 0',
`AND f.noteId IN ('${noteIdList}') AND n.isDeleted = 0`
);
}
const results = sql.getRows<FTSSearchResult>(query, params);
return results || [];
} catch (error: any) {
log.error(`Fuzzy search failed: ${error.message}`);
// Fallback to simple substring search if fuzzy features aren't available
return this.hybridSubstringSearch(tokens, noteIds, limit, offset);
}
}
/**
* Check if edit distance function is available
*/
private checkEditDistAvailability(): boolean {
try {
// Try to use editdist3 function
sql.getValue(`SELECT editdist3('test', 'test')`);
return true;
} catch {
return false;
}
}
/**
* Search protected notes separately (not indexed in FTS)
*/
@@ -472,7 +262,7 @@ class FTSSearchService {
}
/**
* Sync missing notes to FTS index - optimized for millions of notes
* Sync missing notes to FTS index
*/
syncMissingNotes(): number {
if (!this.checkFTS5Availability()) {
@@ -480,11 +270,6 @@ class FTSSearchService {
}
try {
let totalSynced = 0;
let hasMore = true;
// Process in batches to handle millions of notes efficiently
while (hasMore) {
// Find notes that should be indexed but aren't
const missingNotes = sql.getRows<{noteId: string, title: string, content: string}>(`
SELECT n.noteId, n.title, b.content
@@ -496,70 +281,31 @@ class FTSSearchService {
AND n.isProtected = 0
AND b.content IS NOT NULL
AND f.noteId IS NULL
LIMIT ${FTS_CONFIG.BATCH_SIZE}
LIMIT 1000
`);
if (!missingNotes || missingNotes.length === 0) {
hasMore = false;
break;
return 0;
}
// Insert missing notes using efficient batch processing
sql.transactional(() => {
// Use batch insert for better performance
const batchInsertQuery = `
INSERT OR REPLACE INTO notes_fts (noteId, title, content)
VALUES ${missingNotes.map(() => '(?, ?, ?)').join(', ')}
`;
const params: any[] = [];
for (const note of missingNotes) {
params.push(note.noteId, note.title, note.content);
sql.execute(
`INSERT OR REPLACE INTO notes_fts (noteId, title, content) VALUES (?, ?, ?)`,
[note.noteId, note.title, note.content]
);
}
sql.execute(batchInsertQuery, params);
});
totalSynced += missingNotes.length;
// Log progress for large sync operations
if (totalSynced % 10000 === 0) {
log.info(`Synced ${totalSynced} notes to FTS index...`);
}
// Continue if we got a full batch
hasMore = missingNotes.length === FTS_CONFIG.BATCH_SIZE;
}
if (totalSynced > 0) {
log.info(`Completed syncing ${totalSynced} notes to FTS index`);
// Optimize the FTS index after large sync
if (totalSynced > 1000) {
this.optimizeIndex();
}
}
return totalSynced;
log.info(`Synced ${missingNotes.length} missing notes to FTS index`);
return missingNotes.length;
} catch (error) {
log.error(`Error syncing missing notes: ${error}`);
return 0;
}
}
/**
* Optimize FTS5 index for better performance
*/
optimizeIndex(): void {
try {
log.info("Optimizing FTS5 index...");
sql.execute(`INSERT INTO notes_fts(notes_fts) VALUES('optimize')`);
log.info("FTS5 index optimization completed");
} catch (error) {
log.error(`Error optimizing FTS5 index: ${error}`);
}
}
/**
* Build FTS5 query string from tokens and operator
*/
@@ -694,15 +440,31 @@ class FTSSearchService {
}
/**
* Get FTS index statistics
* Optimize FTS index (run during maintenance)
*/
getIndexStats(): { totalDocuments: number; indexSize: number } {
optimizeIndex(): void {
if (!this.checkFTS5Availability()) {
return { totalDocuments: 0, indexSize: 0 };
return;
}
try {
const totalDocuments = sql.getValue<number>(`
sql.execute(`INSERT INTO notes_fts(notes_fts) VALUES('optimize')`);
log.info("FTS5 index optimized");
} catch (error) {
log.error(`Error optimizing FTS5 index: ${error}`);
}
}
/**
* Get FTS index statistics
*/
getStatistics(): { documentCount: number; indexSize: number } {
if (!this.checkFTS5Availability()) {
return { documentCount: 0, indexSize: 0 };
}
try {
const documentCount = sql.getValue<number>(`
SELECT COUNT(*) FROM notes_fts
`) || 0;
@@ -713,13 +475,23 @@ class FTSSearchService {
WHERE name LIKE 'notes_fts%'
`) || 0;
return { totalDocuments, indexSize };
return { documentCount, indexSize };
} catch (error) {
log.error(`Error getting FTS statistics: ${error}`);
return { totalDocuments: 0, indexSize: 0 };
return { documentCount: 0, indexSize: 0 };
}
}
/**
* Get FTS index statistics (alias for getStatistics for API compatibility)
*/
getIndexStats(): { totalDocuments: number; indexSize: number } {
const stats = this.getStatistics();
return {
totalDocuments: stats.documentCount,
indexSize: stats.indexSize
};
}
/**
* Rebuild the entire FTS index from scratch
@@ -730,94 +502,44 @@ class FTSSearchService {
}
try {
log.info("Starting FTS index rebuild optimized for millions of notes...");
log.info("Starting FTS index rebuild");
// Clear existing index first
sql.transactional(() => {
// Clear existing index
sql.execute(`DELETE FROM notes_fts`);
// Get total count for progress reporting
const totalNotes = sql.getValue<number>(`
SELECT COUNT(*)
// Rebuild from all eligible notes
const notes = sql.getRows<{noteId: string, title: string, content: string}>(`
SELECT n.noteId, n.title, b.content
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
AND b.content IS NOT NULL
`) || 0;
`);
if (totalNotes === 0) {
log.info("No notes to index");
return;
}
log.info(`Rebuilding FTS index for ${totalNotes} notes...`);
let processedCount = 0;
let offset = 0;
if (notes && notes.length > 0) {
// Process in batches for better performance
const batchSize = FTS_CONFIG.BATCH_SIZE;
// Process in chunks to handle millions of notes without memory issues
while (offset < totalNotes) {
sql.transactional(() => {
const notesBatch = sql.getRows<{noteId: string, title: string, content: string}>(`
SELECT
n.noteId,
n.title,
b.content
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
AND b.content IS NOT NULL
ORDER BY n.noteId
LIMIT ? OFFSET ?
`, [batchSize, offset]);
for (let i = 0; i < notes.length; i += batchSize) {
const batch = notes.slice(i, i + batchSize);
if (!notesBatch || notesBatch.length === 0) {
return;
}
// Use batch insert for much better performance
if (notesBatch.length === 1) {
// Single insert
for (const note of batch) {
sql.execute(
`INSERT INTO notes_fts (noteId, title, content) VALUES (?, ?, ?)`,
[notesBatch[0].noteId, notesBatch[0].title, notesBatch[0].content]
[note.noteId, note.title, note.content]
);
} else {
// Batch insert
const batchInsertQuery = `
INSERT INTO notes_fts (noteId, title, content)
VALUES ${notesBatch.map(() => '(?, ?, ?)').join(', ')}
`;
const params: any[] = [];
for (const note of notesBatch) {
params.push(note.noteId, note.title, note.content);
}
}
sql.execute(batchInsertQuery, params);
log.info(`Rebuilt FTS index with ${notes.length} notes`);
}
processedCount += notesBatch.length;
});
offset += batchSize;
// Progress reporting for large rebuilds
if (processedCount % 10000 === 0 || processedCount >= totalNotes) {
const percentage = Math.round((processedCount / totalNotes) * 100);
log.info(`Indexed ${processedCount} of ${totalNotes} notes (${percentage}%)...`);
}
}
log.info(`FTS index rebuild completed. Indexed ${processedCount} notes.`);
// Optimize after rebuild
this.optimizeIndex();
} catch (error) {
log.error(`Error rebuilding FTS index: ${error}`);
throw new FTSError(`Failed to rebuild FTS index: ${error}`, 'FTS_REBUILD_ERROR');