feat: prefer HTML title tag over filename during import

When importing HTML files, extract and use the title from the <title> tag
if available, falling back to the filename only when no title tag is found.

This improves handling of titles with special characters that can't be
represented in filenames.
This commit is contained in:
maphew
2024-11-16 09:06:58 -07:00
parent 3ff75b14e9
commit 47c05b2c6d
2 changed files with 16 additions and 7 deletions

View File

@@ -149,15 +149,18 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote)
}
function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
const title = utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
let content = file.buffer.toString("utf-8");
if (taskContext?.data?.safeImport) {
content = htmlSanitizer.sanitize(content);
}
// Try to get title from HTML first, fall back to filename
const htmlTitle = importUtils.extractHtmlTitle(content);
const title = htmlTitle || utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
content = importUtils.handleH1(content, title);
const {note} = noteService.createNewNote({
parentNoteId: parentNote.noteId,
title,
@@ -166,9 +169,9 @@ function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
mime: 'text/html',
isProtected: parentNote.isProtected && protectedSessionService.isProtectedSessionAvailable(),
});
taskContext.increaseProgressCount();
return note;
}

View File

@@ -11,6 +11,12 @@ function handleH1(content: string, title: string) {
return content;
}
/**
 * Extracts the text of the first `<title>` element found in an HTML string.
 *
 * The tag name is matched case-insensitively and may carry attributes. Runs of
 * ASCII whitespace inside the title (e.g. a title wrapped across several
 * lines) are collapsed to a single space and the result is trimmed, so the
 * returned value is always a single-line string.
 *
 * HTML entities are left as-is; no decoding is performed here.
 *
 * @param content - HTML source to search.
 * @returns The normalized title, or `null` when there is no `<title>` element
 * or its text is empty / whitespace-only.
 */
function extractHtmlTitle(content: string): string | null {
    // Require whitespace or ">" right after the tag name so that unrelated
    // tags that merely start with "title" (e.g. <titlebar>) are not matched.
    const rawTitle = content.match(/<title(?:\s[^>]*)?>([^<]+)<\/title>/i)?.[1];
    if (rawTitle === undefined) {
        return null;
    }

    // Collapse line breaks and indentation inside the tag into single spaces.
    const normalizedTitle = rawTitle.replace(/[\t\n\f\r ]+/g, " ").trim();

    // A whitespace-only title counts as "no title" so callers get a clean null.
    return normalizedTitle.length > 0 ? normalizedTitle : null;
}
export default {
handleH1
handleH1,
extractHtmlTitle
};