fix: add pre-processing step to title generation logic so sbd doesn't fall over so badly

This commit is contained in:
Julian Lam
2025-09-17 10:44:51 -04:00
parent f7bbec7ccf
commit f7c4742987
2 changed files with 5 additions and 47 deletions

View File

@@ -339,52 +339,6 @@ Helpers.resolveObjects = async (ids) => {
return objects.length === 1 ? objects[0] : objects;
};
const titleishTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title', 'p', 'span'];
// Anchored with `^`: only an attribute-less title-like element that opens the
// document qualifies, so there is no need to scan the rest of the string.
const titleRegex = new RegExp(`^<(${titleishTags.join('|')})>(.+?)</\\1>`);

/**
 * Given an html string, generates a more appropriate title if possible.
 *
 * Steps, in order:
 *   1. take the contents of a leading title-like element (see `titleishTags`),
 *      otherwise the first non-empty line of the input;
 *   2. discard everything from the first line break element onwards;
 *   3. strip remaining markup via `utils.stripHTMLTags`;
 *   4. keep only the first sentence (text up to the first `.`, `?` or `!`
 *      that is followed by whitespace);
 *   5. truncate to `meta.config.maximumTitleLength`, ending in `...`.
 *
 * @param {string} html - markup to derive a title from
 * @returns {string} the derived title; an empty string when `html` is not a
 *   string or contains no non-empty line
 */
Helpers.generateTitle = (html) => {
	if (typeof html !== 'string') {
		return '';
	}

	// Try the first paragraph-like element, then fall back to newline splitting.
	// The fallback may find nothing (empty or blank input), hence the `|| ''`.
	const match = html.match(titleRegex);
	let title = match ? match[2] : (html.split('\n').find(Boolean) || '');

	// Discard everything after a line break element. Accepts <br>, <br/> and
	// <br /> in any letter case; [\s\S] also swallows stray carriage returns.
	title = title.replace(/<br\s*\/?>[\s\S]*/i, '');

	// Strip html
	title = utils.stripHTMLTags(title);

	// Use only the first sentence, keeping its terminating punctuation
	const firstSentence = title.match(/^([\s\S]*?[.?!])\s/);
	if (firstSentence) {
		title = firstSentence[1];
	}

	// Truncate down if too long, reserving three characters for the ellipsis
	const { maximumTitleLength } = meta.config;
	if (title.length > maximumTitleLength) {
		title = `${title.slice(0, maximumTitleLength - 3)}...`;
	}

	return title;
};
Helpers.remoteAnchorToLocalProfile = async (content, isMarkdown = false) => {
let anchorRegex;
if (isMarkdown) {

View File

@@ -165,7 +165,11 @@ Notes.assert = async (uid, input, options = { skipChecks: false }) => {
// mainPid ok to leave as-is
if (!title) {
const sentences = tokenizer.sentences(content || sourceContent, { sanitize: true });
// Naive pre-processing prior to sbd tokenization: put adjacent paragraphs on
// separate lines so the `newline_boundaries` option can treat each paragraph
// break as a sentence boundary.
// Falls back to '' so that .replace() cannot throw when both values are unset.
let sbdInput = content || sourceContent || '';
// A regex with the `g` flag is required here; a string pattern would only
// replace the first occurrence and leave later paragraphs glued together.
sbdInput = sbdInput.replace(/<\/p><p>/g, '</p>\n<p>');
const sentences = tokenizer.sentences(sbdInput, { sanitize: true, newline_boundaries: true });
title = sentences.shift();
}