fix: add pre-processing step to title generation logic so sbd doesn't fall over so badly

2025-12-21 16:00:26 +01:00 · 2025-09-17 10:44:51 -04:00
parent f7bbec7ccf
commit f7c4742987
2 changed files with 5 additions and 47 deletions
--- a/src/activitypub/helpers.js
+++ b/src/activitypub/helpers.js
@@ -339,52 +339,6 @@ Helpers.resolveObjects = async (ids) => {
 	return objects.length === 1 ? objects[0] : objects;
 };

-const titleishTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title', 'p', 'span'];
-const titleRegex = new RegExp(`<(${titleishTags.join('|')})>(.+?)</\\1>`, 'm');
-Helpers.generateTitle = (html) => {
-	// Given an html string, generates a more appropriate title if possible
-	let title;
-
-	// Try the first paragraph-like element
-	const match = html.match(titleRegex);
-	if (match && match.index === 0) {
-		title = match[2];
-	}
-
-	// Fall back to newline splitting (i.e. if no paragraph elements)
-	title = title || html.split('\n').filter(Boolean).shift();
-
-	// Discard everything after a line break element
-	title = title.replace(/<br(\s\/)?>.*/g, '');
-
-	// Strip html
-	title = utils.stripHTMLTags(title);
-
-	// Split sentences and use only first one
-	const sentences = title
-		.split(/(\.|\?|!)\s/)
-		.reduce((memo, cur, idx, sentences) => {
-			if (idx % 2) {
-				memo.push(`${sentences[idx - 1]}${cur}`);
-			} else if (idx === sentences.length - 1) {
-				memo.push(cur);
-			}
-
-			return memo;
-		}, []);
-
-	if (sentences.length > 1) {
-		title = sentences.shift();
-	}
-
-	// Truncate down if too long
-	if (title.length > meta.config.maximumTitleLength) {
-		title = `${title.slice(0, meta.config.maximumTitleLength - 3)}...`;
-	}
-
-	return title;
-};
-
 Helpers.remoteAnchorToLocalProfile = async (content, isMarkdown = false) => {
 	let anchorRegex;
 	if (isMarkdown) {
--- a/src/activitypub/notes.js
+++ b/src/activitypub/notes.js
@@ -165,7 +165,11 @@ Notes.assert = async (uid, input, options = { skipChecks: false }) => {

 			// mainPid ok to leave as-is
 			if (!title) {
-				const sentences = tokenizer.sentences(content || sourceContent, { sanitize: true });
+				// Pre-process for sbd: put a newline between every pair of adjacent paragraphs so newline_boundaries can split on them
+				let sbdInput = content || sourceContent || ''; // '' fallback keeps .replace() from throwing when neither value is set
+				// Regex with the g flag is required: a string pattern would only rewrite the first '</p><p>' occurrence
+				sbdInput = sbdInput.replace(/<\/p><p>/g, '</p>\n<p>');
+				const sentences = tokenizer.sentences(sbdInput, { sanitize: true, newline_boundaries: true });
 				title = sentences.shift();
 			}