wordCount Filter for Grav (#3957)

2025-10-26 07:56:07 +01:00 · 2025-10-13 21:35:33 +02:00
parent 03849923d4
commit c82645a42a
1 changed files with 57 additions and 0 deletions
--- a/system/src/Grav/Common/Twig/Extension/GravExtension.php
+++ b/system/src/Grav/Common/Twig/Extension/GravExtension.php
@@ -140,6 +140,7 @@ class GravExtension extends AbstractExtension implements GlobalsInterface
            new TwigFilter('starts_with', [$this, 'startsWithFilter']),
            new TwigFilter('truncate', [Utils::class, 'truncate']),
            new TwigFilter('truncate_html', [Utils::class, 'truncateHTML']),
+            new TwigFilter('wordcount', [$this, 'wordCountFilter']),
            new TwigFilter('json_decode', [$this, 'jsonDecodeFilter']),
            new TwigFilter('array_unique', 'array_unique'),
            new TwigFilter('basename', 'basename'),
@@ -578,6 +579,62 @@ class GravExtension extends AbstractExtension implements GlobalsInterface
        return $str;
    }

+    /**
+     * Count the words in a plain-text or HTML string.
+     *
+     * Tags are stripped and entities decoded before counting. Chinese, Japanese and
+     * Korean locales count characters rather than whitespace-separated words.
+     *
+     * @param string|null $text The text to count words from
+     * @param string $locale Language code or name; any region/script suffix is ignored,
+     *                       so 'zh-Hant' and 'ja_JP' work as well (default: 'en')
+     * @return int Number of words (characters for Chinese, Japanese and Korean)
+     */
+    public function wordCountFilter($text, string $locale = 'en'): int
+    {
+        // Not empty(): that would also reject the one-word string '0'.
+        $text = is_array($text) ? '' : (string)$text;
+        if ($text === '') {
+            return 0;
+        }
+
+        // Pad tags with a space so '<p>a</p><p>b</p>' does not collapse into 'ab', then
+        // strip them and decode entities. mb_scrub() replaces malformed UTF-8 bytes, which
+        // would otherwise make every /u pattern below fail and return null.
+        $cleanText = html_entity_decode(strip_tags(str_replace('<', ' <', $text)), ENT_QUOTES, 'UTF-8');
+        $cleanText = mb_scrub($cleanText, 'UTF-8');
+
+        // Collapse whitespace runs; \p{Z} also covers U+00A0 (from '&nbsp;') and U+3000.
+        $cleanText = trim((string)preg_replace('/[\s\p{Z}]+/u', ' ', $cleanText));
+        if ($cleanText === '') {
+            return 0;
+        }
+
+        // Match on the primary language subtag only ('zh-Hant-TW' => 'zh', 'ja_JP' => 'ja').
+        $language = explode('-', str_replace('_', '-', strtolower($locale)))[0];
+
+        switch ($language) {
+            case 'zh':
+            case 'chinese':
+                // Chinese: count characters (excluding spaces and punctuation)
+                return mb_strlen((string)preg_replace('/[\s\p{P}]/u', '', $cleanText), 'UTF-8');
+
+            case 'ja':
+            case 'japanese':
+            case 'ko':
+            case 'korean':
+                // Japanese and Korean: count characters (excluding spaces)
+                return mb_strlen(str_replace(' ', '', $cleanText), 'UTF-8');
+
+            default:
+                // Western languages: split on spaces, then drop tokens that are pure
+                // punctuation. \p{L}/\p{N} keep non-ASCII words such as Cyrillic or Greek.
+                $words = explode(' ', $cleanText);
+
+                return count(preg_grep('/[\p{L}\p{N}_]/u', $words));
+        }
+    }
+
    /**
     * Get Cron object for a crontab 'at' format
     *