Further improve lang detection perf

Lim Chee Aun 2024-05-29 15:26:58 +08:00
parent f9a73777e7
commit 7546b42c7c
3 changed files with 53 additions and 5 deletions


@@ -1866,7 +1866,16 @@ const Textarea = forwardRef((props, ref) => {
// Newline to prevent multiple line breaks at the end from being collapsed, no idea why
}, 500);
const debouncedAutoDetectLanguage = useDebouncedCallback((text) => {
const debouncedAutoDetectLanguage = useDebouncedCallback(() => {
// Make use of the highlightRef to get the DOM
// Clone the dom
const dom = composeHighlightRef.current?.cloneNode(true);
if (!dom) return;
// Remove mark
dom.querySelectorAll('mark').forEach((mark) => {
mark.remove();
});
const text = dom.innerText?.trim();
if (!text) return;
const langs = detectLangs(text);
if (langs?.length) {
@@ -1875,7 +1884,7 @@ const Textarea = forwardRef((props, ref) => {
languages: langs,
});
}
}, 1000);
}, 2000);
return (
<text-expander
@@ -1944,7 +1953,7 @@ const Textarea = forwardRef((props, ref) => {
autoResizeTextarea(target);
props.onInput?.(e);
throttleHighlightText(text);
debouncedAutoDetectLanguage(text);
debouncedAutoDetectLanguage();
}}
style={{
width: '100%',

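In the compose box, language detection no longer receives the raw textarea value; it now reads text from a clone of the highlight overlay with the <mark> nodes stripped, and the debounce is relaxed from 1000 ms to 2000 ms. A hypothetical standalone illustration of that stripping step (not from the commit; the markup and values here are made up):

const el = document.createElement('div');
el.innerHTML =
  'Hola <mark>@alice@example.social</mark> ¿cómo estás? <mark>#hola</mark>';
const clone = el.cloneNode(true);
// Drop the highlighted tokens (mentions, hashtags, etc.) so they don't skew detection
clone.querySelectorAll('mark').forEach((m) => m.remove());
console.log(clone.innerText.trim()); // only the surrounding prose remains
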

@@ -161,6 +161,8 @@ const SIZE_CLASS = {
};
const detectLang = mem((text) => {
text = text?.trim();
// Ref: https://github.com/komodojp/tinyld/blob/develop/docs/benchmark.md
// 500 should be enough for now, also the default max chars for Mastodon
if (text?.length > 500) {
@@ -284,7 +286,40 @@ function Status({
emojiReactions,
} = status;
let languageAutoDetected = content && detectLang(getHTMLText(content));
const [languageAutoDetected, setLanguageAutoDetected] = useState(null);
useEffect(() => {
if (!content) return;
if (_language) return;
let timer;
timer = setTimeout(() => {
let detected = detectLang(
getHTMLText(content, {
preProcess: (dom) => {
// Remove anything that can skew the language detection
// Remove .mention, .hashtag, pre, code, a:has(.invisible)
dom
.querySelectorAll(
'.mention, .hashtag, pre, code, a:has(.invisible)',
)
.forEach((a) => {
a.remove();
});
// Remove links whose text starts with http:// or https://
dom.querySelectorAll('a').forEach((a) => {
const text = a.innerText.trim();
if (text.startsWith('https://') || text.startsWith('http://')) {
a.remove();
}
});
},
}),
);
setLanguageAutoDetected(detected);
}, 1000);
return () => clearTimeout(timer);
}, [content, _language]);
const language = _language || languageAutoDetected;
// if (!mediaAttachments?.length) mediaFirst = false;

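The status rendering change above moves detection out of the render path: it only runs when the post has no declared language, and it is deferred by a 1-second timeout inside an effect, with cleanup when the content changes or the component unmounts. A minimal sketch of that pattern as a hypothetical hook — useAutoDetectedLanguage is not in the codebase, and detectLang/getHTMLText are assumed to be in scope as in the diff:

import { useEffect, useState } from 'react';

function useAutoDetectedLanguage(content, declaredLanguage) {
  const [detected, setDetected] = useState(null);
  useEffect(() => {
    // Skip detection entirely when there's nothing to analyze or a language is declared
    if (!content || declaredLanguage) return;
    const timer = setTimeout(() => {
      // detectLang is memoized, so repeat renders of the same content stay cheap
      setDetected(detectLang(getHTMLText(content)));
    }, 1000);
    return () => clearTimeout(timer);
  }, [content, declaredLanguage]);
  return declaredLanguage || detected;
}
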

@@ -1,8 +1,10 @@
import mem from './mem';
const div = document.createElement('div');
function getHTMLText(html) {
function getHTMLText(html, opts) {
if (!html) return '';
const { preProcess } = opts || {};
div.innerHTML = html
.replace(/<\/p>/g, '</p>\n\n')
.replace(/<\/li>/g, '</li>\n');
@@ -10,6 +12,8 @@ function getHTMLText(html) {
br.replaceWith('\n');
});
preProcess?.(div);
// MASTODON-SPECIFIC classes
// Remove .invisible
div.querySelectorAll('.invisible').forEach((el) => {
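
For reference, a minimal usage sketch of the new optional second argument (the status.content value is illustrative): preProcess receives the temporary working <div> after <br> elements are converted to newlines and before the Mastodon-specific .invisible cleanup, so callers can prune nodes before the text is read.

const plainText = getHTMLText(status.content, {
  preProcess: (dom) => {
    // Remove nodes that would skew downstream language detection
    dom
      .querySelectorAll('.mention, .hashtag, pre, code')
      .forEach((el) => el.remove());
  },
});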