Improve muted word matching (#3018)

* Use named params * Add language exception matching
2024-02-28 10:38:31 -06:00 · 2024-02-28 10:38:31 -06:00 · 0c3d55db6f
parent 5cb45f9c16
commit 0c3d55db6f
2 changed files with 313 additions and 248 deletions
--- a/src/lib/tests/moderatePost_wrapped.test.ts
+++ b/src/lib/tests/moderatePost_wrapped.test.ts
@ -11,12 +11,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'outlineTag', targets: ['tag']}],
-        rt.text,
-        rt.facets,
-        ['outlineTag'],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'outlineTag', targets: ['tag']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: ['outlineTag'],
+      })

      expect(match).toBe(true)
    })
@ -27,12 +27,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'inlineTag', targets: ['tag']}],
-        rt.text,
-        rt.facets,
-        ['outlineTag'],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'inlineTag', targets: ['tag']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: ['outlineTag'],
+      })

      expect(match).toBe(true)
    })
@ -43,12 +43,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'inlineTag', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        ['outlineTag'],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'inlineTag', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: ['outlineTag'],
+      })

      expect(match).toBe(true)
    })
@ -59,12 +59,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'inlineTag', targets: ['tag']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'inlineTag', targets: ['tag']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(false)
    })
@ -80,12 +80,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: '希', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: '希', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(true)
    })
@ -96,12 +96,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'politics', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'politics', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(false)
    })
@ -112,12 +112,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'javascript', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'javascript', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(true)
    })
@ -130,12 +130,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'javascript', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'javascript', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(true)
    })
@ -146,12 +146,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'ai', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'ai', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(false)
    })
@ -162,12 +162,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: 'brain', targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: 'brain', targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(true)
    })
@ -178,12 +178,12 @@ describe(`hasMutedWord`, () => {
      })
      rt.detectFacetsWithoutResolution()

-      const match = hasMutedWord(
-        [{value: `:)`, targets: ['content']}],
-        rt.text,
-        rt.facets,
-        [],
-      )
+      const match = hasMutedWord({
+        mutedWords: [{value: `:)`, targets: ['content']}],
+        text: rt.text,
+        facets: rt.facets,
+        outlineTags: [],
+      })

      expect(match).toBe(true)
    })
@ -197,23 +197,23 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: yay!`, () => {
-        const match = hasMutedWord(
-          [{value: 'yay!', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'yay!', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: yay`, () => {
-        const match = hasMutedWord(
-          [{value: 'yay', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'yay', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -226,24 +226,24 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: y!ppee`, () => {
-        const match = hasMutedWord(
-          [{value: 'y!ppee', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'y!ppee', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      // single exclamation point, source has double
      it(`no match: y!ppee!`, () => {
-        const match = hasMutedWord(
-          [{value: 'y!ppee!', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'y!ppee!', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -256,23 +256,23 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: S@assy`, () => {
-        const match = hasMutedWord(
-          [{value: 'S@assy', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'S@assy', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: s@assy`, () => {
-        const match = hasMutedWord(
-          [{value: 's@assy', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 's@assy', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -286,12 +286,12 @@ describe(`hasMutedWord`, () => {

      // case insensitive
      it(`match: new york times`, () => {
-        const match = hasMutedWord(
-          [{value: 'new york times', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'new york times', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -304,23 +304,23 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: !command`, () => {
-        const match = hasMutedWord(
-          [{value: `!command`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `!command`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: command`, () => {
-        const match = hasMutedWord(
-          [{value: `command`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `command`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -331,12 +331,12 @@ describe(`hasMutedWord`, () => {
        })
        rt.detectFacetsWithoutResolution()

-        const match = hasMutedWord(
-          [{value: `!command`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `!command`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(false)
      })
@ -349,23 +349,23 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: e/acc`, () => {
-        const match = hasMutedWord(
-          [{value: `e/acc`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `e/acc`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: acc`, () => {
-        const match = hasMutedWord(
-          [{value: `acc`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `acc`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -378,45 +378,45 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: super-bad`, () => {
-        const match = hasMutedWord(
-          [{value: `super-bad`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `super-bad`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: super`, () => {
-        const match = hasMutedWord(
-          [{value: `super`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `super`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: super bad`, () => {
-        const match = hasMutedWord(
-          [{value: `super bad`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `super bad`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: superbad`, () => {
-        const match = hasMutedWord(
-          [{value: `superbad`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `superbad`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(false)
      })
@ -429,47 +429,49 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: idk what this would be`, () => {
-        const match = hasMutedWord(
-          [{value: `idk what this would be`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `idk what this would be`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`no match: idk what this would be for`, () => {
        // extra word
-        const match = hasMutedWord(
-          [{value: `idk what this would be for`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [
+            {value: `idk what this would be for`, targets: ['content']},
+          ],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(false)
      })

      it(`match: idk`, () => {
        // extra word
-        const match = hasMutedWord(
-          [{value: `idk`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `idk`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: idkwhatthiswouldbe`, () => {
-        const match = hasMutedWord(
-          [{value: `idkwhatthiswouldbe`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `idkwhatthiswouldbe`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(false)
      })
@ -482,45 +484,45 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: context(iykyk)`, () => {
-        const match = hasMutedWord(
-          [{value: `context(iykyk)`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `context(iykyk)`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: context`, () => {
-        const match = hasMutedWord(
-          [{value: `context`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `context`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: iykyk`, () => {
-        const match = hasMutedWord(
-          [{value: `iykyk`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `iykyk`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: (iykyk)`, () => {
-        const match = hasMutedWord(
-          [{value: `(iykyk)`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `(iykyk)`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -533,12 +535,12 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: 🦋`, () => {
-        const match = hasMutedWord(
-          [{value: `🦋`, targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: `🦋`, targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })
@ -553,23 +555,46 @@ describe(`hasMutedWord`, () => {
      rt.detectFacetsWithoutResolution()

      it(`match: stop worrying`, () => {
-        const match = hasMutedWord(
-          [{value: 'stop worrying', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'stop worrying', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })

        expect(match).toBe(true)
      })

      it(`match: turtles, or how`, () => {
-        const match = hasMutedWord(
-          [{value: 'turtles, or how', targets: ['content']}],
-          rt.text,
-          rt.facets,
-          [],
-        )
+        const match = hasMutedWord({
+          mutedWords: [{value: 'turtles, or how', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+        })
+
+        expect(match).toBe(true)
+      })
+    })
+  })
+
+  describe(`languages without spaces`, () => {
+    // I love turtles, or how I learned to stop worrying and love the internet
+    describe(`私はカメが好きです、またはどのようにして心配するのをやめてインターネットを愛するようになったのか`, () => {
+      const rt = new RichText({
+        text: `私はカメが好きです、またはどのようにして心配するのをやめてインターネットを愛するようになったのか`,
+      })
+      rt.detectFacetsWithoutResolution()
+
+      // internet
+      it(`match: インターネット`, () => {
+        const match = hasMutedWord({
+          mutedWords: [{value: 'インターネット', targets: ['content']}],
+          text: rt.text,
+          facets: rt.facets,
+          outlineTags: [],
+          languages: ['ja'],
+        })

        expect(match).toBe(true)
      })
--- a/src/lib/moderatePost_wrapped.ts
+++ b/src/lib/moderatePost_wrapped.ts
@ -21,12 +21,34 @@ const REGEX = {
  WORD_BOUNDARY: /[\s\n\t\r\f\v]+?/g,
 }

-export function hasMutedWord(
-  mutedWords: AppBskyActorDefs.MutedWord[],
-  text: string,
-  facets?: AppBskyRichtextFacet.Main[],
-  outlineTags?: string[],
-) {
+/**
+ * List of 2-letter lang codes for languages that either don't use spaces, or
+ * don't use spaces in a way conducive to word-based filtering.
+ *
+ * For these, we use a simple `String.includes` to check for a match.
+ */
+const LANGUAGE_EXCEPTIONS = [
+  'ja', // Japanese
+  'zh', // Chinese
+  'ko', // Korean
+  'th', // Thai
+  'vi', // Vietnamese
+]
+
+export function hasMutedWord({
+  mutedWords,
+  text,
+  facets,
+  outlineTags,
+  languages,
+}: {
+  mutedWords: AppBskyActorDefs.MutedWord[]
+  text: string
+  facets?: AppBskyRichtextFacet.Main[]
+  outlineTags?: string[]
+  languages?: string[]
+}) {
+  const exception = LANGUAGE_EXCEPTIONS.includes(languages?.[0] || '')
  const tags = ([] as string[])
    .concat(outlineTags || [])
    .concat(
@ -48,8 +70,9 @@ export function hasMutedWord(
    if (tags.includes(mutedWord)) return true
    // rest of the checks are for `content` only
    if (!mute.targets.includes('content')) continue
-    // single character, has to use includes
-    if (mutedWord.length === 1 && postText.includes(mutedWord)) return true
+    // single character or other exception, has to use includes
+    if ((mutedWord.length === 1 || exception) && postText.includes(mutedWord))
+      return true
    // too long
    if (mutedWord.length > postText.length) continue
    // exact match
@ -134,19 +157,28 @@ export function moderatePost_wrapped(
  }

  if (AppBskyFeedPost.isRecord(subject.record)) {
-    let muted = hasMutedWord(
+    let muted = hasMutedWord({
      mutedWords,
-      subject.record.text,
-      subject.record.facets || [],
-      subject.record.tags || [],
-    )
+      text: subject.record.text,
+      facets: subject.record.facets || [],
+      outlineTags: subject.record.tags || [],
+      languages: subject.record.langs,
+    })

    if (
      subject.record.embed &&
      AppBskyEmbedImages.isMain(subject.record.embed)
    ) {
      for (const image of subject.record.embed.images) {
-        muted = muted || hasMutedWord(mutedWords, image.alt, [], [])
+        muted =
+          muted ||
+          hasMutedWord({
+            mutedWords,
+            text: image.alt,
+            facets: [],
+            outlineTags: [],
+            languages: subject.record.langs,
+          })
      }
    }

@ -172,17 +204,25 @@ export function moderatePost_wrapped(
      if (AppBskyFeedPost.isRecord(subject.embed.record.value)) {
        embedHidden =
          embedHidden ||
-          hasMutedWord(
+          hasMutedWord({
            mutedWords,
-            subject.embed.record.value.text,
-            subject.embed.record.value.facets,
-            subject.embed.record.value.tags,
-          )
+            text: subject.embed.record.value.text,
+            facets: subject.embed.record.value.facets,
+            outlineTags: subject.embed.record.value.tags,
+            languages: subject.embed.record.value.langs,
+          })

        if (AppBskyEmbedImages.isMain(subject.embed.record.value.embed)) {
          for (const image of subject.embed.record.value.embed.images) {
            embedHidden =
-              embedHidden || hasMutedWord(mutedWords, image.alt, [], [])
+              embedHidden ||
+              hasMutedWord({
+                mutedWords,
+                text: image.alt,
+                facets: [],
+                outlineTags: [],
+                languages: subject.embed.record.value.langs,
+              })
          }
        }
      }