Implement a link metadata fetching util function

2022-11-23 16:29:17 -06:00 · 2022-11-23 16:29:17 -06:00 · 89638dbd18
commit 89638dbd18
parent eb106a9758
4 changed files with 1488 additions and 17 deletions
--- a/tests/link-meta-utils.ts
+++ b/tests/link-meta-utils.ts
@ -0,0 +1,118 @@
+import {LikelyType, getLinkMeta} from '../src/lib/link-meta'
+
+// Static HTML fixture served as the body of every stubbed `fetch` response in
+// the `getLinkMeta` tests below. Its <title> ("Example Domain") and its
+// `description` meta tag ("An example website") are the values the HTML test
+// cases expect to see extracted.
+const exampleComHtml = `<!doctype html>
+<html>
+<head>
+    <title>Example Domain</title>
+    <meta name="description" content="An example website">
+
+    <meta charset="utf-8" />
+    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <style type="text/css">
+    body {
+        background-color: #f0f0f2;
+        margin: 0;
+        padding: 0;
+        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+
+    }
+    div {
+        width: 600px;
+        margin: 5em auto;
+        padding: 2em;
+        background-color: #fdfdff;
+        border-radius: 0.5em;
+        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
+    }
+    a:link, a:visited {
+        color: #38488f;
+        text-decoration: none;
+    }
+    @media (max-width: 700px) {
+        div {
+            margin: 0 auto;
+            width: auto;
+        }
+    }
+    </style>
+</head>
+
+<body>
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is for use in illustrative examples in documents. You may use this
+    domain in literature without prior coordination or asking for permission.</p>
+    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body>
+</html>`
+
+describe('getLinkMeta', () => {
+  // Each case keeps the input URL next to the metadata expected for it, so the
+  // two can never drift out of alignment the way parallel arrays can.
+  const cases = [
+    {
+      input: '',
+      expected: {
+        error: 'Invalid URL',
+        likelyType: LikelyType.Other,
+        url: '',
+      },
+    },
+    {
+      input: 'httpbadurl',
+      expected: {
+        error: 'Invalid URL',
+        likelyType: LikelyType.Other,
+        url: 'httpbadurl',
+      },
+    },
+    {
+      input: 'https://example.com',
+      expected: {
+        likelyType: LikelyType.HTML,
+        url: 'https://example.com',
+        title: 'Example Domain',
+        description: 'An example website',
+      },
+    },
+    {
+      input: 'https://example.com/index.html',
+      expected: {
+        likelyType: LikelyType.HTML,
+        url: 'https://example.com/index.html',
+        title: 'Example Domain',
+        description: 'An example website',
+      },
+    },
+    {
+      input: 'https://example.com/image.png',
+      expected: {
+        likelyType: LikelyType.Image,
+        url: 'https://example.com/image.png',
+      },
+    },
+    {
+      input: 'https://example.com/video.avi',
+      expected: {
+        likelyType: LikelyType.Video,
+        url: 'https://example.com/video.avi',
+      },
+    },
+    {
+      input: 'https://example.com/audio.ogg',
+      expected: {
+        likelyType: LikelyType.Audio,
+        url: 'https://example.com/audio.ogg',
+      },
+    },
+    {
+      input: 'https://example.com/javascript.js',
+      expected: {
+        likelyType: LikelyType.Other,
+        url: 'https://example.com/javascript.js',
+      },
+    },
+  ]
+
+  // The test replaces the global `fetch`; put the real one back afterwards so
+  // the stub cannot leak into other tests running in the same environment.
+  const originalFetch = global.fetch
+  afterEach(() => {
+    global.fetch = originalFetch
+  })
+
+  it('correctly handles a set of text inputs', async () => {
+    for (const {input, expected} of cases) {
+      // Fresh one-shot stub per case: the first `fetch` call resolves to a
+      // successful response whose body is the fixture page. `text` is async to
+      // mirror the real Response API, which returns a promise.
+      global.fetch = jest.fn().mockResolvedValueOnce({
+        ok: true,
+        status: 200,
+        text: async () => exampleComHtml,
+      })
+      const output = await getLinkMeta(input)
+      expect(output).toEqual(expected)
+    }
+  })
+})
--- a/tests/string-utils.ts
+++ b/tests/string-utils.ts
@ -1,17 +1,21 @@
-import {extractEntities, detectLinkables} from '../src/lib/strings'
+import {
+  extractEntities,
+  detectLinkables,
+  extractHtmlMeta,
+} from '../src/lib/strings'

 describe('extractEntities', () => {
-  const knownHandles = new Set(['handle', 'full123.test-of-chars'])
+  const knownHandles = new Set(['handle.com', 'full123.test-of-chars'])
  const inputs = [
    'no mention',
-    '@handle middle end',
-    'start @handle end',
-    'start middle @handle',
-    '@handle @handle @handle',
+    '@handle.com middle end',
+    'start @handle.com end',
+    'start middle @handle.com',
+    '@handle.com @handle.com @handle.com',
    '@full123.test-of-chars',
    'not@right',
-    '@handle!@#$chars',
-    '@handle\n@handle',
+    '@handle.com!@#$chars',
+    '@handle.com\n@handle.com',
    'start https://middle.com end',
    'start https://middle.com/foo/bar end',
    'start https://middle.com/foo/bar?baz=bux end',
@ -35,13 +39,13 @@ describe('extractEntities', () => {
  }
  const outputs: Output[][] = [
    [],
-    [{type: 'mention', value: 'handle'}],
-    [{type: 'mention', value: 'handle'}],
-    [{type: 'mention', value: 'handle'}],
+    [{type: 'mention', value: 'handle.com'}],
+    [{type: 'mention', value: 'handle.com'}],
+    [{type: 'mention', value: 'handle.com'}],
    [
-      {type: 'mention', value: 'handle'},
-      {type: 'mention', value: 'handle'},
-      {type: 'mention', value: 'handle'},
+      {type: 'mention', value: 'handle.com'},
+      {type: 'mention', value: 'handle.com'},
+      {type: 'mention', value: 'handle.com'},
    ],
    [
      {
@ -50,10 +54,10 @@ describe('extractEntities', () => {
      },
    ],
    [],
-    [{type: 'mention', value: 'handle'}],
+    [{type: 'mention', value: 'handle.com'}],
    [
-      {type: 'mention', value: 'handle'},
-      {type: 'mention', value: 'handle'},
+      {type: 'mention', value: 'handle.com'},
+      {type: 'mention', value: 'handle.com'},
    ],
    [{type: 'link', value: 'https://middle.com'}],
    [{type: 'link', value: 'https://middle.com/foo/bar'}],
@ -176,3 +180,45 @@ describe('detectLinkables', () => {
    }
  })
 })
+
+describe('extractHtmlMeta', () => {
+  // [html snippet, metadata expected to be extracted from it]
+  const cases: Array<[string, Record<string, string>]> = [
+    ['', {}],
+    ['nothing', {}],
+    ['<title>title</title>', {title: 'title'}],
+    ['<title> aSd!@#AC </title>', {title: 'aSd!@#AC'}],
+    ['<title>\n  title\n  </title>', {title: 'title'}],
+    ['<meta name="title" content="meta title">', {title: 'meta title'}],
+    [
+      '<meta name="description" content="meta description">',
+      {description: 'meta description'},
+    ],
+    ['<meta property="og:title" content="og title">', {title: 'og title'}],
+    [
+      '<meta property="og:description" content="og description">',
+      {description: 'og description'},
+    ],
+    [
+      '<meta property="og:image" content="https://ogimage.com/foo.png">',
+      {image: 'https://ogimage.com/foo.png'},
+    ],
+    [
+      '<meta property="twitter:title" content="twitter title">',
+      {title: 'twitter title'},
+    ],
+    [
+      '<meta property="twitter:description" content="twitter description">',
+      {description: 'twitter description'},
+    ],
+    [
+      '<meta property="twitter:image" content="https://twitterimage.com/foo.png">',
+      {image: 'https://twitterimage.com/foo.png'},
+    ],
+    ['<meta\n  name="title"\n  content="meta title"\n>', {title: 'meta title'}],
+  ]
+
+  it('correctly handles a set of text inputs', () => {
+    // Cases run in order inside a single test; the first mismatch fails it.
+    for (const [html, expected] of cases) {
+      expect(extractHtmlMeta(html)).toEqual(expected)
+    }
+  })
+})
--- a/src/lib/link-meta.ts
+++ b/src/lib/link-meta.ts
--- a/src/lib/strings.ts
+++ b/src/lib/strings.ts
@ -220,3 +220,54 @@ export function convertBskyAppUrlIfNeeded(url: string): string {
  }
  return url
 }
+
+/**
+ * Matches the first `<title>` element, with or without attributes, and
+ * captures its text content.
+ */
+const htmlTitleRegex = /<title(?:\s[^>]*)?>([^<]+)<\/title>/i
+
+/**
+ * Extracts basic page metadata from raw HTML using regular expressions.
+ *
+ * The result may contain `title`, `description` and `image`, taken from the
+ * `<title>` element and from `<meta>` tags keyed (via `name` or `property`)
+ * as `title`, `description`, `og:*` or `twitter:*`. Tags are processed in
+ * document order and a later tag overwrites an earlier value for the same
+ * field, including the value taken from `<title>`.
+ *
+ * Tag names, attribute names and metadata keys are matched
+ * case-insensitively. Attribute values may use double or single quotes;
+ * unquoted values are ignored. This is a lightweight scan, not an HTML
+ * parser: entities are not decoded and a `>` inside an attribute value ends
+ * the tag early.
+ *
+ * @param html - HTML source text to scan.
+ * @returns An object holding whichever of `title`, `description` and `image`
+ *   were found, with surrounding whitespace trimmed; empty if none were.
+ */
+export function extractHtmlMeta(html: string): Record<string, string> {
+  const res: Record<string, string> = {}
+
+  const titleMatch = htmlTitleRegex.exec(html)
+  if (titleMatch) {
+    res.title = (titleMatch[1] ?? '').trim()
+  }
+
+  // Regexes with the `g` flag carry `lastIndex` state, so they are created
+  // per call (and per tag) rather than shared at module level.
+  const metaRe = /<meta\s([^>]+)>/gi
+  let metaMatch: RegExpExecArray | null
+  while ((metaMatch = metaRe.exec(html)) !== null) {
+    let propName: string | undefined
+    let propValue: string | undefined
+
+    // Scan every quoted attribute with its full name so that attributes
+    // merely ending in a wanted name (e.g. `data-name`) are not mistaken for
+    // `name`/`property`/`content`.
+    const attrRe = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g
+    let attrMatch: RegExpExecArray | null
+    while ((attrMatch = attrRe.exec(metaMatch[1] ?? '')) !== null) {
+      const attrName = (attrMatch[1] ?? '').toLowerCase()
+      // Exactly one of the two value groups is set, depending on quote style.
+      const attrValue = attrMatch[2] ?? attrMatch[3]
+      if (attrName === 'content') {
+        propValue = attrValue
+      } else if (attrName === 'name' || attrName === 'property') {
+        propName = attrValue
+      }
+    }
+
+    // A usable tag needs both a key and a non-empty value.
+    if (!propName || !propValue) {
+      continue
+    }
+
+    const value = propValue.trim()
+    switch (propName.trim().toLowerCase()) {
+      case 'title':
+      case 'og:title':
+      case 'twitter:title':
+        res.title = value
+        break
+      case 'description':
+      case 'og:description':
+      case 'twitter:description':
+        res.description = value
+        break
+      case 'og:image':
+      case 'twitter:image':
+        res.image = value
+        break
+    }
+  }
+
+  return res
+}