Implement a link metadata fetching util function

zio/stable
Paul Frazee 2022-11-23 16:29:17 -06:00
parent eb106a9758
commit 89638dbd18
4 changed files with 1488 additions and 17 deletions

View File

@ -0,0 +1,118 @@
import {LikelyType, getLinkMeta} from '../src/lib/link-meta'
// HTML fixture returned by the mocked fetch() response's text() in the
// getLinkMeta tests below. Its <title> ("Example Domain") and
// <meta name="description"> ("An example website") are the values the HTML
// test cases expect to see extracted.
const exampleComHtml = `<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta name="description" content="An example website">
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>`
describe('getLinkMeta', () => {
  // Each case pairs an input string with the exact object getLinkMeta is
  // expected to resolve to. Keeping input and expectation side by side means
  // the two can never drift out of sync the way parallel arrays can.
  const cases = [
    {
      input: '',
      expected: {
        error: 'Invalid URL',
        likelyType: LikelyType.Other,
        url: '',
      },
    },
    {
      input: 'httpbadurl',
      expected: {
        error: 'Invalid URL',
        likelyType: LikelyType.Other,
        url: 'httpbadurl',
      },
    },
    {
      input: 'https://example.com',
      expected: {
        likelyType: LikelyType.HTML,
        url: 'https://example.com',
        title: 'Example Domain',
        description: 'An example website',
      },
    },
    {
      input: 'https://example.com/index.html',
      expected: {
        likelyType: LikelyType.HTML,
        url: 'https://example.com/index.html',
        title: 'Example Domain',
        description: 'An example website',
      },
    },
    {
      input: 'https://example.com/image.png',
      expected: {
        likelyType: LikelyType.Image,
        url: 'https://example.com/image.png',
      },
    },
    {
      input: 'https://example.com/video.avi',
      expected: {
        likelyType: LikelyType.Video,
        url: 'https://example.com/video.avi',
      },
    },
    {
      input: 'https://example.com/audio.ogg',
      expected: {
        likelyType: LikelyType.Audio,
        url: 'https://example.com/audio.ogg',
      },
    },
    {
      input: 'https://example.com/javascript.js',
      expected: {
        likelyType: LikelyType.Other,
        url: 'https://example.com/javascript.js',
      },
    },
  ]

  // The test replaces global.fetch with a mock; put the real one back after
  // each test so the mock cannot leak into other tests.
  const originalFetch = global.fetch
  afterEach(() => {
    global.fetch = originalFetch
  })

  it('correctly handles a set of text inputs', async () => {
    for (const {input, expected} of cases) {
      // A fresh one-shot mock per case: the first fetch() call resolves to a
      // successful response whose text() yields the example.com fixture.
      global.fetch = jest.fn().mockResolvedValueOnce({
        ok: true,
        status: 200,
        // Return a Promise, as the real Response.text() does.
        text: () => Promise.resolve(exampleComHtml),
      })
      const output = await getLinkMeta(input)
      expect(output).toEqual(expected)
    }
  })
})

View File

@ -1,17 +1,21 @@
import {extractEntities, detectLinkables} from '../src/lib/strings'
import {
extractEntities,
detectLinkables,
extractHtmlMeta,
} from '../src/lib/strings'
describe('extractEntities', () => {
const knownHandles = new Set(['handle', 'full123.test-of-chars'])
const knownHandles = new Set(['handle.com', 'full123.test-of-chars'])
const inputs = [
'no mention',
'@handle middle end',
'start @handle end',
'start middle @handle',
'@handle @handle @handle',
'@handle.com middle end',
'start @handle.com end',
'start middle @handle.com',
'@handle.com @handle.com @handle.com',
'@full123.test-of-chars',
'not@right',
'@handle!@#$chars',
'@handle\n@handle',
'@handle.com!@#$chars',
'@handle.com\n@handle.com',
'start https://middle.com end',
'start https://middle.com/foo/bar end',
'start https://middle.com/foo/bar?baz=bux end',
@ -35,13 +39,13 @@ describe('extractEntities', () => {
}
const outputs: Output[][] = [
[],
[{type: 'mention', value: 'handle'}],
[{type: 'mention', value: 'handle'}],
[{type: 'mention', value: 'handle'}],
[{type: 'mention', value: 'handle.com'}],
[{type: 'mention', value: 'handle.com'}],
[{type: 'mention', value: 'handle.com'}],
[
{type: 'mention', value: 'handle'},
{type: 'mention', value: 'handle'},
{type: 'mention', value: 'handle'},
{type: 'mention', value: 'handle.com'},
{type: 'mention', value: 'handle.com'},
{type: 'mention', value: 'handle.com'},
],
[
{
@ -50,10 +54,10 @@ describe('extractEntities', () => {
},
],
[],
[{type: 'mention', value: 'handle'}],
[{type: 'mention', value: 'handle.com'}],
[
{type: 'mention', value: 'handle'},
{type: 'mention', value: 'handle'},
{type: 'mention', value: 'handle.com'},
{type: 'mention', value: 'handle.com'},
],
[{type: 'link', value: 'https://middle.com'}],
[{type: 'link', value: 'https://middle.com/foo/bar'}],
@ -176,3 +180,45 @@ describe('detectLinkables', () => {
}
})
})
describe('extractHtmlMeta', () => {
  // [html input, expected extracted metadata] pairs, checked in order.
  const cases: Array<[string, Record<string, string>]> = [
    ['', {}],
    ['nothing', {}],
    ['<title>title</title>', {title: 'title'}],
    ['<title> aSd!@#AC </title>', {title: 'aSd!@#AC'}],
    ['<title>\n title\n </title>', {title: 'title'}],
    ['<meta name="title" content="meta title">', {title: 'meta title'}],
    [
      '<meta name="description" content="meta description">',
      {description: 'meta description'},
    ],
    ['<meta property="og:title" content="og title">', {title: 'og title'}],
    [
      '<meta property="og:description" content="og description">',
      {description: 'og description'},
    ],
    [
      '<meta property="og:image" content="https://ogimage.com/foo.png">',
      {image: 'https://ogimage.com/foo.png'},
    ],
    [
      '<meta property="twitter:title" content="twitter title">',
      {title: 'twitter title'},
    ],
    [
      '<meta property="twitter:description" content="twitter description">',
      {description: 'twitter description'},
    ],
    [
      '<meta property="twitter:image" content="https://twitterimage.com/foo.png">',
      {image: 'https://twitterimage.com/foo.png'},
    ],
    [
      '<meta\n name="title"\n content="meta title"\n>',
      {title: 'meta title'},
    ],
  ]

  it('correctly handles a set of text inputs', () => {
    cases.forEach(([html, expected]) => {
      expect(extractHtmlMeta(html)).toEqual(expected)
    })
  })
})

1256
src/lib/link-meta.ts 100644

File diff suppressed because it is too large Load Diff

View File

@ -220,3 +220,54 @@ export function convertBskyAppUrlIfNeeded(url: string): string {
}
return url
}
// First <title> element (attributes such as data-rh="true" are allowed);
// group 1 is its tag-free text content.
const htmlTitleRegex = /<title(?:\s[^>]*)?>([^<]+)<\/title>/i
// Every <meta ...> tag; group 1 is the raw attribute text up to the next '>'.
const htmlMetaTagRegex = /<meta\s([^>]+)>/gi
// One attribute at a time: group 1 is the full attribute name, and exactly one
// of groups 2/3/4 holds the value (double-quoted, single-quoted, unquoted).
// Capturing the whole name (rather than searching for "content=" anywhere)
// keeps attributes like data-content from being mistaken for content.
const htmlAttrRegex = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"']+))/g

/**
 * Extracts basic page metadata from an HTML string using regular expressions
 * (no DOM parsing).
 *
 * Recognized sources:
 * - `title`: `<title>` text, and `title` / `og:title` / `twitter:title` meta tags
 * - `description`: `description` / `og:description` / `twitter:description`
 * - `image`: `og:image` / `twitter:image`
 *
 * Meta tags are read in document order and a later tag overwrites an earlier
 * value for the same key; a meta title also overwrites the `<title>` text.
 * Attribute names and meta names are matched case-insensitively, and tags
 * whose name or content is empty after trimming are ignored.
 *
 * @param html - Raw HTML source to scan.
 * @returns An object holding whichever of `title`, `description` and `image`
 *   were found; empty when nothing matched.
 */
export function extractHtmlMeta(html: string): Record<string, string> {
  const res: Record<string, string> = {}

  const title = htmlTitleRegex.exec(html)?.[1]?.trim()
  if (title) {
    res.title = title
  }

  // matchAll works on a private copy of each global regex, so the shared
  // module-level patterns carry no lastIndex state between calls.
  for (const metaMatch of html.matchAll(htmlMetaTagRegex)) {
    let propName: string | undefined
    let propValue: string | undefined
    for (const attr of (metaMatch[1] ?? '').matchAll(htmlAttrRegex)) {
      const attrName = (attr[1] ?? '').toLowerCase()
      const attrValue = attr[2] ?? attr[3] ?? attr[4] ?? ''
      if (attrName === 'content') {
        propValue = attrValue
      } else if (attrName === 'name' || attrName === 'property') {
        propName = attrValue
      }
    }

    const key = propName?.trim().toLowerCase()
    const value = propValue?.trim()
    if (!key || !value) {
      // Not a name/content pair, or nothing useful in it: leave any
      // previously extracted value untouched.
      continue
    }

    switch (key) {
      case 'title':
      case 'og:title':
      case 'twitter:title':
        res.title = value
        break
      case 'description':
      case 'og:description':
      case 'twitter:description':
        res.description = value
        break
      case 'og:image':
      case 'twitter:image':
        res.image = value
        break
    }
  }

  return res
}