Improvements to link and mention detection regarding surrounding punctuation

zio/stable
Paul Frazee 2022-11-29 10:01:57 -06:00
parent 2ccbe4f702
commit 7ae1bac620
2 changed files with 103 additions and 8 deletions

View File

@ -16,6 +16,7 @@ describe('extractEntities', () => {
'not@right',
'@handle.com!@#$chars',
'@handle.com\n@handle.com',
'parenthetical (@handle.com)',
'start https://middle.com end',
'start https://middle.com/foo/bar end',
'start https://middle.com/foo/bar?baz=bux end',
@ -36,6 +37,12 @@ describe('extractEntities', () => {
'website.com.jpg',
'e.g./foo',
'website.com.jpg/foo',
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
'https://foo.com https://bar.com/whatever https://baz.com',
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
'parenthentical (https://foo.com)',
'except for https://foo.com/thing_(cool)',
]
interface Output {
type: string
@ -64,6 +71,7 @@ describe('extractEntities', () => {
{type: 'mention', value: 'handle.com'},
{type: 'mention', value: 'handle.com'},
],
[{type: 'mention', value: 'handle.com'}],
[{type: 'link', value: 'https://middle.com'}],
[{type: 'link', value: 'https://middle.com/foo/bar'}],
[{type: 'link', value: 'https://middle.com/foo/bar?baz=bux'}],
@ -90,6 +98,32 @@ describe('extractEntities', () => {
[],
[],
[],
[
{
type: 'link',
value:
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
},
],
[
{
type: 'link',
value:
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
},
],
[
{type: 'link', value: 'https://foo.com'},
{type: 'link', value: 'https://bar.com/whatever'},
{type: 'link', value: 'https://baz.com'},
],
[
{type: 'link', value: 'https://foo.com'},
{type: 'link', value: 'https://bar.com/whatever'},
{type: 'link', value: 'https://baz.com'},
],
[{type: 'link', value: 'https://foo.com'}],
[{type: 'link', value: 'https://foo.com/thing_(cool)'}],
]
it('correctly handles a set of text inputs', () => {
for (let i = 0; i < inputs.length; i++) {
@ -140,6 +174,7 @@ describe('detectLinkables', () => {
'not@right',
'@bad!@#$chars',
'@newline1\n@newline2',
'parenthetical (@handle)',
'start https://middle.com end',
'start https://middle.com/foo/bar end',
'start https://middle.com/foo/bar?baz=bux end',
@ -161,6 +196,12 @@ describe('detectLinkables', () => {
'website.com.jpg',
'e.g./foo',
'website.com.jpg/foo',
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
'https://foo.com https://bar.com/whatever https://baz.com',
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
'parenthentical (https://foo.com)',
'except for https://foo.com/thing_(cool)',
]
const outputs = [
['no linkable'],
@ -172,6 +213,7 @@ describe('detectLinkables', () => {
['not@right'],
[{link: '@bad'}, '!@#$chars'],
[{link: '@newline1'}, '\n', {link: '@newline2'}],
['parenthetical (', {link: '@handle'}, ')'],
['start ', {link: 'https://middle.com'}, ' end'],
['start ', {link: 'https://middle.com/foo/bar'}, ' end'],
['start ', {link: 'https://middle.com/foo/bar?baz=bux'}, ' end'],
@ -193,6 +235,37 @@ describe('detectLinkables', () => {
['website.com.jpg'],
['e.g./foo'],
['website.com.jpg/foo'],
[
'Classic article ',
{
link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
},
],
[
'Classic article ',
{
link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
},
' ',
],
[
{link: 'https://foo.com'},
' ',
{link: 'https://bar.com/whatever'},
' ',
{link: 'https://baz.com'},
],
[
'punctuation ',
{link: 'https://foo.com'},
', ',
{link: 'https://bar.com/whatever'},
'; ',
{link: 'https://baz.com'},
'.',
],
['parenthentical (', {link: 'https://foo.com'}, ')'],
['except for ', {link: 'https://foo.com/thing_(cool)'}],
]
it('correctly handles a set of text inputs', () => {
for (let i = 0; i < inputs.length; i++) {

View File

@ -74,7 +74,7 @@ export function extractEntities(
let ents: Entity[] = []
{
// mentions
const re = /(^|\s)(@)([a-zA-Z0-9\.-]+)(\b)/dg
const re = /(^|\s|\()(@)([a-zA-Z0-9\.-]+)(\b)/dg
while ((match = re.exec(text))) {
if (knownHandles && !knownHandles.has(match[3])) {
continue // not a known handle
@ -94,7 +94,7 @@ export function extractEntities(
{
// links
const re =
/(^|\s)((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))(\b)/dg
/(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/dgm
while ((match = re.exec(text))) {
let value = match[2]
if (!value.startsWith('http')) {
@ -104,13 +104,25 @@ export function extractEntities(
}
value = `https://${value}`
}
const index = {
start: match.indices[2][0], // skip the (^|\s)
end: match.indices[2][1],
}
{
// strip ending puncuation
if (/[.,;!?]$/.test(value)) {
value = value.slice(0, -1)
index.end--
}
if (/[)]$/.test(value) && !value.includes('(')) {
value = value.slice(0, -1)
index.end--
}
}
ents.push({
type: 'link',
value,
index: {
start: match.indices[2][0], // skip the (^|\s)
end: match.indices[2][1],
},
index,
})
}
}
@ -123,7 +135,7 @@ interface DetectedLink {
type DetectedLinkable = string | DetectedLink
export function detectLinkables(text: string): DetectedLinkable[] {
const re =
/((^|\s)@[a-z0-9\.-]*)|((^|\s)https?:\/\/[\S]+)|((^|\s)(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
/((^|\s|\()@[a-z0-9\.-]*)|((^|\s|\()https?:\/\/[\S]+)|((^|\s|\()(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
const segments = []
let match
let start = 0
@ -135,7 +147,7 @@ export function detectLinkables(text: string): DetectedLinkable[] {
continue
}
if (/\s/.test(matchValue)) {
if (/\s|\(/.test(matchValue)) {
// HACK
// skip the starting space
// we have to do this because RN doesnt support negative lookaheads
@ -144,6 +156,16 @@ export function detectLinkables(text: string): DetectedLinkable[] {
matchValue = matchValue.slice(1)
}
{
// strip ending puncuation
if (/[.,;!?]$/.test(matchValue)) {
matchValue = matchValue.slice(0, -1)
}
if (/[)]$/.test(matchValue) && !matchValue.includes('(')) {
matchValue = matchValue.slice(0, -1)
}
}
if (start !== matchIndex) {
segments.push(text.slice(start, matchIndex))
}