From 7ae1bac6208c6c00363aae344c76261a28433614 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Tue, 29 Nov 2022 10:01:57 -0600 Subject: [PATCH] Improvements to link and mention detection regarding surrounding punctuation --- __tests__/string-utils.ts | 73 +++++++++++++++++++++++++++++++++++++++ src/lib/strings.ts | 38 +++++++++++++++----- 2 files changed, 103 insertions(+), 8 deletions(-) diff --git a/__tests__/string-utils.ts b/__tests__/string-utils.ts index fc7a8f27..a1bd59fe 100644 --- a/__tests__/string-utils.ts +++ b/__tests__/string-utils.ts @@ -16,6 +16,7 @@ describe('extractEntities', () => { 'not@right', '@handle.com!@#$chars', '@handle.com\n@handle.com', + 'parenthetical (@handle.com)', 'start https://middle.com end', 'start https://middle.com/foo/bar end', 'start https://middle.com/foo/bar?baz=bux end', @@ -36,6 +37,12 @@ describe('extractEntities', () => { 'website.com.jpg', 'e.g./foo', 'website.com.jpg/foo', + 'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/', + 'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ', + 'https://foo.com https://bar.com/whatever https://baz.com', + 'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.', + 'parenthentical (https://foo.com)', + 'except for https://foo.com/thing_(cool)', ] interface Output { type: string @@ -64,6 +71,7 @@ describe('extractEntities', () => { {type: 'mention', value: 'handle.com'}, {type: 'mention', value: 'handle.com'}, ], + [{type: 'mention', value: 'handle.com'}], [{type: 'link', value: 'https://middle.com'}], [{type: 'link', value: 'https://middle.com/foo/bar'}], [{type: 'link', value: 'https://middle.com/foo/bar?baz=bux'}], @@ -90,6 +98,32 @@ describe('extractEntities', () => { [], [], [], + [ + { + type: 'link', + value: + 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/', + }, + ], + [ + { + type: 'link', + value: + 
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/', + }, + ], + [ + {type: 'link', value: 'https://foo.com'}, + {type: 'link', value: 'https://bar.com/whatever'}, + {type: 'link', value: 'https://baz.com'}, + ], + [ + {type: 'link', value: 'https://foo.com'}, + {type: 'link', value: 'https://bar.com/whatever'}, + {type: 'link', value: 'https://baz.com'}, + ], + [{type: 'link', value: 'https://foo.com'}], + [{type: 'link', value: 'https://foo.com/thing_(cool)'}], ] it('correctly handles a set of text inputs', () => { for (let i = 0; i < inputs.length; i++) { @@ -140,6 +174,7 @@ describe('detectLinkables', () => { 'not@right', '@bad!@#$chars', '@newline1\n@newline2', + 'parenthetical (@handle)', 'start https://middle.com end', 'start https://middle.com/foo/bar end', 'start https://middle.com/foo/bar?baz=bux end', @@ -161,6 +196,12 @@ describe('detectLinkables', () => { 'website.com.jpg', 'e.g./foo', 'website.com.jpg/foo', + 'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/', + 'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ', + 'https://foo.com https://bar.com/whatever https://baz.com', + 'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.', + 'parenthentical (https://foo.com)', + 'except for https://foo.com/thing_(cool)', ] const outputs = [ ['no linkable'], @@ -172,6 +213,7 @@ describe('detectLinkables', () => { ['not@right'], [{link: '@bad'}, '!@#$chars'], [{link: '@newline1'}, '\n', {link: '@newline2'}], + ['parenthetical (', {link: '@handle'}, ')'], ['start ', {link: 'https://middle.com'}, ' end'], ['start ', {link: 'https://middle.com/foo/bar'}, ' end'], ['start ', {link: 'https://middle.com/foo/bar?baz=bux'}, ' end'], @@ -193,6 +235,37 @@ describe('detectLinkables', () => { ['website.com.jpg'], ['e.g./foo'], ['website.com.jpg/foo'], + [ + 'Classic article ', + { + link: 
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/', + }, + ], + [ + 'Classic article ', + { + link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/', + }, + ' ', + ], + [ + {link: 'https://foo.com'}, + ' ', + {link: 'https://bar.com/whatever'}, + ' ', + {link: 'https://baz.com'}, + ], + [ + 'punctuation ', + {link: 'https://foo.com'}, + ', ', + {link: 'https://bar.com/whatever'}, + '; ', + {link: 'https://baz.com'}, + '.', + ], + ['parenthentical (', {link: 'https://foo.com'}, ')'], + ['except for ', {link: 'https://foo.com/thing_(cool)'}], ] it('correctly handles a set of text inputs', () => { for (let i = 0; i < inputs.length; i++) { diff --git a/src/lib/strings.ts b/src/lib/strings.ts index fb9d15b2..66dd5970 100644 --- a/src/lib/strings.ts +++ b/src/lib/strings.ts @@ -74,7 +74,7 @@ export function extractEntities( let ents: Entity[] = [] { // mentions - const re = /(^|\s)(@)([a-zA-Z0-9\.-]+)(\b)/dg + const re = /(^|\s|\()(@)([a-zA-Z0-9\.-]+)(\b)/dg while ((match = re.exec(text))) { if (knownHandles && !knownHandles.has(match[3])) { continue // not a known handle @@ -94,7 +94,7 @@ export function extractEntities( { // links const re = - /(^|\s)((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))(\b)/dg + /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/dgm while ((match = re.exec(text))) { let value = match[2] if (!value.startsWith('http')) { @@ -104,13 +104,25 @@ export function extractEntities( } value = `https://${value}` } + const index = { + start: match.indices[2][0], // skip the (^|\s) + end: match.indices[2][1], + } + { + // strip ending punctuation + if (/[.,;!?]$/.test(value)) { + value = value.slice(0, -1) + index.end-- + } + if (/[)]$/.test(value) && !value.includes('(')) { + value = value.slice(0, -1) + index.end-- + } + } ents.push({ type: 'link', value, - index: { - start: match.indices[2][0], // skip the (^|\s) - end: match.indices[2][1], - }, + index, }) } } 
@@ -123,7 +135,7 @@ interface DetectedLink { type DetectedLinkable = string | DetectedLink export function detectLinkables(text: string): DetectedLinkable[] { const re = - /((^|\s)@[a-z0-9\.-]*)|((^|\s)https?:\/\/[\S]+)|((^|\s)(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi + /((^|\s|\()@[a-z0-9\.-]*)|((^|\s|\()https?:\/\/[\S]+)|((^|\s|\()(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi const segments = [] let match let start = 0 @@ -135,7 +147,7 @@ export function detectLinkables(text: string): DetectedLinkable[] { continue } - if (/\s/.test(matchValue)) { + if (/\s|\(/.test(matchValue)) { // HACK // skip the starting space // we have to do this because RN doesnt support negative lookaheads @@ -144,6 +156,16 @@ export function detectLinkables(text: string): DetectedLinkable[] { matchValue = matchValue.slice(1) } + { + // strip ending punctuation + if (/[.,;!?]$/.test(matchValue)) { + matchValue = matchValue.slice(0, -1) + } + if (/[)]$/.test(matchValue) && !matchValue.includes('(')) { + matchValue = matchValue.slice(0, -1) + } + } + if (start !== matchIndex) { segments.push(text.slice(start, matchIndex)) }