Improvements to link and mention detection regarding surrounding punctuation
parent
2ccbe4f702
commit
7ae1bac620
|
@ -16,6 +16,7 @@ describe('extractEntities', () => {
|
||||||
'not@right',
|
'not@right',
|
||||||
'@handle.com!@#$chars',
|
'@handle.com!@#$chars',
|
||||||
'@handle.com\n@handle.com',
|
'@handle.com\n@handle.com',
|
||||||
|
'parenthetical (@handle.com)',
|
||||||
'start https://middle.com end',
|
'start https://middle.com end',
|
||||||
'start https://middle.com/foo/bar end',
|
'start https://middle.com/foo/bar end',
|
||||||
'start https://middle.com/foo/bar?baz=bux end',
|
'start https://middle.com/foo/bar?baz=bux end',
|
||||||
|
@ -36,6 +37,12 @@ describe('extractEntities', () => {
|
||||||
'website.com.jpg',
|
'website.com.jpg',
|
||||||
'e.g./foo',
|
'e.g./foo',
|
||||||
'website.com.jpg/foo',
|
'website.com.jpg/foo',
|
||||||
|
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||||
|
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
|
||||||
|
'https://foo.com https://bar.com/whatever https://baz.com',
|
||||||
|
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
|
||||||
|
'parenthentical (https://foo.com)',
|
||||||
|
'except for https://foo.com/thing_(cool)',
|
||||||
]
|
]
|
||||||
interface Output {
|
interface Output {
|
||||||
type: string
|
type: string
|
||||||
|
@ -64,6 +71,7 @@ describe('extractEntities', () => {
|
||||||
{type: 'mention', value: 'handle.com'},
|
{type: 'mention', value: 'handle.com'},
|
||||||
{type: 'mention', value: 'handle.com'},
|
{type: 'mention', value: 'handle.com'},
|
||||||
],
|
],
|
||||||
|
[{type: 'mention', value: 'handle.com'}],
|
||||||
[{type: 'link', value: 'https://middle.com'}],
|
[{type: 'link', value: 'https://middle.com'}],
|
||||||
[{type: 'link', value: 'https://middle.com/foo/bar'}],
|
[{type: 'link', value: 'https://middle.com/foo/bar'}],
|
||||||
[{type: 'link', value: 'https://middle.com/foo/bar?baz=bux'}],
|
[{type: 'link', value: 'https://middle.com/foo/bar?baz=bux'}],
|
||||||
|
@ -90,6 +98,32 @@ describe('extractEntities', () => {
|
||||||
[],
|
[],
|
||||||
[],
|
[],
|
||||||
[],
|
[],
|
||||||
|
[
|
||||||
|
{
|
||||||
|
type: 'link',
|
||||||
|
value:
|
||||||
|
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{
|
||||||
|
type: 'link',
|
||||||
|
value:
|
||||||
|
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{type: 'link', value: 'https://foo.com'},
|
||||||
|
{type: 'link', value: 'https://bar.com/whatever'},
|
||||||
|
{type: 'link', value: 'https://baz.com'},
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{type: 'link', value: 'https://foo.com'},
|
||||||
|
{type: 'link', value: 'https://bar.com/whatever'},
|
||||||
|
{type: 'link', value: 'https://baz.com'},
|
||||||
|
],
|
||||||
|
[{type: 'link', value: 'https://foo.com'}],
|
||||||
|
[{type: 'link', value: 'https://foo.com/thing_(cool)'}],
|
||||||
]
|
]
|
||||||
it('correctly handles a set of text inputs', () => {
|
it('correctly handles a set of text inputs', () => {
|
||||||
for (let i = 0; i < inputs.length; i++) {
|
for (let i = 0; i < inputs.length; i++) {
|
||||||
|
@ -140,6 +174,7 @@ describe('detectLinkables', () => {
|
||||||
'not@right',
|
'not@right',
|
||||||
'@bad!@#$chars',
|
'@bad!@#$chars',
|
||||||
'@newline1\n@newline2',
|
'@newline1\n@newline2',
|
||||||
|
'parenthetical (@handle)',
|
||||||
'start https://middle.com end',
|
'start https://middle.com end',
|
||||||
'start https://middle.com/foo/bar end',
|
'start https://middle.com/foo/bar end',
|
||||||
'start https://middle.com/foo/bar?baz=bux end',
|
'start https://middle.com/foo/bar?baz=bux end',
|
||||||
|
@ -161,6 +196,12 @@ describe('detectLinkables', () => {
|
||||||
'website.com.jpg',
|
'website.com.jpg',
|
||||||
'e.g./foo',
|
'e.g./foo',
|
||||||
'website.com.jpg/foo',
|
'website.com.jpg/foo',
|
||||||
|
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||||
|
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
|
||||||
|
'https://foo.com https://bar.com/whatever https://baz.com',
|
||||||
|
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
|
||||||
|
'parenthentical (https://foo.com)',
|
||||||
|
'except for https://foo.com/thing_(cool)',
|
||||||
]
|
]
|
||||||
const outputs = [
|
const outputs = [
|
||||||
['no linkable'],
|
['no linkable'],
|
||||||
|
@ -172,6 +213,7 @@ describe('detectLinkables', () => {
|
||||||
['not@right'],
|
['not@right'],
|
||||||
[{link: '@bad'}, '!@#$chars'],
|
[{link: '@bad'}, '!@#$chars'],
|
||||||
[{link: '@newline1'}, '\n', {link: '@newline2'}],
|
[{link: '@newline1'}, '\n', {link: '@newline2'}],
|
||||||
|
['parenthetical (', {link: '@handle'}, ')'],
|
||||||
['start ', {link: 'https://middle.com'}, ' end'],
|
['start ', {link: 'https://middle.com'}, ' end'],
|
||||||
['start ', {link: 'https://middle.com/foo/bar'}, ' end'],
|
['start ', {link: 'https://middle.com/foo/bar'}, ' end'],
|
||||||
['start ', {link: 'https://middle.com/foo/bar?baz=bux'}, ' end'],
|
['start ', {link: 'https://middle.com/foo/bar?baz=bux'}, ' end'],
|
||||||
|
@ -193,6 +235,37 @@ describe('detectLinkables', () => {
|
||||||
['website.com.jpg'],
|
['website.com.jpg'],
|
||||||
['e.g./foo'],
|
['e.g./foo'],
|
||||||
['website.com.jpg/foo'],
|
['website.com.jpg/foo'],
|
||||||
|
[
|
||||||
|
'Classic article ',
|
||||||
|
{
|
||||||
|
link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
[
|
||||||
|
'Classic article ',
|
||||||
|
{
|
||||||
|
link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||||
|
},
|
||||||
|
' ',
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{link: 'https://foo.com'},
|
||||||
|
' ',
|
||||||
|
{link: 'https://bar.com/whatever'},
|
||||||
|
' ',
|
||||||
|
{link: 'https://baz.com'},
|
||||||
|
],
|
||||||
|
[
|
||||||
|
'punctuation ',
|
||||||
|
{link: 'https://foo.com'},
|
||||||
|
', ',
|
||||||
|
{link: 'https://bar.com/whatever'},
|
||||||
|
'; ',
|
||||||
|
{link: 'https://baz.com'},
|
||||||
|
'.',
|
||||||
|
],
|
||||||
|
['parenthentical (', {link: 'https://foo.com'}, ')'],
|
||||||
|
['except for ', {link: 'https://foo.com/thing_(cool)'}],
|
||||||
]
|
]
|
||||||
it('correctly handles a set of text inputs', () => {
|
it('correctly handles a set of text inputs', () => {
|
||||||
for (let i = 0; i < inputs.length; i++) {
|
for (let i = 0; i < inputs.length; i++) {
|
||||||
|
|
|
@ -74,7 +74,7 @@ export function extractEntities(
|
||||||
let ents: Entity[] = []
|
let ents: Entity[] = []
|
||||||
{
|
{
|
||||||
// mentions
|
// mentions
|
||||||
const re = /(^|\s)(@)([a-zA-Z0-9\.-]+)(\b)/dg
|
const re = /(^|\s|\()(@)([a-zA-Z0-9\.-]+)(\b)/dg
|
||||||
while ((match = re.exec(text))) {
|
while ((match = re.exec(text))) {
|
||||||
if (knownHandles && !knownHandles.has(match[3])) {
|
if (knownHandles && !knownHandles.has(match[3])) {
|
||||||
continue // not a known handle
|
continue // not a known handle
|
||||||
|
@ -94,7 +94,7 @@ export function extractEntities(
|
||||||
{
|
{
|
||||||
// links
|
// links
|
||||||
const re =
|
const re =
|
||||||
/(^|\s)((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))(\b)/dg
|
/(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/dgm
|
||||||
while ((match = re.exec(text))) {
|
while ((match = re.exec(text))) {
|
||||||
let value = match[2]
|
let value = match[2]
|
||||||
if (!value.startsWith('http')) {
|
if (!value.startsWith('http')) {
|
||||||
|
@ -104,13 +104,25 @@ export function extractEntities(
|
||||||
}
|
}
|
||||||
value = `https://${value}`
|
value = `https://${value}`
|
||||||
}
|
}
|
||||||
|
const index = {
|
||||||
|
start: match.indices[2][0], // skip the leading (^|\s|\() group
|
||||||
|
end: match.indices[2][1],
|
||||||
|
}
|
||||||
|
{
|
||||||
|
// strip ending punctuation
|
||||||
|
if (/[.,;!?]$/.test(value)) {
|
||||||
|
value = value.slice(0, -1)
|
||||||
|
index.end--
|
||||||
|
}
|
||||||
|
if (/[)]$/.test(value) && !value.includes('(')) {
|
||||||
|
value = value.slice(0, -1)
|
||||||
|
index.end--
|
||||||
|
}
|
||||||
|
}
|
||||||
ents.push({
|
ents.push({
|
||||||
type: 'link',
|
type: 'link',
|
||||||
value,
|
value,
|
||||||
index: {
|
index,
|
||||||
start: match.indices[2][0], // skip the (^|\s)
|
|
||||||
end: match.indices[2][1],
|
|
||||||
},
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -123,7 +135,7 @@ interface DetectedLink {
|
||||||
type DetectedLinkable = string | DetectedLink
|
type DetectedLinkable = string | DetectedLink
|
||||||
export function detectLinkables(text: string): DetectedLinkable[] {
|
export function detectLinkables(text: string): DetectedLinkable[] {
|
||||||
const re =
|
const re =
|
||||||
/((^|\s)@[a-z0-9\.-]*)|((^|\s)https?:\/\/[\S]+)|((^|\s)(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
|
/((^|\s|\()@[a-z0-9\.-]*)|((^|\s|\()https?:\/\/[\S]+)|((^|\s|\()(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
|
||||||
const segments = []
|
const segments = []
|
||||||
let match
|
let match
|
||||||
let start = 0
|
let start = 0
|
||||||
|
@ -135,7 +147,7 @@ export function detectLinkables(text: string): DetectedLinkable[] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if (/\s/.test(matchValue)) {
|
if (/\s|\(/.test(matchValue)) {
|
||||||
// HACK
|
// HACK
|
||||||
// skip the starting space
|
// skip the starting space or opening paren
|
||||||
// we have to do this because RN doesn't support negative lookaheads
|
// we have to do this because RN doesn't support negative lookaheads
|
||||||
|
@ -144,6 +156,16 @@ export function detectLinkables(text: string): DetectedLinkable[] {
|
||||||
matchValue = matchValue.slice(1)
|
matchValue = matchValue.slice(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// strip ending punctuation
|
||||||
|
if (/[.,;!?]$/.test(matchValue)) {
|
||||||
|
matchValue = matchValue.slice(0, -1)
|
||||||
|
}
|
||||||
|
if (/[)]$/.test(matchValue) && !matchValue.includes('(')) {
|
||||||
|
matchValue = matchValue.slice(0, -1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (start !== matchIndex) {
|
if (start !== matchIndex) {
|
||||||
segments.push(text.slice(start, matchIndex))
|
segments.push(text.slice(start, matchIndex))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue