Improvements to link and mention detection regarding surrounding punctuation
parent
2ccbe4f702
commit
7ae1bac620
|
@ -16,6 +16,7 @@ describe('extractEntities', () => {
|
|||
'not@right',
|
||||
'@handle.com!@#$chars',
|
||||
'@handle.com\n@handle.com',
|
||||
'parenthetical (@handle.com)',
|
||||
'start https://middle.com end',
|
||||
'start https://middle.com/foo/bar end',
|
||||
'start https://middle.com/foo/bar?baz=bux end',
|
||||
|
@ -36,6 +37,12 @@ describe('extractEntities', () => {
|
|||
'website.com.jpg',
|
||||
'e.g./foo',
|
||||
'website.com.jpg/foo',
|
||||
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
|
||||
'https://foo.com https://bar.com/whatever https://baz.com',
|
||||
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
|
||||
'parenthentical (https://foo.com)',
|
||||
'except for https://foo.com/thing_(cool)',
|
||||
]
|
||||
interface Output {
|
||||
type: string
|
||||
|
@ -64,6 +71,7 @@ describe('extractEntities', () => {
|
|||
{type: 'mention', value: 'handle.com'},
|
||||
{type: 'mention', value: 'handle.com'},
|
||||
],
|
||||
[{type: 'mention', value: 'handle.com'}],
|
||||
[{type: 'link', value: 'https://middle.com'}],
|
||||
[{type: 'link', value: 'https://middle.com/foo/bar'}],
|
||||
[{type: 'link', value: 'https://middle.com/foo/bar?baz=bux'}],
|
||||
|
@ -90,6 +98,32 @@ describe('extractEntities', () => {
|
|||
[],
|
||||
[],
|
||||
[],
|
||||
[
|
||||
{
|
||||
type: 'link',
|
||||
value:
|
||||
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||
},
|
||||
],
|
||||
[
|
||||
{
|
||||
type: 'link',
|
||||
value:
|
||||
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||
},
|
||||
],
|
||||
[
|
||||
{type: 'link', value: 'https://foo.com'},
|
||||
{type: 'link', value: 'https://bar.com/whatever'},
|
||||
{type: 'link', value: 'https://baz.com'},
|
||||
],
|
||||
[
|
||||
{type: 'link', value: 'https://foo.com'},
|
||||
{type: 'link', value: 'https://bar.com/whatever'},
|
||||
{type: 'link', value: 'https://baz.com'},
|
||||
],
|
||||
[{type: 'link', value: 'https://foo.com'}],
|
||||
[{type: 'link', value: 'https://foo.com/thing_(cool)'}],
|
||||
]
|
||||
it('correctly handles a set of text inputs', () => {
|
||||
for (let i = 0; i < inputs.length; i++) {
|
||||
|
@ -140,6 +174,7 @@ describe('detectLinkables', () => {
|
|||
'not@right',
|
||||
'@bad!@#$chars',
|
||||
'@newline1\n@newline2',
|
||||
'parenthetical (@handle)',
|
||||
'start https://middle.com end',
|
||||
'start https://middle.com/foo/bar end',
|
||||
'start https://middle.com/foo/bar?baz=bux end',
|
||||
|
@ -161,6 +196,12 @@ describe('detectLinkables', () => {
|
|||
'website.com.jpg',
|
||||
'e.g./foo',
|
||||
'website.com.jpg/foo',
|
||||
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
|
||||
'https://foo.com https://bar.com/whatever https://baz.com',
|
||||
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
|
||||
'parenthentical (https://foo.com)',
|
||||
'except for https://foo.com/thing_(cool)',
|
||||
]
|
||||
const outputs = [
|
||||
['no linkable'],
|
||||
|
@ -172,6 +213,7 @@ describe('detectLinkables', () => {
|
|||
['not@right'],
|
||||
[{link: '@bad'}, '!@#$chars'],
|
||||
[{link: '@newline1'}, '\n', {link: '@newline2'}],
|
||||
['parenthetical (', {link: '@handle'}, ')'],
|
||||
['start ', {link: 'https://middle.com'}, ' end'],
|
||||
['start ', {link: 'https://middle.com/foo/bar'}, ' end'],
|
||||
['start ', {link: 'https://middle.com/foo/bar?baz=bux'}, ' end'],
|
||||
|
@ -193,6 +235,37 @@ describe('detectLinkables', () => {
|
|||
['website.com.jpg'],
|
||||
['e.g./foo'],
|
||||
['website.com.jpg/foo'],
|
||||
[
|
||||
'Classic article ',
|
||||
{
|
||||
link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||
},
|
||||
],
|
||||
[
|
||||
'Classic article ',
|
||||
{
|
||||
link: 'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
|
||||
},
|
||||
' ',
|
||||
],
|
||||
[
|
||||
{link: 'https://foo.com'},
|
||||
' ',
|
||||
{link: 'https://bar.com/whatever'},
|
||||
' ',
|
||||
{link: 'https://baz.com'},
|
||||
],
|
||||
[
|
||||
'punctuation ',
|
||||
{link: 'https://foo.com'},
|
||||
', ',
|
||||
{link: 'https://bar.com/whatever'},
|
||||
'; ',
|
||||
{link: 'https://baz.com'},
|
||||
'.',
|
||||
],
|
||||
['parenthentical (', {link: 'https://foo.com'}, ')'],
|
||||
['except for ', {link: 'https://foo.com/thing_(cool)'}],
|
||||
]
|
||||
it('correctly handles a set of text inputs', () => {
|
||||
for (let i = 0; i < inputs.length; i++) {
|
||||
|
|
|
@ -74,7 +74,7 @@ export function extractEntities(
|
|||
let ents: Entity[] = []
|
||||
{
|
||||
// mentions
|
||||
const re = /(^|\s)(@)([a-zA-Z0-9\.-]+)(\b)/dg
|
||||
const re = /(^|\s|\()(@)([a-zA-Z0-9\.-]+)(\b)/dg
|
||||
while ((match = re.exec(text))) {
|
||||
if (knownHandles && !knownHandles.has(match[3])) {
|
||||
continue // not a known handle
|
||||
|
@ -94,7 +94,7 @@ export function extractEntities(
|
|||
{
|
||||
// links
|
||||
const re =
|
||||
/(^|\s)((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))(\b)/dg
|
||||
/(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/dgm
|
||||
while ((match = re.exec(text))) {
|
||||
let value = match[2]
|
||||
if (!value.startsWith('http')) {
|
||||
|
@ -104,13 +104,25 @@ export function extractEntities(
|
|||
}
|
||||
value = `https://${value}`
|
||||
}
|
||||
const index = {
|
||||
start: match.indices[2][0], // skip the leading (^|\s|\() group
|
||||
end: match.indices[2][1],
|
||||
}
|
||||
{
|
||||
// strip ending punctuation
|
||||
if (/[.,;!?]$/.test(value)) {
|
||||
value = value.slice(0, -1)
|
||||
index.end--
|
||||
}
|
||||
if (/[)]$/.test(value) && !value.includes('(')) {
|
||||
value = value.slice(0, -1)
|
||||
index.end--
|
||||
}
|
||||
}
|
||||
ents.push({
|
||||
type: 'link',
|
||||
value,
|
||||
index: {
|
||||
start: match.indices[2][0], // skip the (^|\s)
|
||||
end: match.indices[2][1],
|
||||
},
|
||||
index,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -123,7 +135,7 @@ interface DetectedLink {
|
|||
type DetectedLinkable = string | DetectedLink
|
||||
export function detectLinkables(text: string): DetectedLinkable[] {
|
||||
const re =
|
||||
/((^|\s)@[a-z0-9\.-]*)|((^|\s)https?:\/\/[\S]+)|((^|\s)(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
|
||||
/((^|\s|\()@[a-z0-9\.-]*)|((^|\s|\()https?:\/\/[\S]+)|((^|\s|\()(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
|
||||
const segments = []
|
||||
let match
|
||||
let start = 0
|
||||
|
@ -135,7 +147,7 @@ export function detectLinkables(text: string): DetectedLinkable[] {
|
|||
continue
|
||||
}
|
||||
|
||||
if (/\s/.test(matchValue)) {
|
||||
if (/\s|\(/.test(matchValue)) {
|
||||
// HACK
|
||||
// skip the starting space
|
||||
// we have to do this because RN doesn't support negative lookaheads
|
||||
|
@ -144,6 +156,16 @@ export function detectLinkables(text: string): DetectedLinkable[] {
|
|||
matchValue = matchValue.slice(1)
|
||||
}
|
||||
|
||||
{
|
||||
// strip ending punctuation
|
||||
if (/[.,;!?]$/.test(matchValue)) {
|
||||
matchValue = matchValue.slice(0, -1)
|
||||
}
|
||||
if (/[)]$/.test(matchValue) && !matchValue.includes('(')) {
|
||||
matchValue = matchValue.slice(0, -1)
|
||||
}
|
||||
}
|
||||
|
||||
if (start !== matchIndex) {
|
||||
segments.push(text.slice(start, matchIndex))
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue