Files
atproto/packages/api/tests/rich-text-detection.test.ts
Samuel Newman 3ffebd0bf2 Add cashtag detection support to rich text parser (#4539)
Implements detection and facet generation for cashtags (stock tickers) like $AAPL
and $BTC. Cashtags are identified by a dollar sign followed by 1-5 alphanumeric
characters, with the first character being a letter. Detected cashtags are
normalized to uppercase and included as tag facets in the rich text output.

Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com>
2026-01-12 10:59:28 -08:00

457 lines
14 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { AtpAgent, RichText, RichTextSegment } from '../src'
import {
isLink,
isMention,
isTag,
} from '../src/client/types/app/bsky/richtext/facet'
describe('detectFacets', () => {
// Agent shared by every test below; only its handle resolver is exercised.
const agent = new AtpAgent({ service: 'http://localhost' })
// Replace handle resolution with a stub: each handle maps to a fake DID built
// from the handle itself, which is what the expected mention outputs contain.
agent.com.atproto.identity.resolveHandle = async (params) => {
  const did = `did:fake:${params?.handle}`
  return {
    success: true,
    headers: {},
    data: { did },
  }
}
// Post texts fed to mention/link detection. Entry N here is checked against
// entry N of `outputs`, so both arrays must keep the same order and length.
const inputs = [
// Mentions: position in the text, repetition, and surrounding punctuation.
'no mention',
'@handle.com middle end',
'start @handle.com end',
'start middle @handle.com',
'@handle.com @handle.com @handle.com',
'@full123-chars.test',
'not@right',
'@handle.com!@#$chars',
'@handle.com\n@handle.com',
'parenthetical (@handle.com)',
'👨‍👩‍👧‍👧 @handle.com 👨‍👩‍👧‍👧',
// Links written with an explicit https:// scheme.
'start https://middle.com end',
'start https://middle.com/foo/bar end',
'start https://middle.com/foo/bar?baz=bux end',
'start https://middle.com/foo/bar?baz=bux#hash end',
'https://start.com/foo/bar?baz=bux#hash middle end',
'start middle https://end.com/foo/bar?baz=bux#hash',
'https://newline1.com\nhttps://newline2.com',
'👨‍👩‍👧‍👧 https://middle.com 👨‍👩‍👧‍👧',
// Bare domains without a scheme.
'start middle.com end',
'start middle.com/foo/bar end',
'start middle.com/foo/bar?baz=bux end',
'start middle.com/foo/bar?baz=bux#hash end',
'start.com/foo/bar?baz=bux#hash middle end',
'start middle end.com/foo/bar?baz=bux#hash',
'newline1.com\nnewline2.com',
'a example.com/index.php php link',
'a trailing bsky.app: colon',
// Dotted text that is expected to stay plain (no link facet).
'not.. a..url ..here',
'e.g.',
'something-cool.jpg',
'website.com.jpg',
'e.g./foo',
'website.com.jpg/foo',
// URLs followed by whitespace, punctuation, or parentheses.
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
'Classic article https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/ ',
'https://foo.com https://bar.com/whatever https://baz.com',
'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.',
'parenthentical (https://foo.com)',
'except for https://foo.com/thing_(cool)',
]
// Expected segmentation for each entry of `inputs` (matched by index).
// Every post is a list of segments in the shape produced by `segmentToOutput`:
//   [text]          - a segment without a facet
//   [text, target]  - a faceted segment; target is the mention's DID or the
//                     link's URI
const outputs: string[][][] = [
// Mentions: the DID is `did:fake:<handle>` as produced by the stubbed resolver.
[['no mention']],
[['@handle.com', 'did:fake:handle.com'], [' middle end']],
[['start '], ['@handle.com', 'did:fake:handle.com'], [' end']],
[['start middle '], ['@handle.com', 'did:fake:handle.com']],
[
['@handle.com', 'did:fake:handle.com'],
[' '],
['@handle.com', 'did:fake:handle.com'],
[' '],
['@handle.com', 'did:fake:handle.com'],
],
[['@full123-chars.test', 'did:fake:full123-chars.test']],
[['not@right']],
[['@handle.com', 'did:fake:handle.com'], ['!@#$chars']],
[
['@handle.com', 'did:fake:handle.com'],
['\n'],
['@handle.com', 'did:fake:handle.com'],
],
[['parenthetical ('], ['@handle.com', 'did:fake:handle.com'], [')']],
[['👨‍👩‍👧‍👧 '], ['@handle.com', 'did:fake:handle.com'], [' 👨‍👩‍👧‍👧']],
// Links with an explicit scheme: the URI equals the matched text.
[['start '], ['https://middle.com', 'https://middle.com'], [' end']],
[
['start '],
['https://middle.com/foo/bar', 'https://middle.com/foo/bar'],
[' end'],
],
[
['start '],
[
'https://middle.com/foo/bar?baz=bux',
'https://middle.com/foo/bar?baz=bux',
],
[' end'],
],
[
['start '],
[
'https://middle.com/foo/bar?baz=bux#hash',
'https://middle.com/foo/bar?baz=bux#hash',
],
[' end'],
],
[
[
'https://start.com/foo/bar?baz=bux#hash',
'https://start.com/foo/bar?baz=bux#hash',
],
[' middle end'],
],
[
['start middle '],
[
'https://end.com/foo/bar?baz=bux#hash',
'https://end.com/foo/bar?baz=bux#hash',
],
],
[
['https://newline1.com', 'https://newline1.com'],
['\n'],
['https://newline2.com', 'https://newline2.com'],
],
[['👨‍👩‍👧‍👧 '], ['https://middle.com', 'https://middle.com'], [' 👨‍👩‍👧‍👧']],
// Bare domains: the segment text is unchanged, the URI gains `https://`.
[['start '], ['middle.com', 'https://middle.com'], [' end']],
[
['start '],
['middle.com/foo/bar', 'https://middle.com/foo/bar'],
[' end'],
],
[
['start '],
['middle.com/foo/bar?baz=bux', 'https://middle.com/foo/bar?baz=bux'],
[' end'],
],
[
['start '],
[
'middle.com/foo/bar?baz=bux#hash',
'https://middle.com/foo/bar?baz=bux#hash',
],
[' end'],
],
[
[
'start.com/foo/bar?baz=bux#hash',
'https://start.com/foo/bar?baz=bux#hash',
],
[' middle end'],
],
[
['start middle '],
['end.com/foo/bar?baz=bux#hash', 'https://end.com/foo/bar?baz=bux#hash'],
],
[
['newline1.com', 'https://newline1.com'],
['\n'],
['newline2.com', 'https://newline2.com'],
],
[
['a '],
['example.com/index.php', 'https://example.com/index.php'],
[' php link'],
],
[['a trailing '], ['bsky.app', 'https://bsky.app'], [': colon']],
// Dotted text that must remain a single plain segment.
[['not.. a..url ..here']],
[['e.g.']],
[['something-cool.jpg']],
[['website.com.jpg']],
[['e.g./foo']],
[['website.com.jpg/foo']],
// Trailing whitespace and punctuation stay outside the link; a trailing slash
// and a closing parenthesis that is part of the path stay inside it.
[
['Classic article '],
[
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
],
],
[
['Classic article '],
[
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
'https://socket3.wordpress.com/2018/02/03/designing-windows-95s-user-interface/',
],
[' '],
],
[
['https://foo.com', 'https://foo.com'],
[' '],
['https://bar.com/whatever', 'https://bar.com/whatever'],
[' '],
['https://baz.com', 'https://baz.com'],
],
[
['punctuation '],
['https://foo.com', 'https://foo.com'],
[', '],
['https://bar.com/whatever', 'https://bar.com/whatever'],
['; '],
['https://baz.com', 'https://baz.com'],
['.'],
],
[['parenthentical ('], ['https://foo.com', 'https://foo.com'], [')']],
[
['except for '],
['https://foo.com/thing_(cool)', 'https://foo.com/thing_(cool)'],
],
]
// Runs every entry of `inputs` through facet detection (mentions resolved via
// the stubbed agent) and compares the resulting segments with the entry at the
// same index in `outputs`.
it('correctly handles a set of text inputs', async () => {
  // The fixtures are parallel arrays: fail loudly if they drift apart rather
  // than silently ignoring surplus expectations.
  expect(outputs).toHaveLength(inputs.length)

  for (const [index, input] of inputs.entries()) {
    const rt = new RichText({ text: input })
    await rt.detectFacets(agent)
    const segments = Array.from(rt.segments(), segmentToOutput)
    // The input is carried on both sides so a failing diff names the fixture
    // instead of only showing mismatching segment arrays.
    expect({ input, segments }).toEqual({ input, segments: outputs[index] })
  }
})
// Table-driven checks for hashtag detection.
describe('correctly detects tags inline', () => {
  type ByteRange = { byteStart: number; byteEnd: number }

  // 39 filler characters bring the tag below to exactly 64 characters, the
  // longest tag these fixtures expect to be detected; one more character (65)
  // is expected to produce no tag at all.
  const tag64 = 'thisisa64characterstring_' + 'a'.repeat(39)
  const tag65 = 'thisisa65characterstring_' + 'a'.repeat(39) + 'b'

  // Each row: [post text, expected tag values (without the leading '#'),
  // expected facet byte ranges]. The ranges include the leading '#' and count
  // bytes rather than characters (see the emoji rows).
  //
  // Invisible or easily-confused code points are written as explicit escapes
  // so they cannot be lost or altered by editors, viewers, or copy/paste:
  //   \u200B  zero width space
  //   \u20E2  combining enclosing screen
  //   \uFE0F\u20E3  variation selector-16 + combining enclosing keycap
  //   \uFF03  full-width number sign
  const inputs: [string, string[], ByteRange[]][] = [
    ['#a', ['a'], [{ byteStart: 0, byteEnd: 2 }]],
    [
      '#a #b',
      ['a', 'b'],
      [
        { byteStart: 0, byteEnd: 2 },
        { byteStart: 3, byteEnd: 5 },
      ],
    ],
    ['#1', [], []],
    ['#1a', ['1a'], [{ byteStart: 0, byteEnd: 3 }]],
    ['#tag', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
    ['body #tag', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
    ['#tag body', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
    ['body #tag body', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
    ['body #1', [], []],
    ['body #1a', ['1a'], [{ byteStart: 5, byteEnd: 8 }]],
    ['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]],
    ['#', [], []],
    ['#?', [], []],
    ['text #', [], []],
    ['text # text', [], []],
    [`body #${tag64}`, [tag64], [{ byteStart: 5, byteEnd: 70 }]],
    [`body #${tag65}`, [], []],
    [`body #${tag64}!`, [tag64], [{ byteStart: 5, byteEnd: 70 }]],
    [
      'its a #double#rainbow',
      ['double#rainbow'],
      [{ byteStart: 6, byteEnd: 21 }],
    ],
    ['##hashash', ['#hashash'], [{ byteStart: 0, byteEnd: 9 }]],
    ['##', [], []],
    ['some #n0n3s@n5e!', ['n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]],
    [
      'works #with,punctuation',
      ['with,punctuation'],
      [{ byteStart: 6, byteEnd: 23 }],
    ],
    [
      'strips trailing #punctuation, #like. #this!',
      ['punctuation', 'like', 'this'],
      [
        { byteStart: 16, byteEnd: 28 },
        { byteStart: 30, byteEnd: 35 },
        { byteStart: 37, byteEnd: 42 },
      ],
    ],
    [
      'strips #multi_trailing___...',
      ['multi_trailing'],
      [{ byteStart: 7, byteEnd: 22 }],
    ],
    [
      'works with #🦋 emoji, and #butter🦋fly',
      ['🦋', 'butter🦋fly'],
      [
        { byteStart: 11, byteEnd: 16 },
        { byteStart: 28, byteEnd: 42 },
      ],
    ],
    [
      '#same #same #but #diff',
      ['same', 'same', 'but', 'diff'],
      [
        { byteStart: 0, byteEnd: 5 },
        { byteStart: 6, byteEnd: 11 },
        { byteStart: 12, byteEnd: 16 },
        { byteStart: 17, byteEnd: 22 },
      ],
    ],
    // '#' + VS16 + keycap is the keycap emoji, not the start of a tag.
    ['this #\uFE0F\u20E3tag should not be a tag', [], []],
    // A second '#' in front turns the keycap emoji into tag content
    // (1 + 1 + 3 + 3 + 3 = 11 bytes, hence 5..16).
    [
      'this ##\uFE0F\u20E3tag should be a tag',
      ['#\uFE0F\u20E3tag'],
      [{ byteStart: 5, byteEnd: 16 }],
    ],
    ['this #t\nag should be a tag', ['t'], [{ byteStart: 5, byteEnd: 7 }]],
    // The parenthesised part of each title spells out what follows the '#'.
    ['no match (\\u200B): #\u200B', [], []],
    ['no match (\\u200Ba): #\u200Ba', [], []],
    ['match (a\\u200Bb): #a\u200Bb', ['a'], [{ byteStart: 18, byteEnd: 20 }]],
    ['match (ab\\u200B): #ab\u200B', ['ab'], [{ byteStart: 18, byteEnd: 21 }]],
    ['no match (\\u20e2tag): #\u20E2tag', [], []],
    [
      'no match (a\\u20e2b): #a\u20E2b',
      ['a'],
      [{ byteStart: 21, byteEnd: 23 }],
    ],
    // Full-width number sign is 3 bytes, so '\uFF03tag' spans 36..42.
    [
      'match full width number sign (tag): \uFF03tag',
      ['tag'],
      [{ byteStart: 36, byteEnd: 42 }],
    ],
    // 3 (full-width sign) + 1 ('#') + 3 + 3 (VS16, keycap) + 3 ('tag') = 13.
    [
      'match full width number sign (tag): \uFF03#\uFE0F\u20E3tag',
      ['#\uFE0F\u20E3tag'],
      [{ byteStart: 36, byteEnd: 49 }],
    ],
    ['no match 1?: #1?', [], []],
  ]

  it.each(inputs)('%s', async (input, tags, indices) => {
    const rt = new RichText({ text: input })
    await rt.detectFacets(agent)

    const detectedTags: string[] = []
    const detectedIndices: ByteRange[] = []
    for (const { facet } of rt.segments()) {
      if (!facet) continue
      for (const feature of facet.features) {
        if (isTag(feature)) {
          detectedTags.push(feature.tag)
        }
      }
      // Every facet's range is recorded, so any unexpected non-tag facet in
      // these inputs also fails the comparison below.
      detectedIndices.push(facet.index)
    }

    expect(detectedTags).toEqual(tags)
    expect(detectedIndices).toEqual(indices)
  })
})
// Table-driven checks for cashtag detection: '$' followed by 1-5 alphanumeric
// characters starting with a letter, reported as an uppercased tag that keeps
// the '$' prefix.
describe('correctly detects cashtags inline', () => {
  type ByteRange = { byteStart: number; byteEnd: number }

  // Each row: [post text, expected cashtag values, expected facet byte ranges].
  const inputs: [string, string[], ByteRange[]][] = [
    ['$AAPL', ['$AAPL'], [{ byteStart: 0, byteEnd: 5 }]],
    ['$aapl', ['$AAPL'], [{ byteStart: 0, byteEnd: 5 }]], // normalized to uppercase
    ['$A', ['$A'], [{ byteStart: 0, byteEnd: 2 }]],
    ['$a', ['$A'], [{ byteStart: 0, byteEnd: 2 }]], // single char normalized
    [
      '$BTC $ETH',
      ['$BTC', '$ETH'],
      [
        { byteStart: 0, byteEnd: 4 },
        { byteStart: 5, byteEnd: 9 },
      ],
    ],
    ['$100', [], []], // starts with digit - not a cashtag
    ['$GOOGL', ['$GOOGL'], [{ byteStart: 0, byteEnd: 6 }]], // 5 chars - max length
    ['$TOOLONG', [], []], // >5 chars
    ['check $LEGO now', ['$LEGO'], [{ byteStart: 6, byteEnd: 11 }]],
    ['($GOOG)', ['$GOOG'], [{ byteStart: 1, byteEnd: 6 }]],
    ['$AAPL.', ['$AAPL'], [{ byteStart: 0, byteEnd: 5 }]], // trailing punctuation
    [
      '$AAPL, $MSFT!',
      ['$AAPL', '$MSFT'],
      [
        { byteStart: 0, byteEnd: 5 },
        { byteStart: 7, byteEnd: 12 },
      ],
    ],
    ['no$SPACE', [], []], // must have leading space or start
    ['$', [], []], // just dollar sign
    ['$ AAPL', [], []], // space after $
    ['$123ABC', [], []], // starts with digit
    ['$ABC12', ['$ABC12'], [{ byteStart: 0, byteEnd: 6 }]], // digits after letters OK (5 chars)
    ['$ABC123', [], []], // 6 chars - too long
  ]

  it.each(inputs)('%s', (input, tags, indices) => {
    const rt = new RichText({ text: input })
    // Cashtags need no network lookups, so the synchronous detector is enough.
    rt.detectFacetsWithoutResolution()

    const detectedTags: string[] = []
    const detectedIndices: ByteRange[] = []
    for (const { facet } of rt.segments()) {
      if (!facet) continue

      // Collect cashtag features and remember whether this facet had any, so
      // its byte range is recorded at most once and only for cashtag facets.
      let hasCashtag = false
      for (const feature of facet.features) {
        if (isTag(feature) && feature.tag.startsWith('$')) {
          detectedTags.push(feature.tag)
          hasCashtag = true
        }
      }
      if (hasCashtag) {
        detectedIndices.push(facet.index)
      }
    }

    expect(detectedTags).toEqual(tags)
    expect(detectedIndices).toEqual(indices)
  })
})
})
/**
 * Flattens a rich-text segment into the compact array form used by the
 * expected-output fixtures.
 *
 * @param segment - A segment yielded by `RichText#segments()`.
 * @returns `[text]` when the segment has no facet; otherwise `[text, target]`,
 *   where `target` is the DID (mention) or URI (link) of the facet's first
 *   feature, or `''` when that feature is neither (or has an empty value).
 */
function segmentToOutput(segment: RichTextSegment): string[] {
  const { text, facet } = segment
  if (!facet) {
    return [text]
  }

  // Only the first feature decides the target; the others are ignored.
  const primary = facet.features[0]
  let target = ''
  if (primary !== undefined) {
    if (isMention(primary)) {
      target = primary.did
    } else if (isLink(primary)) {
      target = primary.uri
    }
  }
  return [text, target]
}