atproto/packages/api/tests/rich-text-sanitization.test.ts
Daniel Holmgren 7f008c05a0
Lexicon refactor (#658)
* remove return in test

* couple of fixups in other packages

* Add dummy checks to declaration and follow app migrations, remove paranoid join

* update db nsid migration

* Ensure there are writes in follow app migration

* Add dummy check to votes-to-likes app migration, tidy

* Ensure there are writes in vote-to-like app migration

* update migration name

* pr feedback

* count utf8 & grapheme length

* add maxUtf8

* switch max semantics

* plural

* update post schema

* added bytes & cid refs

* add ipld<>json

* fixing up a couple of things

* Add app.bsky.richtext.facet, replace post entities with facets

* plural actors

* wip

* Setup backlinks table on pds

* wip

* send & receive cids/bytes with xrpc

* Track backlinks when indexing records on pds

* handle ipld vals in xrpc server

* added cids & bytes to codegen

* In createRecord, add deletions to avoid duplicate likes/follows/reposts

* Tests and fixes for prevention of dupe follows, likes, reposts

* Backlink migration tidy

* cleanup dag json parser

* Fix dupe backlink inserts

* Tidy

* blob refs + codegen

* Make profile displayName optional

* Test view and updateProfile for empty display name

* working into pds

* Make aggregate counts optional on post and profile views

* Make viewer state optional on post view for consistency

* Remove deprecated myState field on profile view

* Tidy repo method descriptions

* tests & types & fixes

* Implementation and tests for putRecord

* Remove updateProfile method

* Update repo service so that head can be taken for update externally

* Lex updates for compare-and-swap records/commits

* Add error to lex for bad repo compare-and-swaps

* Improve update-at-head thru repo service

* common package

* Implement and test compare-and-swaps on repo write methods

* Use lex discriminator for applyWrites

* Remove post entity/facet index

* Update lex descriptions to clarify repo write semantics

* Make deleteRecord idempotent w/ tests

* cleanup

* fix things up

* adding more formats

* tests

* updating schema

* Only generate tid rkeys on pds, support literal rkeys on client

* Add backlink indexes

* Update format of post embed views, fix external uri validation

* fixing up tests

* Include embeds on record embeds

* cleanup

* Notify users when they are quoted

* Remove determineRkey indirection

* fix api tests

* support concatenated cbor

* integrating to server

* re-enable tests

* fix up tests

* Thread compare-and-swaps down into repo service rather than use pinned storage

* Tidy

* Update packages/common/tests/ipld-multi.test.ts

Co-authored-by: devin ivy <devinivy@gmail.com>

* Update packages/lexicon/src/validators/formats.ts

Co-authored-by: devin ivy <devinivy@gmail.com>

* pr feedback

* pr feedback

* Add postgres-specific migration path for missing profile display names

* Tidy/clarify deep embeds

* Tidy

* rm unused escape

* decrease crud race count

* update subscribeRepos lexicon

* Fix applyWrite lexicon re: collection fields

* sign post event type

* update cids & bytes json encoding

* update lex blob & cid-link types

* updated codegen & pds

* number -> float

* missed a couple

* remove old image constraints

* pr feedback + descripts

* no hardcoded port numbers

* remove separate tooLarge evt

* fix dumb build error

* fixing up lex + xrpc server

* better parsing of message types

* dont mutate body in subscription

* bugfix in subscription

* rm commented out code

* init feature branch

* undo

* Remove old lexicons

* Remove creator from profile view

* wip

* rework seqs

* fixed up tests

* bug fixing

* sequence handles & notify in dbTxn

* tidy

* update lex to include times

* test syncing handle changes

* one more fix

* handle too big evts

* dont thread sequencer through everything

* Split common into server vs web-friendly versions

* Make lexicon, identifier web-safe using common-web

* Switch api package to be a browser build, fix identifier package for browser bundling

* Fix pds and repo for lexicon package changes, tidy

* Make common-web a browser build, tidy

* fixing up deps

* fix up test

* turn off caching in actions

* Standardize repo write interfaces around repo input

* Update repo write endpoints for repo input field

* Remove scene follows during app migration

* API package updates (#712)

* Add bsky agent and various sugars to the api package

* Add richtext library to api package

* Update richtext to use facets and deprecate entities

* Update richtext to use utf8 indices

* Richtext converts deprecated entity indices from utf16 locations to utf8 locations

* Add note about encodings in the lexicon

* Add RichText facet detection

* Remove dead code

* Add deprecation notices to lexicons

* Usability improvements to RichText

* Update the api package readme

* Add RichText#detectFacetsWithoutResolution

* Add upsertProfile to bsky-agent

* Update packages/pds/src/api/com/atproto/repo/applyWrites.ts

Co-authored-by: devin ivy <devinivy@gmail.com>

* pr feedback

* fix flaky timing streaming tests

* simplify emptyPromise

* fixed up open handles

* fix missed repo syntax

* fix error in test from fkey constraint

* fix another api agent bug

* Embed consistency, add complex record embed

* Tidy embed lex descriptions

* rename pg schemas

* use swc for jest

* fix up deps

* cleanup

* Update pds indexing, views, tests for complex record embeds

* fixing up profile view semantics

* wip

* update snaps

* Rename embed.complexRecord to embed.recordWithMedia

* Tidy around record w/ media embeds

* Add grapheme utilities to api RichText (#720)

Co-authored-by: dholms <dtholmgren@gmail.com>

* Fix: app.bsky.feed.getPostThread#... to app.bsky.feed.defs#... (#726)

* Update bskyagent to use repo param

* Minor typing fix

* Add exports to api package: blobref & lex/json converters (#727)

* Add exports to api package: BlobRef & lex/json converters

* Add an example react-native fetch handler

* Switch all lingering references of recordRef to strongRef

* Update lexicon for richtext facets to have multiple features, byte slice rather than text slice

* Implement multi-feature richtext facets on pds

* Update api package to use updated richtext facets

* Minor fixes to admin repo/record views

* Fix app migration exports, remove old app migration

* Fix: sort richtext facets so they can render correctly

* Disable app migration dummy checks that don't work on live deploy

* Optimize lex de/serialization using simple checks

* Tidy comment typos

* App migration to cleanup notifications for likes, follows, old scene notifs

* Fix notification reason for change from vote to like

---------

Co-authored-by: Devin Ivy <devinivy@gmail.com>
Co-authored-by: Paul Frazee <pfrazee@gmail.com>
2023-03-31 13:34:51 -04:00

212 lines
8.5 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { RichText, sanitizeRichText, Facet, UnicodeString } from '../src'
// Plain-text (no facets) tests for sanitizeRichText's `cleanNewlines` option.
// Across these cases the sanitizer collapses any run of three or more
// consecutive newlines down to exactly two, even when the run is padded with
// spaces or invisible characters.
// NOTE(review): the hosting page warns this file contains invisible Unicode
// characters — the emoji fixtures below may originally include zero-width
// joiners (e.g. the family emoji); verify against the raw file.
describe('sanitizeRichText: cleanNewlines', () => {
  it('removes more than two consecutive new lines', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\n\n\n\n\n\ntest\n\n\n\n\n\n\ntest\n\n\n\n\n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
  // Multi-byte (emoji) characters adjacent to the newline runs must not
  // confuse the sanitizer's index handling.
  it('removes more than two consecutive new lines w/fat unicode', () => {
    const input = new RichText({
      text: 'test👨👩👧👧\n\n\n\n\n👨👩👧👧test\n\n\n\n\n\n\ntest👨👩👧👧\n\n\n\n\n\n\ntest\n\n\n\n\n\n\n👨👩👧👧test',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test👨👩👧👧\n\n👨👩👧👧test\n\ntest👨👩👧👧\n\ntest\n\n👨👩👧👧test',
    )
  })
  // Spaces interleaved in a newline run still count as part of the run.
  it('removes more than two consecutive new lines with spaces', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n \n \n \n \n\n\ntest\n\n\n\n\n\n\ntest\n\n\n\n\n \n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
  // Runs of exactly two newlines are left untouched.
  it('returns original string if there are no consecutive new lines', () => {
    const input = new RichText({ text: 'test\n\ntest\n\ntest\n\ntest\n\ntest' })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(String(input.unicodeText))
  })
  it('returns original string if there are no new lines', () => {
    const input = new RichText({ text: 'test test test test test' })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(String(input.unicodeText))
  })
  // Empty-input edge case.
  it('returns empty string if input is empty', () => {
    const input = new RichText({ text: '' })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual('')
  })
  // \r\n, \n\r, and bare \r sequences: only runs of 3+ are collapsed; the
  // shorter mixed sequences are preserved verbatim.
  it('works with different types of new line characters', () => {
    const input = new RichText({
      text: 'test\r\ntest\n\rtest\rtest\n\n\n\ntest\n\r \n \n \n \n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\r\ntest\n\rtest\rtest\n\ntest\n\ntest',
    )
  })
  // The following cases verify that invisible characters embedded inside a
  // newline run (U+200B, U+200C, U+200D, U+00AD, U+2060) do not prevent the
  // run from being collapsed — and are themselves removed with it.
  it('removes more than two consecutive new lines with zero width space', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\u200B\u200B\n\n\n\ntest\n \u200B\u200B \n\n\n\ntest\n\n\n\n\n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
  it('removes more than two consecutive new lines with zero width non-joiner', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\u200C\u200C\n\n\n\ntest\n \u200C\u200C \n\n\n\ntest\n\n\n\n\n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
  it('removes more than two consecutive new lines with zero width joiner', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\u200D\u200D\n\n\n\ntest\n \u200D\u200D \n\n\n\ntest\n\n\n\n\n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
  it('removes more than two consecutive new lines with soft hyphen', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\u00AD\u00AD\n\n\n\ntest\n \u00AD\u00AD \n\n\n\ntest\n\n\n\n\n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
  it('removes more than two consecutive new lines with word joiner', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\u2060\u2060\n\n\n\ntest\n \u2060\u2060 \n\n\n\ntest\n\n\n\n\n\n\ntest',
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
  })
})
// Tests that sanitizeRichText keeps facet byte ranges aligned with the text
// while collapsing newline runs. Observed behavior in these cases: facet
// ranges contract along with the removed characters, and a facet whose span
// collapses entirely is dropped from the output (output.facets?.[3] below).
describe('sanitizeRichText w/facets: cleanNewlines', () => {
  it('preserves entities as expected', () => {
    const input = new RichText({
      text: 'test\n\n\n\n\ntest\n\n\n\n\n\n\ntest\n\n\n\n\n\n\ntest\n\n\n\n\n\n\ntest',
      facets: [
        // Facets deliberately overlap and sit at varying offsets so both
        // edges of each collapsed run are exercised.
        { index: { byteStart: 0, byteEnd: 13 }, features: [{ $type: '' }] },
        { index: { byteStart: 13, byteEnd: 24 }, features: [{ $type: '' }] },
        { index: { byteStart: 9, byteEnd: 15 }, features: [{ $type: '' }] },
        { index: { byteStart: 4, byteEnd: 9 }, features: [{ $type: '' }] },
      ],
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    // Sanity-check what each input facet covers before sanitization...
    expect(facetToStr(String(input.unicodeText), input.facets?.[0])).toEqual(
      'test\n\n\n\n\ntest',
    )
    expect(facetToStr(String(input.unicodeText), input.facets?.[1])).toEqual(
      '\n\n\n\n\n',
    )
    expect(facetToStr(String(input.unicodeText), input.facets?.[2])).toEqual(
      'test\n\n',
    )
    expect(facetToStr(String(input.unicodeText), input.facets?.[3])).toEqual(
      '\n\n\n\n\n\n\ntest',
    )
    // ...then verify the sanitized text and the remapped facet spans.
    expect(String(output.unicodeText)).toEqual(
      'test\n\ntest\n\ntest\n\ntest\n\ntest',
    )
    expect(facetToStr(String(output.unicodeText), output.facets?.[0])).toEqual(
      'test\n\ntest',
    )
    expect(facetToStr(String(output.unicodeText), output.facets?.[1])).toEqual(
      'test',
    )
    expect(facetToStr(String(output.unicodeText), output.facets?.[2])).toEqual(
      'test',
    )
    // Facet 3 covered only removed newlines, so it is dropped entirely.
    expect(output.facets?.[3]).toEqual(undefined)
  })
  it('preserves entities as expected w/fat unicode', () => {
    const str = new UnicodeString(
      '👨👩👧👧test\n\n\n\n\n👨👩👧👧test\n\n\n\n\n👨👩👧👧test\n\n\n\n\n👨👩👧👧test\n\n\n\n\n👨👩👧👧test\n\n\n\n\n👨👩👧👧test\n\n\n\n\n👨👩👧👧test\n\n\n\n\n',
    )
    let lastI = 0
    // Builds a facet over the next occurrence of `match` (searching forward
    // from the previous match), translating the utf16 index into utf8 byte
    // offsets as facets require.
    // NOTE(review): assumes UnicodeString#length is the utf8 byte length —
    // confirm against the UnicodeString implementation.
    const makeFacet = (match: string) => {
      const i = str.utf16.indexOf(match, lastI)
      lastI = i + match.length
      const byteStart = str.utf16IndexToUtf8Index(i)
      const byteEnd = byteStart + new UnicodeString(match).length
      return {
        index: { byteStart, byteEnd },
        features: [{ $type: '' }],
      }
    }
    const input = new RichText({
      text: str.utf16,
      facets: [
        makeFacet('👨👩👧👧test\n\n\n\n\n👨👩👧👧test'),
        makeFacet('\n\n\n\n\n👨👩👧👧test'),
        makeFacet('👨👩👧👧test\n\n'),
        makeFacet('\n\n'),
      ],
    })
    const output = sanitizeRichText(input, { cleanNewlines: true })
    // Pre-sanitization coverage of each facet...
    expect(facetToStr(String(input.unicodeText), input.facets?.[0])).toEqual(
      '👨👩👧👧test\n\n\n\n\n👨👩👧👧test',
    )
    expect(facetToStr(String(input.unicodeText), input.facets?.[1])).toEqual(
      '\n\n\n\n\n👨👩👧👧test',
    )
    expect(facetToStr(String(input.unicodeText), input.facets?.[2])).toEqual(
      '👨👩👧👧test\n\n',
    )
    expect(facetToStr(String(input.unicodeText), input.facets?.[3])).toEqual(
      '\n\n',
    )
    // ...and post-sanitization text plus remapped spans.
    expect(String(output.unicodeText)).toEqual(
      '👨👩👧👧test\n\n👨👩👧👧test\n\n👨👩👧👧test\n\n👨👩👧👧test\n\n👨👩👧👧test\n\n👨👩👧👧test\n\n👨👩👧👧test\n\n',
    )
    expect(facetToStr(String(output.unicodeText), output.facets?.[0])).toEqual(
      '👨👩👧👧test\n\n👨👩👧👧test',
    )
    expect(facetToStr(String(output.unicodeText), output.facets?.[1])).toEqual(
      '👨👩👧👧test',
    )
    expect(facetToStr(String(output.unicodeText), output.facets?.[2])).toEqual(
      '👨👩👧👧test',
    )
    // Newline-only facet collapsed away entirely.
    expect(output.facets?.[3]).toEqual(undefined)
  })
})
/**
 * Render the slice of `str` covered by a facet's byte-indexed range, for use
 * in readable test assertions. Facet indices are utf8 byte offsets, so the
 * string is wrapped in a UnicodeString before slicing.
 *
 * @param str - the full post text (utf16 string)
 * @param ent - the facet whose span should be extracted; may be absent
 * @returns the covered substring, or '' when no facet is given
 */
function facetToStr(str: string, ent?: Facet) {
  if (ent == null) {
    return ''
  }
  const { byteStart, byteEnd } = ent.index
  return new UnicodeString(str).slice(byteStart, byteEnd)
}