Skip to content

Commit

Permalink
More unicode.
Browse files Browse the repository at this point in the history
  • Loading branch information
ncruces committed Jan 16, 2025
1 parent a159b54 commit ab09da7
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 10 deletions.
53 changes: 43 additions & 10 deletions ext/unicode/unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
// - LIKE and REGEXP operators,
// - collation sequences.
//
// It also provides, from PostgreSQL:
// - unaccent(),
// - initcap().
//
// The implementation is not 100% compatible with the [ICU extension]:
// - upper() and lower() use [strings.ToUpper], [strings.ToLower] and [cases];
// - the LIKE operator follows [strings.EqualFold] rules;
// - the REGEXP operator uses Go [regexp/syntax];
// - collation sequences use [collate].
//
// It also provides (approximately) from PostgreSQL:
// - casefold(),
// - initcap(),
// - normalize(),
// - unaccent().
//
// Expect subtle differences, for example in the handling of Turkish case folding.
//
// [ICU extension]: https://sqlite.org/src/dir/ext/icu
Expand Down Expand Up @@ -48,21 +50,24 @@ var RegisterLike = true
// Register registers Unicode aware functions for a database connection.
func Register(db *sqlite3.Conn) error {
const flags = sqlite3.DETERMINISTIC | sqlite3.INNOCUOUS
var errs util.ErrorJoiner
var lkfn sqlite3.ScalarFunction
if RegisterLike {
errs.Join(
db.CreateFunction("like", 2, flags, like),
db.CreateFunction("like", 3, flags, like))
lkfn = like
}
errs.Join(
return errors.Join(
db.CreateFunction("like", 2, flags, lkfn),
db.CreateFunction("like", 3, flags, lkfn),
db.CreateFunction("upper", 1, flags, upper),
db.CreateFunction("upper", 2, flags, upper),
db.CreateFunction("lower", 1, flags, lower),
db.CreateFunction("lower", 2, flags, lower),
db.CreateFunction("regexp", 2, flags, regex),
db.CreateFunction("initcap", 1, flags, initcap),
db.CreateFunction("initcap", 2, flags, initcap),
db.CreateFunction("casefold", 1, flags, casefold),
db.CreateFunction("unaccent", 1, flags, unaccent),
db.CreateFunction("normalize", 1, flags, normalize),
db.CreateFunction("normalize", 2, flags, normalize),
db.CreateFunction("icu_load_collation", 2, sqlite3.DIRECTONLY,
func(ctx sqlite3.Context, arg ...sqlite3.Value) {
name := arg[1].Text()
Expand All @@ -76,7 +81,6 @@ func Register(db *sqlite3.Conn) error {
return // notest
}
}))
return errors.Join(errs...)
}

// RegisterCollation registers a Unicode collation sequence for a database connection.
Expand Down Expand Up @@ -154,6 +158,10 @@ func initcap(ctx sqlite3.Context, arg ...sqlite3.Value) {
ctx.ResultRawText(cs.Bytes(arg[0].RawText()))
}

// casefold is the scalar function behind SQL casefold(text).
// It passes the raw text of its first argument through the folder
// returned by cases.Fold and sets the folded bytes as the text result.
func casefold(ctx sqlite3.Context, arg ...sqlite3.Value) {
	folder := cases.Fold()
	input := arg[0].RawText()
	ctx.ResultRawText(folder.Bytes(input))
}

func unaccent(ctx sqlite3.Context, arg ...sqlite3.Value) {
unaccent := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
res, _, err := transform.Bytes(unaccent, arg[0].RawText())
Expand All @@ -164,6 +172,31 @@ func unaccent(ctx sqlite3.Context, arg ...sqlite3.Value) {
}
}

// normalize is the scalar function behind SQL normalize(text [, form]).
//
// With a single argument the text is normalized to NFC. An optional
// second argument names the form; it is upper-cased before comparison,
// and must be one of NFC, NFD, NFKC or NFKD. Any other name reports
// the error "unicode: invalid form" and produces no result.
func normalize(ctx sqlite3.Context, arg ...sqlite3.Value) {
	target := norm.NFC
	if len(arg) >= 2 {
		switch strings.ToUpper(arg[1].Text()) {
		case "NFD":
			target = norm.NFD
		case "NFKC":
			target = norm.NFKC
		case "NFKD":
			target = norm.NFKD
		case "NFC":
			// Already the default form; nothing to change.
		default:
			ctx.ResultError(util.ErrorString("unicode: invalid form"))
			return
		}
	}

	out, _, err := transform.Bytes(target, arg[0].RawText())
	if err != nil {
		ctx.ResultError(err) // notest
		return
	}
	ctx.ResultRawText(out)
}

func regex(ctx sqlite3.Context, arg ...sqlite3.Value) {
re, ok := ctx.GetAuxData(0).(*regexp.Regexp)
if !ok {
Expand Down
14 changes: 14 additions & 0 deletions ext/unicode/unicode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ func TestRegister(t *testing.T) {
{`upper('Dünyanın İlk Borsası', 'tr-TR')`, "DÜNYANIN İLK BORSASI"},
{`initcap('Kad je hladno Marko nosi džemper')`, "Kad Je Hladno Marko Nosi Džemper"},
{`initcap('Kad je hladno Marko nosi džemper', 'hr-HR')`, "Kad Je Hladno Marko Nosi Džemper"},
{`normalize(X'61cc88')`, "ä"},
{`normalize(X'61cc88', 'NFC' )`, "ä"},
{`normalize(X'61cc88', 'NFKC')`, "ä"},
{`normalize('ä', 'NFD' )`, "\x61\xcc\x88"},
{`normalize('ä', 'NFKD')`, "\x61\xcc\x88"},
{`casefold('Maße')`, "masse"},
{`unaccent('Hôtel')`, "Hotel"},
{`'Hello' REGEXP 'ell'`, "1"},
{`'Hello' REGEXP 'el.'`, "1"},
Expand Down Expand Up @@ -208,6 +214,14 @@ func TestRegister_error(t *testing.T) {
t.Errorf("got %v, want sqlite3.ERROR", err)
}

err = db.Exec(`SELECT normalize('', 'NF')`)
if err == nil {
t.Error("want error")
}
if !errors.Is(err, sqlite3.ERROR) {
t.Errorf("got %v, want sqlite3.ERROR", err)
}

err = db.Exec(`SELECT 'hello' REGEXP '\'`)
if err == nil {
t.Error("want error")
Expand Down

0 comments on commit ab09da7

Please sign in to comment.